[Openmp-commits] [openmp] r302929 - Clang-format and whitespace cleanup of source code

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Fri May 12 11:01:35 PDT 2017


Author: jlpeyton
Date: Fri May 12 13:01:32 2017
New Revision: 302929

URL: http://llvm.org/viewvc/llvm-project?rev=302929&view=rev
Log:
Clang-format and whitespace cleanup of source code

This patch contains the clang-format and whitespace cleanup of the entire code
base. Some of clang-format's changes made the code look worse in places. A best
effort was made to resolve the bulk of these problems, but many remain. Most of
the remaining problems are mangled line breaks and comment indentation.

Patch by Terry Wilmarth

Differential Revision: https://reviews.llvm.org/D32659
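
For reference, the stop() helper near the top of extractExternal.cpp (shown in
the diff below) illustrates the kind of mechanical change clang-format applies
throughout the patch under the project's style: four-space indentation becomes
two-space indentation, lines are reflowed to an 80-column limit, and the
pointer star binds to the variable name rather than the type. Excerpt only; the
file's includes are omitted here.

// Before:
void stop(char* errorMsg) {
    printf("%s\n", errorMsg);
    exit(1);
}

// After:
void stop(char *errorMsg) {
  printf("%s\n", errorMsg);
  exit(1);
}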

Modified:
    openmp/trunk/runtime/src/extractExternal.cpp
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_affinity.cpp
    openmp/trunk/runtime/src/kmp_affinity.h
    openmp/trunk/runtime/src/kmp_alloc.cpp
    openmp/trunk/runtime/src/kmp_atomic.cpp
    openmp/trunk/runtime/src/kmp_atomic.h
    openmp/trunk/runtime/src/kmp_barrier.cpp
    openmp/trunk/runtime/src/kmp_cancel.cpp
    openmp/trunk/runtime/src/kmp_csupport.cpp
    openmp/trunk/runtime/src/kmp_debug.cpp
    openmp/trunk/runtime/src/kmp_debug.h
    openmp/trunk/runtime/src/kmp_debugger.cpp
    openmp/trunk/runtime/src/kmp_debugger.h
    openmp/trunk/runtime/src/kmp_dispatch.cpp
    openmp/trunk/runtime/src/kmp_environment.cpp
    openmp/trunk/runtime/src/kmp_environment.h
    openmp/trunk/runtime/src/kmp_error.cpp
    openmp/trunk/runtime/src/kmp_error.h
    openmp/trunk/runtime/src/kmp_ftn_cdecl.cpp
    openmp/trunk/runtime/src/kmp_ftn_entry.h
    openmp/trunk/runtime/src/kmp_ftn_extra.cpp
    openmp/trunk/runtime/src/kmp_ftn_os.h
    openmp/trunk/runtime/src/kmp_ftn_stdcall.cpp
    openmp/trunk/runtime/src/kmp_global.cpp
    openmp/trunk/runtime/src/kmp_gsupport.cpp
    openmp/trunk/runtime/src/kmp_i18n.cpp
    openmp/trunk/runtime/src/kmp_i18n.h
    openmp/trunk/runtime/src/kmp_import.cpp
    openmp/trunk/runtime/src/kmp_io.cpp
    openmp/trunk/runtime/src/kmp_io.h
    openmp/trunk/runtime/src/kmp_itt.cpp
    openmp/trunk/runtime/src/kmp_itt.h
    openmp/trunk/runtime/src/kmp_itt.inl
    openmp/trunk/runtime/src/kmp_lock.cpp
    openmp/trunk/runtime/src/kmp_lock.h
    openmp/trunk/runtime/src/kmp_omp.h
    openmp/trunk/runtime/src/kmp_os.h
    openmp/trunk/runtime/src/kmp_platform.h
    openmp/trunk/runtime/src/kmp_runtime.cpp
    openmp/trunk/runtime/src/kmp_safe_c_api.h
    openmp/trunk/runtime/src/kmp_sched.cpp
    openmp/trunk/runtime/src/kmp_settings.cpp
    openmp/trunk/runtime/src/kmp_settings.h
    openmp/trunk/runtime/src/kmp_stats.cpp
    openmp/trunk/runtime/src/kmp_stats.h
    openmp/trunk/runtime/src/kmp_stats_timing.cpp
    openmp/trunk/runtime/src/kmp_stats_timing.h
    openmp/trunk/runtime/src/kmp_str.cpp
    openmp/trunk/runtime/src/kmp_str.h
    openmp/trunk/runtime/src/kmp_stub.cpp
    openmp/trunk/runtime/src/kmp_stub.h
    openmp/trunk/runtime/src/kmp_taskdeps.cpp
    openmp/trunk/runtime/src/kmp_tasking.cpp
    openmp/trunk/runtime/src/kmp_taskq.cpp
    openmp/trunk/runtime/src/kmp_threadprivate.cpp
    openmp/trunk/runtime/src/kmp_utility.cpp
    openmp/trunk/runtime/src/kmp_version.cpp
    openmp/trunk/runtime/src/kmp_version.h
    openmp/trunk/runtime/src/kmp_wait_release.cpp
    openmp/trunk/runtime/src/kmp_wait_release.h
    openmp/trunk/runtime/src/kmp_wrapper_getpid.h
    openmp/trunk/runtime/src/kmp_wrapper_malloc.h
    openmp/trunk/runtime/src/ompt-event-specific.h
    openmp/trunk/runtime/src/ompt-general.cpp
    openmp/trunk/runtime/src/ompt-internal.h
    openmp/trunk/runtime/src/ompt-specific.cpp
    openmp/trunk/runtime/src/ompt-specific.h
    openmp/trunk/runtime/src/tsan_annotations.cpp
    openmp/trunk/runtime/src/tsan_annotations.h
    openmp/trunk/runtime/src/z_Linux_asm.s
    openmp/trunk/runtime/src/z_Linux_util.cpp
    openmp/trunk/runtime/src/z_Windows_NT-586_asm.asm
    openmp/trunk/runtime/src/z_Windows_NT-586_util.cpp
    openmp/trunk/runtime/src/z_Windows_NT_util.cpp

Modified: openmp/trunk/runtime/src/extractExternal.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/extractExternal.cpp?rev=302929&r1=302928&r2=302929&view=diff
==============================================================================
--- openmp/trunk/runtime/src/extractExternal.cpp (original)
+++ openmp/trunk/runtime/src/extractExternal.cpp Fri May 12 13:01:32 2017
@@ -13,13 +13,13 @@
 //===----------------------------------------------------------------------===//
 
 
-#include <stdlib.h>
-#include <iostream>
-#include <strstream>
 #include <fstream>
-#include <string>
-#include <set>
+#include <iostream>
 #include <map>
+#include <set>
+#include <stdlib.h>
+#include <string>
+#include <strstream>
 
 /* Given a set of n object files h ('external' object files) and a set of m
    object files o ('internal' object files),
@@ -30,468 +30,457 @@
    Usage:
    hide.exe <n> <filenames for h> <filenames for o>
 
-   Thus, the prefixed symbols become hidden in the sense that they now have a special
-   prefix.
+   Thus, the prefixed symbols become hidden in the sense that they now have a
+   special prefix.
 */
 
 using namespace std;
 
-void stop(char* errorMsg) {
-    printf("%s\n", errorMsg);
-    exit(1);
+void stop(char *errorMsg) {
+  printf("%s\n", errorMsg);
+  exit(1);
 }
 
 // an entry in the symbol table of a .OBJ file
 class Symbol {
 public:
-    __int64 name;
-    unsigned value;
-    unsigned short sectionNum, type;
-    char storageClass, nAux;
+  __int64 name;
+  unsigned value;
+  unsigned short sectionNum, type;
+  char storageClass, nAux;
 };
 
 class _rstream : public istrstream {
 private:
-    const char *buf;
+  const char *buf;
+
 protected:
-    _rstream(pair<const char*, streamsize> p):istrstream(p.first,p.second),buf(p.first){}
-    ~_rstream() {
-	delete[]buf;
-    }
+  _rstream(pair<const char *, streamsize> p)
+      : istrstream(p.first, p.second), buf(p.first) {}
+  ~_rstream() { delete[] buf; }
 };
 
-/* A stream encapuslating the content of a file or the content of a string, overriding the
-   >> operator to read various integer types in binary form, as well as a symbol table
-   entry.
-*/
+// A stream encapuslating the content of a file or the content of a string,
+// overriding the >> operator to read various integer types in binary form,
+// as well as a symbol table entry.
 class rstream : public _rstream {
 private:
-    template<class T>
-    inline rstream& doRead(T &x) {
-	read((char*)&x, sizeof(T));
-	return *this;
-    }
-    static pair<const char*, streamsize> getBuf(const char *fileName) {
-	ifstream raw(fileName,ios::binary | ios::in);
-	if(!raw.is_open())
-	    stop("rstream.getBuf: Error opening file");
-	raw.seekg(0,ios::end);
-	streampos fileSize = raw.tellg();
-	if(fileSize < 0)
-	    stop("rstream.getBuf: Error reading file");
-	char *buf = new char[fileSize];
-	raw.seekg(0,ios::beg);
-	raw.read(buf, fileSize);
-	return pair<const char*, streamsize>(buf,fileSize);
-    }
+  template <class T> inline rstream &doRead(T &x) {
+    read((char *)&x, sizeof(T));
+    return *this;
+  }
+  static pair<const char *, streamsize> getBuf(const char *fileName) {
+    ifstream raw(fileName, ios::binary | ios::in);
+    if (!raw.is_open())
+      stop("rstream.getBuf: Error opening file");
+    raw.seekg(0, ios::end);
+    streampos fileSize = raw.tellg();
+    if (fileSize < 0)
+      stop("rstream.getBuf: Error reading file");
+    char *buf = new char[fileSize];
+    raw.seekg(0, ios::beg);
+    raw.read(buf, fileSize);
+    return pair<const char *, streamsize>(buf, fileSize);
+  }
+
 public:
-    // construct from a string
-    rstream(const char *buf,streamsize size):_rstream(pair<const char*,streamsize>(buf, size)){}
-    /* construct from a file whole content is fully read once to initialize the content of
-       this stream
-    */
-    rstream(const char *fileName):_rstream(getBuf(fileName)){}
-    rstream& operator>>(int &x) {
-	return doRead(x);
-    }
-    rstream& operator>>(unsigned &x) {
-	return doRead(x);
-    }
-    rstream& operator>>(short &x) {
-	return doRead(x);
-    }
-    rstream& operator>>(unsigned short &x) {
-	return doRead(x);
-    }
-    rstream& operator>>(Symbol &e) {
-	read((char*)&e, 18);
-	return *this;
-    }
+  // construct from a string
+  rstream(const char *buf, streamsize size)
+      : _rstream(pair<const char *, streamsize>(buf, size)) {}
+  // construct from a file whole content is fully read once to initialize the
+  // content of this stream
+  rstream(const char *fileName) : _rstream(getBuf(fileName)) {}
+  rstream &operator>>(int &x) { return doRead(x); }
+  rstream &operator>>(unsigned &x) { return doRead(x); }
+  rstream &operator>>(short &x) { return doRead(x); }
+  rstream &operator>>(unsigned short &x) { return doRead(x); }
+  rstream &operator>>(Symbol &e) {
+    read((char *)&e, 18);
+    return *this;
+  }
 };
 
 // string table in a .OBJ file
 class StringTable {
 private:
-    map<string, unsigned> directory;
-    size_t length;
-    char *data;
-
-    // make <directory> from <length> bytes in <data>
-    void makeDirectory(void) {
-	unsigned i = 4;
-	while(i < length) {
-	    string s = string(data + i);
-	    directory.insert(make_pair(s, i));
-	    i += s.size() + 1;
-	}
-    }
-    // initialize <length> and <data> with contents specified by the arguments
-    void init(const char *_data) {
-	unsigned _length = *(unsigned*)_data;
-
-	if(_length < sizeof(unsigned) || _length != *(unsigned*)_data)
-	    stop("StringTable.init: Invalid symbol table");
-	if(_data[_length - 1]) {
-	    // to prevent runaway strings, make sure the data ends with a zero
-	    data = new char[length = _length + 1];
-	    data[_length] = 0;
-	} else {
-	    data = new char[length = _length];
-	}
-	*(unsigned*)data = length;
-	KMP_MEMCPY(data + sizeof(unsigned), _data + sizeof(unsigned),
-	           length - sizeof(unsigned));
-	makeDirectory();
-    }
+  map<string, unsigned> directory;
+  size_t length;
+  char *data;
+
+  // make <directory> from <length> bytes in <data>
+  void makeDirectory(void) {
+    unsigned i = 4;
+    while (i < length) {
+      string s = string(data + i);
+      directory.insert(make_pair(s, i));
+      i += s.size() + 1;
+    }
+  }
+  // initialize <length> and <data> with contents specified by the arguments
+  void init(const char *_data) {
+    unsigned _length = *(unsigned *)_data;
+
+    if (_length < sizeof(unsigned) || _length != *(unsigned *)_data)
+      stop("StringTable.init: Invalid symbol table");
+    if (_data[_length - 1]) {
+      // to prevent runaway strings, make sure the data ends with a zero
+      data = new char[length = _length + 1];
+      data[_length] = 0;
+    } else {
+      data = new char[length = _length];
+    }
+    *(unsigned *)data = length;
+    KMP_MEMCPY(data + sizeof(unsigned), _data + sizeof(unsigned),
+               length - sizeof(unsigned));
+    makeDirectory();
+  }
+
 public:
-    StringTable(rstream &f) {
-	/* Construct string table by reading from f.
-	 */
-	streampos s;
-	unsigned strSize;
-	char *strData;
-
-	s = f.tellg();
-	f>>strSize;
-	if(strSize < sizeof(unsigned))
-	    stop("StringTable: Invalid string table");
-	strData = new char[strSize];
-	*(unsigned*)strData = strSize;
-	// read the raw data into <strData>
-	f.read(strData + sizeof(unsigned), strSize - sizeof(unsigned));
-	s = f.tellg() - s;
-	if(s < strSize)
-	    stop("StringTable: Unexpected EOF");
-	init(strData);
-	delete[]strData;
-    }
-    StringTable(const set<string> &strings) {
-	/* Construct string table from given strings.
-	 */
-	char *p;
-	set<string>::const_iterator it;
-	size_t s;
-
-	// count required size for data
-	for(length = sizeof(unsigned), it = strings.begin(); it != strings.end(); ++it) {
-	    size_t l = (*it).size();
-
-	    if(l > (unsigned) 0xFFFFFFFF)
-		stop("StringTable: String too long");
-	    if(l > 8) {
-		length += l + 1;
-		if(length > (unsigned) 0xFFFFFFFF)
-		    stop("StringTable: Symbol table too long");
-	    }
-	}
-	data = new char[length];
-	*(unsigned*)data = length;
-	// populate data and directory
-	for(p = data + sizeof(unsigned), it = strings.begin(); it != strings.end(); ++it) {
-	    const string &str = *it;
-	    size_t l = str.size();
-	    if(l > 8) {
-		directory.insert(make_pair(str, p - data));
-		KMP_MEMCPY(p, str.c_str(), l);
-		p[l] = 0;
-		p += l + 1;
-	    }
-	}
-    }
-    ~StringTable() {
-	delete[] data;
-    }
-    /* Returns encoding for given string based on this string table.
-       Error if string length is greater than 8 but string is not in
-       the string table--returns 0.
-    */
-    __int64 encode(const string &str) {
-	__int64 r;
-
-	if(str.size() <= 8) {
-	    // encoded directly
-	    ((char*)&r)[7] = 0;
-	    KMP_STRNCPY_S((char*)&r, sizeof(r), str.c_str(), 8);
-	    return r;
-	} else {
-	    // represented as index into table
-	    map<string,unsigned>::const_iterator it = directory.find(str);
-	    if(it == directory.end())
-		stop("StringTable::encode: String now found in string table");
-	    ((unsigned*)&r)[0] = 0;
-	    ((unsigned*)&r)[1] = (*it).second;
-	    return r;
-	}
-    }
-    /* Returns string represented by x based on this string table.
-       Error if x references an invalid position in the table--returns
-       the empty string.
-    */
-    string decode(__int64 x) const {
-	if(*(unsigned*)&x == 0) {
-	    // represented as index into table
-	    unsigned &p = ((unsigned*)&x)[1];
-	    if(p >= length)
-		stop("StringTable::decode: Invalid string table lookup");
-	    return string(data + p);
-	} else {
-	    // encoded directly
-	    char *p = (char*)&x;
-	    int i;
-
-	    for(i = 0; i < 8 && p[i]; ++i);
-	    return string(p, i);
-	}
-    }
-    void write(ostream &os) {
-	os.write(data, length);
+  StringTable(rstream &f) {
+    // Construct string table by reading from f.
+    streampos s;
+    unsigned strSize;
+    char *strData;
+
+    s = f.tellg();
+    f >> strSize;
+    if (strSize < sizeof(unsigned))
+      stop("StringTable: Invalid string table");
+    strData = new char[strSize];
+    *(unsigned *)strData = strSize;
+    // read the raw data into <strData>
+    f.read(strData + sizeof(unsigned), strSize - sizeof(unsigned));
+    s = f.tellg() - s;
+    if (s < strSize)
+      stop("StringTable: Unexpected EOF");
+    init(strData);
+    delete[] strData;
+  }
+  StringTable(const set<string> &strings) {
+    // Construct string table from given strings.
+    char *p;
+    set<string>::const_iterator it;
+    size_t s;
+
+    // count required size for data
+    for (length = sizeof(unsigned), it = strings.begin(); it != strings.end();
+         ++it) {
+      size_t l = (*it).size();
+
+      if (l > (unsigned)0xFFFFFFFF)
+        stop("StringTable: String too long");
+      if (l > 8) {
+        length += l + 1;
+        if (length > (unsigned)0xFFFFFFFF)
+          stop("StringTable: Symbol table too long");
+      }
+    }
+    data = new char[length];
+    *(unsigned *)data = length;
+    // populate data and directory
+    for (p = data + sizeof(unsigned), it = strings.begin(); it != strings.end();
+         ++it) {
+      const string &str = *it;
+      size_t l = str.size();
+      if (l > 8) {
+        directory.insert(make_pair(str, p - data));
+        KMP_MEMCPY(p, str.c_str(), l);
+        p[l] = 0;
+        p += l + 1;
+      }
+    }
+  }
+  ~StringTable() { delete[] data; }
+  // Returns encoding for given string based on this string table. Error if
+  // string length is greater than 8 but string is not in the string table
+  // -- returns 0.
+  __int64 encode(const string &str) {
+    __int64 r;
+
+    if (str.size() <= 8) {
+      // encoded directly
+      ((char *)&r)[7] = 0;
+      KMP_STRNCPY_S((char *)&r, sizeof(r), str.c_str(), 8);
+      return r;
+    } else {
+      // represented as index into table
+      map<string, unsigned>::const_iterator it = directory.find(str);
+      if (it == directory.end())
+        stop("StringTable::encode: String now found in string table");
+      ((unsigned *)&r)[0] = 0;
+      ((unsigned *)&r)[1] = (*it).second;
+      return r;
+    }
+  }
+  // Returns string represented by x based on this string table. Error if x
+  // references an invalid position in the table--returns the empty string.
+  string decode(__int64 x) const {
+    if (*(unsigned *)&x == 0) {
+      // represented as index into table
+      unsigned &p = ((unsigned *)&x)[1];
+      if (p >= length)
+        stop("StringTable::decode: Invalid string table lookup");
+      return string(data + p);
+    } else {
+      // encoded directly
+      char *p = (char *)&x;
+      int i;
+
+      for (i = 0; i < 8 && p[i]; ++i)
+        ;
+      return string(p, i);
     }
+  }
+  void write(ostream &os) { os.write(data, length); }
 };
 
-/* for the named object file, determines the set of defined symbols and the set of undefined external symbols
-   and writes them to <defined> and <undefined> respectively
-*/
-void computeExternalSymbols(const char *fileName, set<string> *defined, set<string> *undefined){
-    streampos fileSize;
-    size_t strTabStart;
-    unsigned symTabStart, symNEntries;
-    rstream f(fileName);
-
-    f.seekg(0,ios::end);
-    fileSize = f.tellg();
-
-    f.seekg(8);
-    f >> symTabStart >> symNEntries;
-    // seek to the string table
-    f.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
-    if(f.eof()) {
-	printf("computeExternalSymbols: fileName='%s', fileSize = %lu, symTabStart = %u, symNEntries = %u\n",
-	       fileName, (unsigned long) fileSize, symTabStart, symNEntries);
-	stop("computeExternalSymbols: Unexpected EOF 1");
-    }
-    StringTable stringTable(f); // read the string table
-    if(f.tellg() != fileSize)
-	stop("computeExternalSymbols: Unexpected data after string table");
-
-    f.clear();
-    f.seekg(symTabStart); // seek to the symbol table
-
-    defined->clear(); undefined->clear();
-    for(int i = 0; i < symNEntries; ++i) {
-	// process each entry
-	Symbol e;
-
-	if(f.eof())
-	    stop("computeExternalSymbols: Unexpected EOF 2");
-	f>>e;
-	if(f.fail())
-	    stop("computeExternalSymbols: File read error");
-	if(e.nAux) { // auxiliary entry: skip
-	    f.seekg(e.nAux * 18, ios::cur);
-	    i += e.nAux;
-	}
-	// if symbol is extern and defined in the current file, insert it
-	if(e.storageClass == 2)
-	    if(e.sectionNum)
-		defined->insert(stringTable.decode(e.name));
-	    else
-		undefined->insert(stringTable.decode(e.name));
-    }
+// for the named object file, determines the set of defined symbols and the set
+// of undefined external symbols and writes them to <defined> and <undefined>
+// respectively
+void computeExternalSymbols(const char *fileName, set<string> *defined,
+                            set<string> *undefined) {
+  streampos fileSize;
+  size_t strTabStart;
+  unsigned symTabStart, symNEntries;
+  rstream f(fileName);
+
+  f.seekg(0, ios::end);
+  fileSize = f.tellg();
+
+  f.seekg(8);
+  f >> symTabStart >> symNEntries;
+  // seek to the string table
+  f.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
+  if (f.eof()) {
+    printf("computeExternalSymbols: fileName='%s', fileSize = %lu, symTabStart "
+           "= %u, symNEntries = %u\n",
+           fileName, (unsigned long)fileSize, symTabStart, symNEntries);
+    stop("computeExternalSymbols: Unexpected EOF 1");
+  }
+  StringTable stringTable(f); // read the string table
+  if (f.tellg() != fileSize)
+    stop("computeExternalSymbols: Unexpected data after string table");
+
+  f.clear();
+  f.seekg(symTabStart); // seek to the symbol table
+
+  defined->clear();
+  undefined->clear();
+  for (int i = 0; i < symNEntries; ++i) {
+    // process each entry
+    Symbol e;
+
+    if (f.eof())
+      stop("computeExternalSymbols: Unexpected EOF 2");
+    f >> e;
+    if (f.fail())
+      stop("computeExternalSymbols: File read error");
+    if (e.nAux) { // auxiliary entry: skip
+      f.seekg(e.nAux * 18, ios::cur);
+      i += e.nAux;
+    }
+    // if symbol is extern and defined in the current file, insert it
+    if (e.storageClass == 2)
+      if (e.sectionNum)
+        defined->insert(stringTable.decode(e.name));
+      else
+        undefined->insert(stringTable.decode(e.name));
+  }
 }
 
-/* For each occurrence of an external symbol in the object file named by
-   by <fileName> that is a member of <hide>, renames it by prefixing
-   with "__kmp_external_", writing back the file in-place
-*/
+// For each occurrence of an external symbol in the object file named by
+// by <fileName> that is a member of <hide>, renames it by prefixing
+// with "__kmp_external_", writing back the file in-place
 void hideSymbols(char *fileName, const set<string> &hide) {
-    static const string prefix("__kmp_external_");
-    set<string> strings; // set of all occurring symbols, appropriately prefixed
-    streampos fileSize;
-    size_t strTabStart;
-    unsigned symTabStart, symNEntries;
-    int i;
-    rstream in(fileName);
-
-    in.seekg(0,ios::end);
-    fileSize = in.tellg();
-
-    in.seekg(8);
-    in >> symTabStart >> symNEntries;
-    in.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
-    if(in.eof())
-	stop("hideSymbols: Unexpected EOF");
-    StringTable stringTableOld(in); // read original string table
-
-    if(in.tellg() != fileSize)
-	stop("hideSymbols: Unexpected data after string table");
-
-    // compute set of occurring strings with prefix added
-    for(i = 0; i < symNEntries; ++i) {
-	Symbol e;
-
-	in.seekg(symTabStart + i * 18);
-	if(in.eof())
-	    stop("hideSymbols: Unexpected EOF");
-	in >> e;
-	if(in.fail())
-	    stop("hideSymbols: File read error");
-	if(e.nAux)
-	    i += e.nAux;
-	const string &s = stringTableOld.decode(e.name);
-	// if symbol is extern and found in <hide>, prefix and insert into strings,
-	// otherwise, just insert into strings without prefix
-	strings.insert( (e.storageClass == 2 && hide.find(s) != hide.end()) ?
-			prefix + s : s);
-    }
-
-    ofstream out(fileName, ios::trunc | ios::out | ios::binary);
-    if(!out.is_open())
-	stop("hideSymbols: Error opening output file");
-
-    // make new string table from string set
-    StringTable stringTableNew = StringTable(strings);
-
-    // copy input file to output file up to just before the symbol table
-    in.seekg(0);
-    char *buf = new char[symTabStart];
-    in.read(buf, symTabStart);
-    out.write(buf, symTabStart);
-    delete []buf;
-
-    // copy input symbol table to output symbol table with name translation
-    for(i = 0; i < symNEntries; ++i) {
-	Symbol e;
-
-	in.seekg(symTabStart + i*18);
-	if(in.eof())
-	    stop("hideSymbols: Unexpected EOF");
-	in >> e;
-	if(in.fail())
-	    stop("hideSymbols: File read error");
-	const string &s = stringTableOld.decode(e.name);
-	out.seekp(symTabStart + i*18);
-	e.name = stringTableNew.encode( (e.storageClass == 2 && hide.find(s) != hide.end()) ?
-					prefix + s : s);
-	out.write((char*)&e, 18);
-	if(out.fail())
-	    stop("hideSymbols: File write error");
-	if(e.nAux) {
-	    // copy auxiliary symbol table entries
-	    int nAux = e.nAux;
-	    for(int j = 1; j <= nAux; ++j) {
-		in >> e;
-		out.seekp(symTabStart + (i + j) * 18);
-		out.write((char*)&e, 18);
-	    }
-	    i += nAux;
-	}
-    }
-    // output string table
-    stringTableNew.write(out);
+  static const string prefix("__kmp_external_");
+  set<string> strings; // set of all occurring symbols, appropriately prefixed
+  streampos fileSize;
+  size_t strTabStart;
+  unsigned symTabStart, symNEntries;
+  int i;
+  rstream in(fileName);
+
+  in.seekg(0, ios::end);
+  fileSize = in.tellg();
+
+  in.seekg(8);
+  in >> symTabStart >> symNEntries;
+  in.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
+  if (in.eof())
+    stop("hideSymbols: Unexpected EOF");
+  StringTable stringTableOld(in); // read original string table
+
+  if (in.tellg() != fileSize)
+    stop("hideSymbols: Unexpected data after string table");
+
+  // compute set of occurring strings with prefix added
+  for (i = 0; i < symNEntries; ++i) {
+    Symbol e;
+
+    in.seekg(symTabStart + i * 18);
+    if (in.eof())
+      stop("hideSymbols: Unexpected EOF");
+    in >> e;
+    if (in.fail())
+      stop("hideSymbols: File read error");
+    if (e.nAux)
+      i += e.nAux;
+    const string &s = stringTableOld.decode(e.name);
+    // if symbol is extern and found in <hide>, prefix and insert into strings,
+    // otherwise, just insert into strings without prefix
+    strings.insert(
+        (e.storageClass == 2 && hide.find(s) != hide.end()) ? prefix + s : s);
+  }
+
+  ofstream out(fileName, ios::trunc | ios::out | ios::binary);
+  if (!out.is_open())
+    stop("hideSymbols: Error opening output file");
+
+  // make new string table from string set
+  StringTable stringTableNew = StringTable(strings);
+
+  // copy input file to output file up to just before the symbol table
+  in.seekg(0);
+  char *buf = new char[symTabStart];
+  in.read(buf, symTabStart);
+  out.write(buf, symTabStart);
+  delete[] buf;
+
+  // copy input symbol table to output symbol table with name translation
+  for (i = 0; i < symNEntries; ++i) {
+    Symbol e;
+
+    in.seekg(symTabStart + i * 18);
+    if (in.eof())
+      stop("hideSymbols: Unexpected EOF");
+    in >> e;
+    if (in.fail())
+      stop("hideSymbols: File read error");
+    const string &s = stringTableOld.decode(e.name);
+    out.seekp(symTabStart + i * 18);
+    e.name = stringTableNew.encode(
+        (e.storageClass == 2 && hide.find(s) != hide.end()) ? prefix + s : s);
+    out.write((char *)&e, 18);
+    if (out.fail())
+      stop("hideSymbols: File write error");
+    if (e.nAux) {
+      // copy auxiliary symbol table entries
+      int nAux = e.nAux;
+      for (int j = 1; j <= nAux; ++j) {
+        in >> e;
+        out.seekp(symTabStart + (i + j) * 18);
+        out.write((char *)&e, 18);
+      }
+      i += nAux;
+    }
+  }
+  // output string table
+  stringTableNew.write(out);
 }
 
 // returns true iff <a> and <b> have no common element
-template <class T>
-bool isDisjoint(const set<T> &a, const set<T> &b) {
-    set<T>::const_iterator ita, itb;
-
-    for(ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end();) {
-	const T &ta = *ita, &tb = *itb;
-	if(ta < tb)
-	    ++ita;
-	else if (tb < ta)
-	    ++itb;
-	else
-	    return false;
-    }
-    return true;
+template <class T> bool isDisjoint(const set<T> &a, const set<T> &b) {
+  set<T>::const_iterator ita, itb;
+
+  for (ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end();) {
+    const T &ta = *ita, &tb = *itb;
+    if (ta < tb)
+      ++ita;
+    else if (tb < ta)
+      ++itb;
+    else
+      return false;
+  }
+  return true;
 }
 
-/* precondition: <defined> and <undefined> are arrays with <nTotal> elements where
-   <nTotal> >= <nExternal>.  The first <nExternal> elements correspond to the external object
-   files and the rest correspond to the internal object files.
-   postcondition: file x is said to depend on file y if undefined[x] and defined[y] are not
-   disjoint.  Returns the transitive closure of the set of internal object files, as a set of
-   file indexes, under the 'depends on' relation, minus the set of internal object files.
-*/
-set<int> *findRequiredExternal(int nExternal, int nTotal, set<string> *defined, set<string> *undefined) {
-    set<int> *required = new set<int>;
-    set<int> fresh[2];
-    int i, cur = 0;
-    bool changed;
-
-    for(i = nTotal - 1; i >= nExternal; --i)
-	fresh[cur].insert(i);
-    do {
-	changed = false;
-	for(set<int>::iterator it = fresh[cur].begin(); it != fresh[cur].end(); ++it) {
-	    set<string> &s = undefined[*it];
-
-	    for(i = 0; i < nExternal; ++i) {
-		if(required->find(i) == required->end()) {
-		    if(!isDisjoint(defined[i], s)) {
-			// found a new qualifying element
-			required->insert(i);
-			fresh[1 - cur].insert(i);
-			changed = true;
-		    }
-		}
-	    }
-	}
-	fresh[cur].clear();
-	cur = 1 - cur;
-    } while(changed);
-    return required;
+// PRE: <defined> and <undefined> are arrays with <nTotal> elements where
+// <nTotal> >= <nExternal>.  The first <nExternal> elements correspond to the
+// external object files and the rest correspond to the internal object files.
+// POST: file x is said to depend on file y if undefined[x] and defined[y] are
+// not disjoint. Returns the transitive closure of the set of internal object
+// files, as a set of file indexes, under the 'depends on' relation, minus the
+// set of internal object files.
+set<int> *findRequiredExternal(int nExternal, int nTotal, set<string> *defined,
+                               set<string> *undefined) {
+  set<int> *required = new set<int>;
+  set<int> fresh[2];
+  int i, cur = 0;
+  bool changed;
+
+  for (i = nTotal - 1; i >= nExternal; --i)
+    fresh[cur].insert(i);
+  do {
+    changed = false;
+    for (set<int>::iterator it = fresh[cur].begin(); it != fresh[cur].end();
+         ++it) {
+      set<string> &s = undefined[*it];
+
+      for (i = 0; i < nExternal; ++i) {
+        if (required->find(i) == required->end()) {
+          if (!isDisjoint(defined[i], s)) {
+            // found a new qualifying element
+            required->insert(i);
+            fresh[1 - cur].insert(i);
+            changed = true;
+          }
+        }
+      }
+    }
+    fresh[cur].clear();
+    cur = 1 - cur;
+  } while (changed);
+  return required;
 }
 
 int main(int argc, char **argv) {
-    int nExternal, nInternal, i;
-    set<string> *defined, *undefined;
-    set<int>::iterator it;
-
-    if(argc < 3)
-	stop("Please specify a positive integer followed by a list of object filenames");
-    nExternal = atoi(argv[1]);
-    if(nExternal <= 0)
-	stop("Please specify a positive integer followed by a list of object filenames");
-    if(nExternal +  2 > argc)
-	stop("Too few external objects");
-    nInternal = argc - nExternal - 2;
-    defined = new set<string>[argc - 2];
-    undefined = new set<string>[argc - 2];
-
-    // determine the set of defined and undefined external symbols
-    for(i = 2; i < argc; ++i)
-	computeExternalSymbols(argv[i], defined + i - 2, undefined + i - 2);
-
-    // determine the set of required external files
-    set<int> *requiredExternal = findRequiredExternal(nExternal, argc - 2, defined, undefined);
-    set<string> hide;
-
-    /* determine the set of symbols to hide--namely defined external symbols of the
-       required external files
-    */
-    for(it = requiredExternal->begin(); it != requiredExternal->end(); ++it) {
-	int idx = *it;
-	set<string>::iterator it2;
-	/* We have to insert one element at a time instead of inserting a range because
-	   the insert member function taking a range doesn't exist on Windows* OS, at least
-	   at the time of this writing.
-	*/
-	for(it2 = defined[idx].begin(); it2 != defined[idx].end(); ++it2)
-	    hide.insert(*it2);
-    }
-
-    /* process the external files--removing those that are not required and hiding
-       the appropriate symbols in the others
-    */
-    for(i = 0; i < nExternal; ++i)
-	if(requiredExternal->find(i) != requiredExternal->end())
-	    hideSymbols(argv[2 + i], hide);
-	else
-	    remove(argv[2 + i]);
-    // hide the appropriate symbols in the internal files
-    for(i = nExternal + 2; i < argc; ++i)
-	hideSymbols(argv[i], hide);
-    return 0;
+  int nExternal, nInternal, i;
+  set<string> *defined, *undefined;
+  set<int>::iterator it;
+
+  if (argc < 3)
+    stop("Please specify a positive integer followed by a list of object "
+         "filenames");
+  nExternal = atoi(argv[1]);
+  if (nExternal <= 0)
+    stop("Please specify a positive integer followed by a list of object "
+         "filenames");
+  if (nExternal + 2 > argc)
+    stop("Too few external objects");
+  nInternal = argc - nExternal - 2;
+  defined = new set<string>[argc - 2];
+  undefined = new set<string>[argc - 2];
+
+  // determine the set of defined and undefined external symbols
+  for (i = 2; i < argc; ++i)
+    computeExternalSymbols(argv[i], defined + i - 2, undefined + i - 2);
+
+  // determine the set of required external files
+  set<int> *requiredExternal =
+      findRequiredExternal(nExternal, argc - 2, defined, undefined);
+  set<string> hide;
+
+  // determine the set of symbols to hide--namely defined external symbols of
+  // the required external files
+  for (it = requiredExternal->begin(); it != requiredExternal->end(); ++it) {
+    int idx = *it;
+    set<string>::iterator it2;
+    // We have to insert one element at a time instead of inserting a range
+    // because the insert member function taking a range doesn't exist on
+    // Windows* OS, at least at the time of this writing.
+    for (it2 = defined[idx].begin(); it2 != defined[idx].end(); ++it2)
+      hide.insert(*it2);
+  }
+
+  // process the external files--removing those that are not required and hiding
+  //   the appropriate symbols in the others
+  for (i = 0; i < nExternal; ++i)
+    if (requiredExternal->find(i) != requiredExternal->end())
+      hideSymbols(argv[2 + i], hide);
+    else
+      remove(argv[2 + i]);
+  // hide the appropriate symbols in the internal files
+  for (i = nExternal + 2; i < argc; ++i)
+    hideSymbols(argv[i], hide);
+  return 0;
 }
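
An aside for readers of the extractExternal.cpp diff above: the
StringTable::encode/decode pair implements the COFF-style 8-byte symbol-name
convention, where a name of at most 8 characters is stored directly in the
64-bit name field (zero-padded, with no terminator guaranteed when all 8 bytes
are used) and a longer name is stored as a zero first 32-bit word followed by
an offset into the string table. The following standalone sketch is not part
of the commit, uses hypothetical helper names, and covers only the short-name
case:

// Illustrative sketch of the direct (<= 8 character) symbol-name encoding
// described by StringTable::encode/decode in extractExternal.cpp.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

// Pack a short name into the 64-bit field, zero-padding unused bytes.
static int64_t encodeShortName(const std::string &s) {
  int64_t r = 0;
  std::memcpy(&r, s.data(), s.size() <= 8 ? s.size() : 8);
  return r;
}

// Recover the name: stop at the first zero byte or after 8 bytes, since a
// full 8-character name has no terminator.
static std::string decodeShortName(int64_t x) {
  const char *p = reinterpret_cast<const char *>(&x);
  int i = 0;
  while (i < 8 && p[i])
    ++i;
  return std::string(p, i);
}

int main() {
  int64_t enc = encodeShortName("_main");
  std::cout << decodeShortName(enc) << "\n"; // prints "_main"
  return 0;
}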

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=302929&r1=302928&r2=302929&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Fri May 12 13:01:32 2017
@@ -22,36 +22,35 @@
 /* #define BUILD_PARALLEL_ORDERED 1 */
 
 /* This fix replaces gettimeofday with clock_gettime for better scalability on
-   the Altix.  Requires user code to be linked with -lrt.
-*/
+   the Altix.  Requires user code to be linked with -lrt. */
 //#define FIX_SGI_CLOCK
 
 /* Defines for OpenMP 3.0 tasking and auto scheduling */
 
-# ifndef KMP_STATIC_STEAL_ENABLED
-#  define KMP_STATIC_STEAL_ENABLED 1
-# endif
+#ifndef KMP_STATIC_STEAL_ENABLED
+#define KMP_STATIC_STEAL_ENABLED 1
+#endif
 
-#define TASK_CURRENT_NOT_QUEUED  0
-#define TASK_CURRENT_QUEUED      1
+#define TASK_CURRENT_NOT_QUEUED 0
+#define TASK_CURRENT_QUEUED 1
 
 #ifdef BUILD_TIED_TASK_STACK
-#define TASK_STACK_EMPTY         0  // entries when the stack is empty
-
-// Used to define TASK_STACK_SIZE and TASK_STACK_MASK
-#define TASK_STACK_BLOCK_BITS    5
-#define TASK_STACK_BLOCK_SIZE    ( 1 << TASK_STACK_BLOCK_BITS ) // Number of entries in each task stack array
-#define TASK_STACK_INDEX_MASK    ( TASK_STACK_BLOCK_SIZE - 1 )  // Mask for determining index into stack block
+#define TASK_STACK_EMPTY 0 // entries when the stack is empty
+#define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK
+// Number of entries in each task stack array
+#define TASK_STACK_BLOCK_SIZE (1 << TASK_STACK_BLOCK_BITS)
+// Mask for determining index into stack block
+#define TASK_STACK_INDEX_MASK (TASK_STACK_BLOCK_SIZE - 1)
 #endif // BUILD_TIED_TASK_STACK
 
-#define TASK_NOT_PUSHED          1
+#define TASK_NOT_PUSHED 1
 #define TASK_SUCCESSFULLY_PUSHED 0
-#define TASK_TIED                1
-#define TASK_UNTIED              0
-#define TASK_EXPLICIT            1
-#define TASK_IMPLICIT            0
-#define TASK_PROXY               1
-#define TASK_FULL                0
+#define TASK_TIED 1
+#define TASK_UNTIED 0
+#define TASK_EXPLICIT 1
+#define TASK_IMPLICIT 0
+#define TASK_PROXY 1
+#define TASK_FULL 0
 
 #define KMP_CANCEL_THREADS
 #define KMP_THREAD_ATTR
@@ -62,14 +61,14 @@
 #undef KMP_CANCEL_THREADS
 #endif
 
+#include <signal.h>
+#include <stdarg.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stddef.h>
-#include <stdarg.h>
 #include <string.h>
-#include <signal.h>
-/*  include <ctype.h> don't use; problems with /MD on Windows* OS NT due to bad Microsoft library  */
-/*  some macros provided below to replace some of these functions  */
+/* include <ctype.h> don't use; problems with /MD on Windows* OS NT due to bad
+   Microsoft library. Some macros provided below to replace these functions  */
 #ifndef __ABSOFT_WIN
 #include <sys/types.h>
 #endif
@@ -100,9 +99,9 @@ class kmp_stats_list;
 #include <xmmintrin.h>
 #endif
 
-#include "kmp_version.h"
 #include "kmp_debug.h"
 #include "kmp_lock.h"
+#include "kmp_version.h"
 #if USE_DEBUGGER
 #include "kmp_debugger.h"
 #endif
@@ -112,14 +111,14 @@ class kmp_stats_list;
 
 #include "kmp_wrapper_malloc.h"
 #if KMP_OS_UNIX
-# include <unistd.h>
-# if !defined NSIG && defined _NSIG
-#  define NSIG _NSIG
-# endif
+#include <unistd.h>
+#if !defined NSIG && defined _NSIG
+#define NSIG _NSIG
+#endif
 #endif
 
 #if KMP_OS_LINUX
-# pragma weak clock_gettime
+#pragma weak clock_gettime
 #endif
 
 #if OMPT_SUPPORT
@@ -128,7 +127,7 @@ class kmp_stats_list;
 
 /*Select data placement in NUMA memory */
 #define NO_FIRST_TOUCH 0
-#define FIRST_TOUCH 1       /* Exploit SGI's first touch page placement algo */
+#define FIRST_TOUCH 1 /* Exploit SGI's first touch page placement algo */
 
 /* If not specified on compile command line, assume no first touch */
 #ifndef BUILD_MEMORY
@@ -136,27 +135,28 @@ class kmp_stats_list;
 #endif
 
 // 0 - no fast memory allocation, alignment: 8-byte on x86, 16-byte on x64.
-// 3 - fast allocation using sync, non-sync free lists of any size, non-self free lists of limited size.
+// 3 - fast allocation using sync, non-sync free lists of any size, non-self
+// free lists of limited size.
 #ifndef USE_FAST_MEMORY
 #define USE_FAST_MEMORY 3
 #endif
 
 #ifndef KMP_NESTED_HOT_TEAMS
-# define KMP_NESTED_HOT_TEAMS 0
-# define USE_NESTED_HOT_ARG(x)
+#define KMP_NESTED_HOT_TEAMS 0
+#define USE_NESTED_HOT_ARG(x)
+#else
+#if KMP_NESTED_HOT_TEAMS
+#if OMP_40_ENABLED
+#define USE_NESTED_HOT_ARG(x) , x
 #else
-# if KMP_NESTED_HOT_TEAMS
-#  if OMP_40_ENABLED
-#   define USE_NESTED_HOT_ARG(x) ,x
-#  else
 // Nested hot teams feature depends on omp 4.0, disable it for earlier versions
-#   undef KMP_NESTED_HOT_TEAMS
-#   define KMP_NESTED_HOT_TEAMS 0
-#   define USE_NESTED_HOT_ARG(x)
-#  endif
-# else
-#  define USE_NESTED_HOT_ARG(x)
-# endif
+#undef KMP_NESTED_HOT_TEAMS
+#define KMP_NESTED_HOT_TEAMS 0
+#define USE_NESTED_HOT_ARG(x)
+#endif
+#else
+#define USE_NESTED_HOT_ARG(x)
+#endif
 #endif
 
 // Assume using BGET compare_exchange instruction instead of lock by default.
@@ -177,129 +177,134 @@ class kmp_stats_list;
 @{
 */
 
-// FIXME DOXYGEN... need to group these flags somehow (Making them an anonymous enum would do it...)
+// FIXME DOXYGEN... need to group these flags somehow (Making them an anonymous
+// enum would do it...)
 /*!
 Values for bit flags used in the ident_t to describe the fields.
 */
 /*! Use trampoline for internal microtasks */
-#define KMP_IDENT_IMB             0x01
+#define KMP_IDENT_IMB 0x01
 /*! Use c-style ident structure */
-#define KMP_IDENT_KMPC            0x02
+#define KMP_IDENT_KMPC 0x02
 /* 0x04 is no longer used */
 /*! Entry point generated by auto-parallelization */
-#define KMP_IDENT_AUTOPAR         0x08
+#define KMP_IDENT_AUTOPAR 0x08
 /*! Compiler generates atomic reduction option for kmpc_reduce* */
-#define KMP_IDENT_ATOMIC_REDUCE   0x10
+#define KMP_IDENT_ATOMIC_REDUCE 0x10
 /*! To mark a 'barrier' directive in user code */
-#define KMP_IDENT_BARRIER_EXPL    0x20
+#define KMP_IDENT_BARRIER_EXPL 0x20
 /*! To Mark implicit barriers. */
-#define KMP_IDENT_BARRIER_IMPL           0x0040
-#define KMP_IDENT_BARRIER_IMPL_MASK      0x01C0
-#define KMP_IDENT_BARRIER_IMPL_FOR       0x0040
-#define KMP_IDENT_BARRIER_IMPL_SECTIONS  0x00C0
+#define KMP_IDENT_BARRIER_IMPL 0x0040
+#define KMP_IDENT_BARRIER_IMPL_MASK 0x01C0
+#define KMP_IDENT_BARRIER_IMPL_FOR 0x0040
+#define KMP_IDENT_BARRIER_IMPL_SECTIONS 0x00C0
 
-#define KMP_IDENT_BARRIER_IMPL_SINGLE    0x0140
+#define KMP_IDENT_BARRIER_IMPL_SINGLE 0x0140
 #define KMP_IDENT_BARRIER_IMPL_WORKSHARE 0x01C0
 
 /*!
  * The ident structure that describes a source location.
  */
 typedef struct ident {
-    kmp_int32 reserved_1;   /**<  might be used in Fortran; see above  */
-    kmp_int32 flags;        /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member  */
-    kmp_int32 reserved_2;   /**<  not really used in Fortran any more; see above */
+  kmp_int32 reserved_1; /**<  might be used in Fortran; see above  */
+  kmp_int32 flags; /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
+                      identifies this union member  */
+  kmp_int32 reserved_2; /**<  not really used in Fortran any more; see above */
 #if USE_ITT_BUILD
-                            /*  but currently used for storing region-specific ITT */
-                            /*  contextual information. */
+/*  but currently used for storing region-specific ITT */
+/*  contextual information. */
 #endif /* USE_ITT_BUILD */
-    kmp_int32 reserved_3;   /**< source[4] in Fortran, do not use for C++  */
-    char const *psource;    /**< String describing the source location.
-                            The string is composed of semi-colon separated fields which describe the source file,
-                            the function and a pair of line numbers that delimit the construct.
-                             */
+  kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++  */
+  char const *psource; /**< String describing the source location.
+                       The string is composed of semi-colon separated fields
+                       which describe the source file, the function and a pair
+                       of line numbers that delimit the construct. */
 } ident_t;
 /*!
 @}
 */
 
 // Some forward declarations.
-
-typedef union  kmp_team      kmp_team_t;
-typedef struct kmp_taskdata  kmp_taskdata_t;
-typedef union  kmp_task_team kmp_task_team_t;
-typedef union  kmp_team      kmp_team_p;
-typedef union  kmp_info      kmp_info_p;
-typedef union  kmp_root      kmp_root_p;
+typedef union kmp_team kmp_team_t;
+typedef struct kmp_taskdata kmp_taskdata_t;
+typedef union kmp_task_team kmp_task_team_t;
+typedef union kmp_team kmp_team_p;
+typedef union kmp_info kmp_info_p;
+typedef union kmp_root kmp_root_p;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
 
 /* Pack two 32-bit signed integers into a 64-bit signed integer */
 /* ToDo: Fix word ordering for big-endian machines. */
-#define KMP_PACK_64(HIGH_32,LOW_32) \
-    ( (kmp_int64) ((((kmp_uint64)(HIGH_32))<<32) | (kmp_uint64)(LOW_32)) )
-
+#define KMP_PACK_64(HIGH_32, LOW_32)                                           \
+  ((kmp_int64)((((kmp_uint64)(HIGH_32)) << 32) | (kmp_uint64)(LOW_32)))
 
-/*
- * Generic string manipulation macros.
- * Assume that _x is of type char *
- */
-#define SKIP_WS(_x)     { while (*(_x) == ' ' || *(_x) == '\t') (_x)++; }
-#define SKIP_DIGITS(_x) { while (*(_x) >= '0' && *(_x) <= '9') (_x)++; }
-#define SKIP_TO(_x,_c)  { while (*(_x) != '\0' && *(_x) != (_c)) (_x)++; }
+// Generic string manipulation macros. Assume that _x is of type char *
+#define SKIP_WS(_x)                                                            \
+  {                                                                            \
+    while (*(_x) == ' ' || *(_x) == '\t')                                      \
+      (_x)++;                                                                  \
+  }
+#define SKIP_DIGITS(_x)                                                        \
+  {                                                                            \
+    while (*(_x) >= '0' && *(_x) <= '9')                                       \
+      (_x)++;                                                                  \
+  }
+#define SKIP_TO(_x, _c)                                                        \
+  {                                                                            \
+    while (*(_x) != '\0' && *(_x) != (_c))                                     \
+      (_x)++;                                                                  \
+  }
 
 /* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
 
-#define KMP_MAX( x, y ) ( (x) > (y) ? (x) : (y) )
-#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
+#define KMP_MAX(x, y) ((x) > (y) ? (x) : (y))
+#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
 
 /* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-
-
 /* Enumeration types */
 
 enum kmp_state_timer {
-    ts_stop,
-    ts_start,
-    ts_pause,
+  ts_stop,
+  ts_start,
+  ts_pause,
 
-    ts_last_state
+  ts_last_state
 };
 
 enum dynamic_mode {
-    dynamic_default,
+  dynamic_default,
 #ifdef USE_LOAD_BALANCE
-    dynamic_load_balance,
+  dynamic_load_balance,
 #endif /* USE_LOAD_BALANCE */
-    dynamic_random,
-    dynamic_thread_limit,
-    dynamic_max
+  dynamic_random,
+  dynamic_thread_limit,
+  dynamic_max
 };
 
-/* external schedule constants, duplicate enum omp_sched in omp.h in order to not include it here */
+/* external schedule constants, duplicate enum omp_sched in omp.h in order to
+ * not include it here */
 #ifndef KMP_SCHED_TYPE_DEFINED
 #define KMP_SCHED_TYPE_DEFINED
 typedef enum kmp_sched {
-    kmp_sched_lower             = 0,     // lower and upper bounds are for routine parameter check
-    // Note: need to adjust __kmp_sch_map global array in case this enum is changed
-    kmp_sched_static            = 1,     // mapped to kmp_sch_static_chunked           (33)
-    kmp_sched_dynamic           = 2,     // mapped to kmp_sch_dynamic_chunked          (35)
-    kmp_sched_guided            = 3,     // mapped to kmp_sch_guided_chunked           (36)
-    kmp_sched_auto              = 4,     // mapped to kmp_sch_auto                     (38)
-    kmp_sched_upper_std         = 5,     // upper bound for standard schedules
-    kmp_sched_lower_ext         = 100,   // lower bound of Intel extension schedules
-    kmp_sched_trapezoidal       = 101,   // mapped to kmp_sch_trapezoidal              (39)
+  kmp_sched_lower = 0, // lower and upper bounds are for routine parameter check
+  // Note: need to adjust __kmp_sch_map global array in case enum is changed
+  kmp_sched_static = 1, // mapped to kmp_sch_static_chunked           (33)
+  kmp_sched_dynamic = 2, // mapped to kmp_sch_dynamic_chunked          (35)
+  kmp_sched_guided = 3, // mapped to kmp_sch_guided_chunked           (36)
+  kmp_sched_auto = 4, // mapped to kmp_sch_auto                     (38)
+  kmp_sched_upper_std = 5, // upper bound for standard schedules
+  kmp_sched_lower_ext = 100, // lower bound of Intel extension schedules
+  kmp_sched_trapezoidal = 101, // mapped to kmp_sch_trapezoidal (39)
 #if KMP_STATIC_STEAL_ENABLED
-    kmp_sched_static_steal      = 102,   // mapped to kmp_sch_static_steal             (44)
+  kmp_sched_static_steal = 102, // mapped to kmp_sch_static_steal (44)
 #endif
-    kmp_sched_upper,
-    kmp_sched_default = kmp_sched_static // default scheduling
+  kmp_sched_upper,
+  kmp_sched_default = kmp_sched_static // default scheduling
 } kmp_sched_t;
 #endif
 
@@ -308,149 +313,148 @@ typedef enum kmp_sched {
  * Describes the loop schedule to be used for a parallel for loop.
  */
 enum sched_type {
-    kmp_sch_lower                     = 32,   /**< lower bound for unordered values */
-    kmp_sch_static_chunked            = 33,
-    kmp_sch_static                    = 34,   /**< static unspecialized */
-    kmp_sch_dynamic_chunked           = 35,
-    kmp_sch_guided_chunked            = 36,   /**< guided unspecialized */
-    kmp_sch_runtime                   = 37,
-    kmp_sch_auto                      = 38,   /**< auto */
-    kmp_sch_trapezoidal               = 39,
-
-    /* accessible only through KMP_SCHEDULE environment variable */
-    kmp_sch_static_greedy             = 40,
-    kmp_sch_static_balanced           = 41,
-    /* accessible only through KMP_SCHEDULE environment variable */
-    kmp_sch_guided_iterative_chunked  = 42,
-    kmp_sch_guided_analytical_chunked = 43,
-
-    kmp_sch_static_steal              = 44,   /**< accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_lower = 32, /**< lower bound for unordered values */
+  kmp_sch_static_chunked = 33,
+  kmp_sch_static = 34, /**< static unspecialized */
+  kmp_sch_dynamic_chunked = 35,
+  kmp_sch_guided_chunked = 36, /**< guided unspecialized */
+  kmp_sch_runtime = 37,
+  kmp_sch_auto = 38, /**< auto */
+  kmp_sch_trapezoidal = 39,
+
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_static_greedy = 40,
+  kmp_sch_static_balanced = 41,
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_guided_iterative_chunked = 42,
+  kmp_sch_guided_analytical_chunked = 43,
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_static_steal = 44,
 
 #if OMP_45_ENABLED
-    kmp_sch_static_balanced_chunked   = 45,   /**< static with chunk adjustment (e.g., simd) */
+  /* static with chunk adjustment (e.g., simd) */
+  kmp_sch_static_balanced_chunked = 45,
 #endif
 
-    /* accessible only through KMP_SCHEDULE environment variable */
-    kmp_sch_upper                     = 46,   /**< upper bound for unordered values */
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_upper = 46, /**< upper bound for unordered values */
 
-    kmp_ord_lower                     = 64,   /**< lower bound for ordered values, must be power of 2 */
-    kmp_ord_static_chunked            = 65,
-    kmp_ord_static                    = 66,   /**< ordered static unspecialized */
-    kmp_ord_dynamic_chunked           = 67,
-    kmp_ord_guided_chunked            = 68,
-    kmp_ord_runtime                   = 69,
-    kmp_ord_auto                      = 70,   /**< ordered auto */
-    kmp_ord_trapezoidal               = 71,
-    kmp_ord_upper                     = 72,   /**< upper bound for ordered values */
-
-#if OMP_40_ENABLED
-    /* Schedules for Distribute construct */
-    kmp_distribute_static_chunked     = 91,   /**< distribute static chunked */
-    kmp_distribute_static             = 92,   /**< distribute static unspecialized */
-#endif
-
-    /*
-     * For the "nomerge" versions, kmp_dispatch_next*() will always return
-     * a single iteration/chunk, even if the loop is serialized.  For the
-     * schedule types listed above, the entire iteration vector is returned
-     * if the loop is serialized.  This doesn't work for gcc/gcomp sections.
-     */
-    kmp_nm_lower                      = 160,  /**< lower bound for nomerge values */
-
-    kmp_nm_static_chunked             = (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower),
-    kmp_nm_static                     = 162,  /**< static unspecialized */
-    kmp_nm_dynamic_chunked            = 163,
-    kmp_nm_guided_chunked             = 164,  /**< guided unspecialized */
-    kmp_nm_runtime                    = 165,
-    kmp_nm_auto                       = 166,  /**< auto */
-    kmp_nm_trapezoidal                = 167,
-
-    /* accessible only through KMP_SCHEDULE environment variable */
-    kmp_nm_static_greedy              = 168,
-    kmp_nm_static_balanced            = 169,
-    /* accessible only through KMP_SCHEDULE environment variable */
-    kmp_nm_guided_iterative_chunked   = 170,
-    kmp_nm_guided_analytical_chunked  = 171,
-    kmp_nm_static_steal               = 172,  /* accessible only through OMP_SCHEDULE environment variable */
-
-    kmp_nm_ord_static_chunked         = 193,
-    kmp_nm_ord_static                 = 194,  /**< ordered static unspecialized */
-    kmp_nm_ord_dynamic_chunked        = 195,
-    kmp_nm_ord_guided_chunked         = 196,
-    kmp_nm_ord_runtime                = 197,
-    kmp_nm_ord_auto                   = 198,  /**< auto */
-    kmp_nm_ord_trapezoidal            = 199,
-    kmp_nm_upper                      = 200,  /**< upper bound for nomerge values */
+  kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */
+  kmp_ord_static_chunked = 65,
+  kmp_ord_static = 66, /**< ordered static unspecialized */
+  kmp_ord_dynamic_chunked = 67,
+  kmp_ord_guided_chunked = 68,
+  kmp_ord_runtime = 69,
+  kmp_ord_auto = 70, /**< ordered auto */
+  kmp_ord_trapezoidal = 71,
+  kmp_ord_upper = 72, /**< upper bound for ordered values */
+
+#if OMP_40_ENABLED
+  /* Schedules for Distribute construct */
+  kmp_distribute_static_chunked = 91, /**< distribute static chunked */
+  kmp_distribute_static = 92, /**< distribute static unspecialized */
+#endif
+
+  /* For the "nomerge" versions, kmp_dispatch_next*() will always return a
+     single iteration/chunk, even if the loop is serialized. For the schedule
+     types listed above, the entire iteration vector is returned if the loop is
+     serialized. This doesn't work for gcc/gcomp sections. */
+  kmp_nm_lower = 160, /**< lower bound for nomerge values */
+
+  kmp_nm_static_chunked =
+      (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower),
+  kmp_nm_static = 162, /**< static unspecialized */
+  kmp_nm_dynamic_chunked = 163,
+  kmp_nm_guided_chunked = 164, /**< guided unspecialized */
+  kmp_nm_runtime = 165,
+  kmp_nm_auto = 166, /**< auto */
+  kmp_nm_trapezoidal = 167,
+
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_nm_static_greedy = 168,
+  kmp_nm_static_balanced = 169,
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_nm_guided_iterative_chunked = 170,
+  kmp_nm_guided_analytical_chunked = 171,
+  kmp_nm_static_steal =
+      172, /* accessible only through OMP_SCHEDULE environment variable */
+
+  kmp_nm_ord_static_chunked = 193,
+  kmp_nm_ord_static = 194, /**< ordered static unspecialized */
+  kmp_nm_ord_dynamic_chunked = 195,
+  kmp_nm_ord_guided_chunked = 196,
+  kmp_nm_ord_runtime = 197,
+  kmp_nm_ord_auto = 198, /**< auto */
+  kmp_nm_ord_trapezoidal = 199,
+  kmp_nm_upper = 200, /**< upper bound for nomerge values */
 
 #if OMP_45_ENABLED
-    /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
-     * Since we need to distinguish the three possible cases (no modifier, monotonic modifier,
-     * nonmonotonic modifier), we need separate bits for each modifier.
-     * The absence of monotonic does not imply nonmonotonic, especially since 4.5 says
-     * that the behaviour of the "no modifier" case is implementation defined in 4.5,
-     * but will become "nonmonotonic" in 5.0.
-     *
-     * Since we're passing a full 32 bit value, we can use a couple of high bits for these
-     * flags; out of paranoia we avoid the sign bit.
-     *
-     * These modifiers can be or-ed into non-static schedules by the compiler to pass
-     * the additional information.
-     * They will be stripped early in the processing in __kmp_dispatch_init when setting up schedules, so
-     * most of the code won't ever see schedules with these bits set.
-     */
-    kmp_sch_modifier_monotonic      = (1<<29), /**< Set if the monotonic schedule modifier was present */
-    kmp_sch_modifier_nonmonotonic   = (1<<30), /**< Set if the nonmonotonic schedule modifier was present */
-
-# define SCHEDULE_WITHOUT_MODIFIERS(s) (enum sched_type)((s) & ~ (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic))
-# define SCHEDULE_HAS_MONOTONIC(s)     (((s) & kmp_sch_modifier_monotonic)    != 0)
-# define SCHEDULE_HAS_NONMONOTONIC(s)  (((s) & kmp_sch_modifier_nonmonotonic) != 0)
-# define SCHEDULE_HAS_NO_MODIFIERS(s)  (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0)
-#else
-    /* By doing this we hope to avoid multiple tests on OMP_45_ENABLED. Compilers can now eliminate tests on compile time
-     * constants and dead code that results from them, so we can leave code guarded by such an if in place.
-     */
-# define SCHEDULE_WITHOUT_MODIFIERS(s) (s)
-# define SCHEDULE_HAS_MONOTONIC(s)     false
-# define SCHEDULE_HAS_NONMONOTONIC(s)  false
-# define SCHEDULE_HAS_NO_MODIFIERS(s)  true
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. Since
+     we need to distinguish the three possible cases (no modifier, monotonic
+     modifier, nonmonotonic modifier), we need separate bits for each modifier.
+     The absence of monotonic does not imply nonmonotonic, especially since 4.5
+     says that the behaviour of the "no modifier" case is implementation defined
+     in 4.5, but will become "nonmonotonic" in 5.0.
+
+     Since we're passing a full 32 bit value, we can use a couple of high bits
+     for these flags; out of paranoia we avoid the sign bit.
+
+     These modifiers can be or-ed into non-static schedules by the compiler to
+     pass the additional information. They will be stripped early in the
+     processing in __kmp_dispatch_init when setting up schedules, so most of the
+     code won't ever see schedules with these bits set.  */
+  kmp_sch_modifier_monotonic =
+      (1 << 29), /**< Set if the monotonic schedule modifier was present */
+  kmp_sch_modifier_nonmonotonic =
+      (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */
+
+#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
+  (enum sched_type)(                                                           \
+      (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
+  (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0)
+#else
+/* By doing this we hope to avoid multiple tests on OMP_45_ENABLED. Compilers
+   can now eliminate tests on compile time constants and dead code that results
+   from them, so we can leave code guarded by such an if in place.  */
+#define SCHEDULE_WITHOUT_MODIFIERS(s) (s)
+#define SCHEDULE_HAS_MONOTONIC(s) false
+#define SCHEDULE_HAS_NONMONOTONIC(s) false
+#define SCHEDULE_HAS_NO_MODIFIERS(s) true
 #endif
 
-    kmp_sch_default = kmp_sch_static  /**< default scheduling algorithm */
+  kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */
 };
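A minimal standalone sketch of how these modifier bits combine with a schedule kind and how the macros above strip and test them. The base value kmp_sch_guided_chunked below is reduced to a single illustrative constant; only the modifier bits and macros are taken from the header.

#include <cassert>

// Reduced stand-in for the runtime's sched_type values; only what the sketch
// needs. The kmp_sch_guided_chunked value here is illustrative.
enum sched_type {
  kmp_sch_guided_chunked = 36,
  kmp_sch_modifier_monotonic = (1 << 29),
  kmp_sch_modifier_nonmonotonic = (1 << 30)
};

#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
  (enum sched_type)(                                                           \
      (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic))
#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0)
#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
  (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0)

int main() {
  // A compiler may OR a modifier into a non-static schedule kind...
  int s = kmp_sch_guided_chunked | kmp_sch_modifier_monotonic;
  assert(SCHEDULE_HAS_MONOTONIC(s));
  assert(!SCHEDULE_HAS_NO_MODIFIERS(s));
  // ...and stripping the high bits recovers the plain schedule kind.
  assert(SCHEDULE_WITHOUT_MODIFIERS(s) == kmp_sch_guided_chunked);
  return 0;
}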
 
 /* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */
 typedef struct kmp_r_sched {
-    enum sched_type r_sched_type;
-    int             chunk;
+  enum sched_type r_sched_type;
+  int chunk;
 } kmp_r_sched_t;
 
-extern enum sched_type __kmp_sch_map[]; // map OMP 3.0 schedule types with our internal schedule types
+// map OMP 3.0 schedule types with our internal schedule types
+extern enum sched_type __kmp_sch_map[];
 
 enum library_type {
-    library_none,
-    library_serial,
-    library_turnaround,
-    library_throughput
+  library_none,
+  library_serial,
+  library_turnaround,
+  library_throughput
 };
 
 #if KMP_OS_LINUX
 enum clock_function_type {
-    clock_function_gettimeofday,
-    clock_function_clock_gettime
+  clock_function_gettimeofday,
+  clock_function_clock_gettime
 };
 #endif /* KMP_OS_LINUX */
 
 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
-enum mic_type {
-    non_mic,
-    mic1,
-    mic2,
-    mic3,
-    dummy
-};
+enum mic_type { non_mic, mic1, mic2, mic3, dummy };
 #endif
 
-/* ------------------------------------------------------------------------ */
 /* -- fast reduction stuff ------------------------------------------------ */
 
 #undef KMP_FAST_REDUCTION_BARRIER
@@ -458,97 +462,94 @@ enum mic_type {
 
 #undef KMP_FAST_REDUCTION_CORE_DUO
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-    #define KMP_FAST_REDUCTION_CORE_DUO 1
+#define KMP_FAST_REDUCTION_CORE_DUO 1
 #endif
 
 enum _reduction_method {
-    reduction_method_not_defined = 0,
-    critical_reduce_block        = ( 1 << 8 ),
-    atomic_reduce_block          = ( 2 << 8 ),
-    tree_reduce_block            = ( 3 << 8 ),
-    empty_reduce_block           = ( 4 << 8 )
-};
-
-// description of the packed_reduction_method variable
-// the packed_reduction_method variable consists of two enum types variables that are packed together into 0-th byte and 1-st byte:
-// 0: ( packed_reduction_method & 0x000000FF ) is a 'enum barrier_type' value of barrier that will be used in fast reduction: bs_plain_barrier or bs_reduction_barrier
-// 1: ( packed_reduction_method & 0x0000FF00 ) is a reduction method that will be used in fast reduction;
-// reduction method is of 'enum _reduction_method' type and it's defined the way so that the bits of 0-th byte are empty,
-// so no need to execute a shift instruction while packing/unpacking
+  reduction_method_not_defined = 0,
+  critical_reduce_block = (1 << 8),
+  atomic_reduce_block = (2 << 8),
+  tree_reduce_block = (3 << 8),
+  empty_reduce_block = (4 << 8)
+};
+
+// Description of the packed_reduction_method variable:
+// The packed_reduction_method variable consists of two enum values packed
+// together into the 0-th byte and the 1-st byte:
+// 0: (packed_reduction_method & 0x000000FF) is an 'enum barrier_type' value of
+// the barrier that will be used in fast reduction: bs_plain_barrier or
+// bs_reduction_barrier
+// 1: (packed_reduction_method & 0x0000FF00) is the reduction method that will
+// be used in fast reduction;
+// The reduction method is of 'enum _reduction_method' type and is defined so
+// that the bits of the 0-th byte are empty, so no shift instruction is needed
+// while packing/unpacking
 
 #if KMP_FAST_REDUCTION_BARRIER
-    #define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method,barrier_type) \
-            ( ( reduction_method ) | ( barrier_type ) )
+#define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type)      \
+  ((reduction_method) | (barrier_type))
 
-    #define UNPACK_REDUCTION_METHOD(packed_reduction_method) \
-            ( ( enum _reduction_method )( ( packed_reduction_method ) & ( 0x0000FF00 ) ) )
+#define UNPACK_REDUCTION_METHOD(packed_reduction_method)                       \
+  ((enum _reduction_method)((packed_reduction_method) & (0x0000FF00)))
 
-    #define UNPACK_REDUCTION_BARRIER(packed_reduction_method) \
-            ( ( enum barrier_type )(      ( packed_reduction_method ) & ( 0x000000FF ) ) )
+#define UNPACK_REDUCTION_BARRIER(packed_reduction_method)                      \
+  ((enum barrier_type)((packed_reduction_method) & (0x000000FF)))
 #else
-    #define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method,barrier_type) \
-            ( reduction_method )
+#define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type)      \
+  (reduction_method)
 
-    #define UNPACK_REDUCTION_METHOD(packed_reduction_method) \
-            ( packed_reduction_method )
+#define UNPACK_REDUCTION_METHOD(packed_reduction_method)                       \
+  (packed_reduction_method)
 
-    #define UNPACK_REDUCTION_BARRIER(packed_reduction_method) \
-            ( bs_plain_barrier )
+#define UNPACK_REDUCTION_BARRIER(packed_reduction_method) (bs_plain_barrier)
 #endif
 
-#define TEST_REDUCTION_METHOD(packed_reduction_method,which_reduction_block) \
-            ( ( UNPACK_REDUCTION_METHOD( packed_reduction_method ) ) == ( which_reduction_block ) )
+#define TEST_REDUCTION_METHOD(packed_reduction_method, which_reduction_block)  \
+  ((UNPACK_REDUCTION_METHOD(packed_reduction_method)) ==                       \
+   (which_reduction_block))
 
 #if KMP_FAST_REDUCTION_BARRIER
-    #define TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER \
-            ( PACK_REDUCTION_METHOD_AND_BARRIER( tree_reduce_block, bs_reduction_barrier ) )
+#define TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER                               \
+  (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_reduction_barrier))
 
-    #define TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER \
-            ( PACK_REDUCTION_METHOD_AND_BARRIER( tree_reduce_block, bs_plain_barrier ) )
+#define TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER                                   \
+  (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_plain_barrier))
 #endif
 
 typedef int PACKED_REDUCTION_METHOD_T;
 
 /* -- end of fast reduction stuff ----------------------------------------- */
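To make the byte layout described above concrete, here is a small standalone sketch that packs and unpacks a reduction method and a barrier type with plain OR/AND, no shifts. The bs_* values are assumptions for the sketch, not the runtime's real enum barrier_type values; the _reduction_method values are copied from the definitions above.

#include <cassert>

// Stand-ins for the runtime types; the bs_* values are assumed for this sketch.
enum barrier_type { bs_plain_barrier = 0, bs_reduction_barrier = 1 };
enum _reduction_method {
  reduction_method_not_defined = 0,
  critical_reduce_block = (1 << 8),
  atomic_reduce_block = (2 << 8),
  tree_reduce_block = (3 << 8),
  empty_reduce_block = (4 << 8)
};
typedef int PACKED_REDUCTION_METHOD_T;

int main() {
  // Byte 1 carries the method, byte 0 the barrier type, so OR packs them.
  PACKED_REDUCTION_METHOD_T packed = tree_reduce_block | bs_reduction_barrier;
  // Unpacking is a mask of the corresponding byte, exactly as in the macros.
  assert((enum _reduction_method)(packed & 0x0000FF00) == tree_reduce_block);
  assert((enum barrier_type)(packed & 0x000000FF) == bs_reduction_barrier);
  return 0;
}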
 
-/* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-
 #if KMP_OS_WINDOWS
-# define USE_CBLKDATA
-# pragma warning( push )
-# pragma warning( disable: 271 310 )
-# include <windows.h>
-# pragma warning( pop )
+#define USE_CBLKDATA
+#pragma warning(push)
+#pragma warning(disable : 271 310)
+#include <windows.h>
+#pragma warning(pop)
 #endif
 
 #if KMP_OS_UNIX
-# include <pthread.h>
-# include <dlfcn.h>
+#include <dlfcn.h>
+#include <pthread.h>
 #endif
 
-/* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-
-/*
- * Only Linux* OS and Windows* OS support thread affinity.
- */
+/* Only Linux* OS and Windows* OS support thread affinity. */
 #if KMP_AFFINITY_SUPPORTED
 
 // GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later).
-# if KMP_OS_WINDOWS
-#  if _MSC_VER < 1600
+#if KMP_OS_WINDOWS
+#if _MSC_VER < 1600
 typedef struct GROUP_AFFINITY {
-    KAFFINITY Mask;
-    WORD Group;
-    WORD Reserved[3];
+  KAFFINITY Mask;
+  WORD Group;
+  WORD Reserved[3];
 } GROUP_AFFINITY;
-#  endif /* _MSC_VER < 1600 */
-#  if KMP_GROUP_AFFINITY
+#endif /* _MSC_VER < 1600 */
+#if KMP_GROUP_AFFINITY
 extern int __kmp_num_proc_groups;
-#  else
+#else
 static const int __kmp_num_proc_groups = 1;
-#  endif /* KMP_GROUP_AFFINITY */
+#endif /* KMP_GROUP_AFFINITY */
 typedef DWORD (*kmp_GetActiveProcessorCount_t)(WORD);
 extern kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount;
 
@@ -558,164 +559,170 @@ extern kmp_GetActiveProcessorGroupCount_
 typedef BOOL (*kmp_GetThreadGroupAffinity_t)(HANDLE, GROUP_AFFINITY *);
 extern kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity;
 
-typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *, GROUP_AFFINITY *);
+typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *,
+                                             GROUP_AFFINITY *);
 extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
-# endif /* KMP_OS_WINDOWS */
+#endif /* KMP_OS_WINDOWS */
 
-# if KMP_USE_HWLOC
+#if KMP_USE_HWLOC
 extern hwloc_topology_t __kmp_hwloc_topology;
 extern int __kmp_hwloc_error;
-# endif
+#endif
 
 extern size_t __kmp_affin_mask_size;
-# define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0)
-# define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0)
-# define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size)
-# define KMP_CPU_SET_ITERATE(i,mask) \
-    for (i = (mask)->begin(); i != (mask)->end() ; i = (mask)->next(i))
-# define KMP_CPU_SET(i,mask) (mask)->set(i)
-# define KMP_CPU_ISSET(i,mask) (mask)->is_set(i)
-# define KMP_CPU_CLR(i,mask) (mask)->clear(i)
-# define KMP_CPU_ZERO(mask) (mask)->zero()
-# define KMP_CPU_COPY(dest, src) (dest)->copy(src)
-# define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src)
-# define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not()
-# define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src)
-# define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask())
-# define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr)
-# define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr)
-# define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr)
-# define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr)
-# define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr)
-# define KMP_CPU_INDEX(arr,i) __kmp_affinity_dispatch->index_mask_array(arr, i)
-# define KMP_CPU_ALLOC_ARRAY(arr, n) (arr = __kmp_affinity_dispatch->allocate_mask_array(n))
-# define KMP_CPU_FREE_ARRAY(arr, n) __kmp_affinity_dispatch->deallocate_mask_array(arr)
-# define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) KMP_CPU_ALLOC_ARRAY(arr, n)
-# define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_CPU_FREE_ARRAY(arr, n)
-# define __kmp_get_system_affinity(mask, abort_bool) (mask)->get_system_affinity(abort_bool)
-# define __kmp_set_system_affinity(mask, abort_bool) (mask)->set_system_affinity(abort_bool)
-# define __kmp_get_proc_group(mask) (mask)->get_proc_group()
+#define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0)
+#define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0)
+#define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size)
+#define KMP_CPU_SET_ITERATE(i, mask)                                           \
+  for (i = (mask)->begin(); i != (mask)->end(); i = (mask)->next(i))
+#define KMP_CPU_SET(i, mask) (mask)->set(i)
+#define KMP_CPU_ISSET(i, mask) (mask)->is_set(i)
+#define KMP_CPU_CLR(i, mask) (mask)->clear(i)
+#define KMP_CPU_ZERO(mask) (mask)->zero()
+#define KMP_CPU_COPY(dest, src) (dest)->copy(src)
+#define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src)
+#define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not()
+#define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src)
+#define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask())
+#define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr)
+#define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr)
+#define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr)
+#define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr)
+#define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr)
+#define KMP_CPU_INDEX(arr, i) __kmp_affinity_dispatch->index_mask_array(arr, i)
+#define KMP_CPU_ALLOC_ARRAY(arr, n)                                            \
+  (arr = __kmp_affinity_dispatch->allocate_mask_array(n))
+#define KMP_CPU_FREE_ARRAY(arr, n)                                             \
+  __kmp_affinity_dispatch->deallocate_mask_array(arr)
+#define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) KMP_CPU_ALLOC_ARRAY(arr, n)
+#define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_CPU_FREE_ARRAY(arr, n)
+#define __kmp_get_system_affinity(mask, abort_bool)                            \
+  (mask)->get_system_affinity(abort_bool)
+#define __kmp_set_system_affinity(mask, abort_bool)                            \
+  (mask)->set_system_affinity(abort_bool)
+#define __kmp_get_proc_group(mask) (mask)->get_proc_group()
 
 class KMPAffinity {
 public:
-    class Mask {
-    public:
-        void* operator new(size_t n);
-        void operator delete(void* p);
-        void* operator new[](size_t n);
-        void operator delete[](void* p);
-        virtual ~Mask() {}
-        // Set bit i to 1
-        virtual void set(int i) {}
-        // Return bit i
-        virtual bool is_set(int i) const { return false; }
-        // Set bit i to 0
-        virtual void clear(int i) {}
-        // Zero out entire mask
-        virtual void zero() {}
-        // Copy src into this mask
-        virtual void copy(const Mask* src) {}
-        // this &= rhs
-        virtual void bitwise_and(const Mask* rhs) {}
-        // this |= rhs
-        virtual void bitwise_or(const Mask* rhs) {}
-        // this = ~this
-        virtual void bitwise_not() {}
-        // API for iterating over an affinity mask
-        // for (int i = mask->begin(); i != mask->end(); i = mask->next(i))
-        virtual int begin() const { return 0; }
-        virtual int end() const { return 0; }
-        virtual int next(int previous) const { return 0; }
-        // Set the system's affinity to this affinity mask's value
-        virtual int set_system_affinity(bool abort_on_error) const { return -1; }
-        // Set this affinity mask to the current system affinity
-        virtual int get_system_affinity(bool abort_on_error) { return -1; }
-        // Only 1 DWORD in the mask should have any procs set.
-        // Return the appropriate index, or -1 for an invalid mask.
-        virtual int get_proc_group() const { return -1; }
-    };
-    void* operator new(size_t n);
-    void operator delete(void* p);
-    // Need virtual destructor
-    virtual ~KMPAffinity() = default;
-    // Determine if affinity is capable
-    virtual void determine_capable(const char* env_var) {}
-    // Bind the current thread to os proc
-    virtual void bind_thread(int proc) {}
-    // Factory functions to allocate/deallocate a mask
-    virtual Mask* allocate_mask() { return nullptr; }
-    virtual void deallocate_mask(Mask* m) { }
-    virtual Mask* allocate_mask_array(int num) { return nullptr; }
-    virtual void deallocate_mask_array(Mask* m) { }
-    virtual Mask* index_mask_array(Mask* m, int index) { return nullptr; }
-    static void pick_api();
-    static void destroy_api();
-    enum api_type {
-        NATIVE_OS
+  class Mask {
+  public:
+    void *operator new(size_t n);
+    void operator delete(void *p);
+    void *operator new[](size_t n);
+    void operator delete[](void *p);
+    virtual ~Mask() {}
+    // Set bit i to 1
+    virtual void set(int i) {}
+    // Return bit i
+    virtual bool is_set(int i) const { return false; }
+    // Set bit i to 0
+    virtual void clear(int i) {}
+    // Zero out entire mask
+    virtual void zero() {}
+    // Copy src into this mask
+    virtual void copy(const Mask *src) {}
+    // this &= rhs
+    virtual void bitwise_and(const Mask *rhs) {}
+    // this |= rhs
+    virtual void bitwise_or(const Mask *rhs) {}
+    // this = ~this
+    virtual void bitwise_not() {}
+    // API for iterating over an affinity mask
+    // for (int i = mask->begin(); i != mask->end(); i = mask->next(i))
+    virtual int begin() const { return 0; }
+    virtual int end() const { return 0; }
+    virtual int next(int previous) const { return 0; }
+    // Set the system's affinity to this affinity mask's value
+    virtual int set_system_affinity(bool abort_on_error) const { return -1; }
+    // Set this affinity mask to the current system affinity
+    virtual int get_system_affinity(bool abort_on_error) { return -1; }
+    // Only 1 DWORD in the mask should have any procs set.
+    // Return the appropriate index, or -1 for an invalid mask.
+    virtual int get_proc_group() const { return -1; }
+  };
+  void *operator new(size_t n);
+  void operator delete(void *p);
+  // Need virtual destructor
+  virtual ~KMPAffinity() = default;
+  // Determine if affinity is capable
+  virtual void determine_capable(const char *env_var) {}
+  // Bind the current thread to os proc
+  virtual void bind_thread(int proc) {}
+  // Factory functions to allocate/deallocate a mask
+  virtual Mask *allocate_mask() { return nullptr; }
+  virtual void deallocate_mask(Mask *m) {}
+  virtual Mask *allocate_mask_array(int num) { return nullptr; }
+  virtual void deallocate_mask_array(Mask *m) {}
+  virtual Mask *index_mask_array(Mask *m, int index) { return nullptr; }
+  static void pick_api();
+  static void destroy_api();
+  enum api_type {
+    NATIVE_OS
 #if KMP_USE_HWLOC
-        , HWLOC
+    ,
+    HWLOC
 #endif
-    };
-    virtual api_type get_api_type() const { KMP_ASSERT(0); return NATIVE_OS; };
+  };
+  virtual api_type get_api_type() const {
+    KMP_ASSERT(0);
+    return NATIVE_OS;
+  };
+
 private:
-    static bool picked_api;
+  static bool picked_api;
 };
 
 typedef KMPAffinity::Mask kmp_affin_mask_t;
-extern KMPAffinity* __kmp_affinity_dispatch;
+extern KMPAffinity *__kmp_affinity_dispatch;
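The KMP_CPU_* macros above are thin wrappers over this virtual Mask API. A minimal standalone sketch of the set/test/iterate pattern they encode, using a toy fixed-width mask in place of the runtime's dispatch object:

#include <cassert>
#include <cstdint>

// Toy 64-bit mask; the runtime's Mask is virtual and sized at runtime.
struct ToyMask {
  uint64_t bits = 0;
  void set(int i) { bits |= (uint64_t)1 << i; }
  bool is_set(int i) const { return (bits >> i) & 1; }
  int begin() const { return next(-1); }
  int end() const { return 64; }
  int next(int prev) const {
    for (int i = prev + 1; i < 64; ++i)
      if (is_set(i))
        return i;
    return end();
  }
};

// Same shapes as KMP_CPU_SET / KMP_CPU_ISSET / KMP_CPU_SET_ITERATE above.
#define KMP_CPU_SET(i, mask) (mask)->set(i)
#define KMP_CPU_ISSET(i, mask) (mask)->is_set(i)
#define KMP_CPU_SET_ITERATE(i, mask)                                           \
  for (i = (mask)->begin(); i != (mask)->end(); i = (mask)->next(i))

int main() {
  ToyMask m;
  KMP_CPU_SET(3, &m);
  KMP_CPU_SET(7, &m);
  int count = 0, i;
  KMP_CPU_SET_ITERATE(i, &m) {
    assert(KMP_CPU_ISSET(i, &m)); // only set bits are visited
    ++count;
  }
  assert(count == 2);
  return 0;
}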
 
-//
 // Declare local char buffers with this size for printing debug and info
 // messages, using __kmp_affinity_print_mask().
-//
-#define KMP_AFFIN_MASK_PRINT_LEN        1024
+#define KMP_AFFIN_MASK_PRINT_LEN 1024
 
 enum affinity_type {
-    affinity_none = 0,
-    affinity_physical,
-    affinity_logical,
-    affinity_compact,
-    affinity_scatter,
-    affinity_explicit,
-    affinity_balanced,
-    affinity_disabled,  // not used outsize the env var parser
-    affinity_default
+  affinity_none = 0,
+  affinity_physical,
+  affinity_logical,
+  affinity_compact,
+  affinity_scatter,
+  affinity_explicit,
+  affinity_balanced,
+  affinity_disabled, // not used outside the env var parser
+  affinity_default
 };
 
 enum affinity_gran {
-    affinity_gran_fine = 0,
-    affinity_gran_thread,
-    affinity_gran_core,
-    affinity_gran_package,
-    affinity_gran_node,
+  affinity_gran_fine = 0,
+  affinity_gran_thread,
+  affinity_gran_core,
+  affinity_gran_package,
+  affinity_gran_node,
 #if KMP_GROUP_AFFINITY
-    //
-    // The "group" granularity isn't necesssarily coarser than all of the
-    // other levels, but we put it last in the enum.
-    //
-    affinity_gran_group,
+  // The "group" granularity isn't necessarily coarser than all of the
+  // other levels, but we put it last in the enum.
+  affinity_gran_group,
 #endif /* KMP_GROUP_AFFINITY */
-    affinity_gran_default
+  affinity_gran_default
 };
 
 enum affinity_top_method {
-    affinity_top_method_all = 0, // try all (supported) methods, in order
+  affinity_top_method_all = 0, // try all (supported) methods, in order
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-    affinity_top_method_apicid,
-    affinity_top_method_x2apicid,
+  affinity_top_method_apicid,
+  affinity_top_method_x2apicid,
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
-    affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
+  affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
 #if KMP_GROUP_AFFINITY
-    affinity_top_method_group,
+  affinity_top_method_group,
 #endif /* KMP_GROUP_AFFINITY */
-    affinity_top_method_flat,
+  affinity_top_method_flat,
 #if KMP_USE_HWLOC
-    affinity_top_method_hwloc,
+  affinity_top_method_hwloc,
 #endif
-    affinity_top_method_default
+  affinity_top_method_default
 };
 
-#define affinity_respect_mask_default   (-1)
+#define affinity_respect_mask_default (-1)
 
 extern enum affinity_type __kmp_affinity_type; /* Affinity type */
 extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */
@@ -726,57 +733,54 @@ extern int __kmp_affinity_compact; /* Af
 extern int __kmp_affinity_offset; /* Affinity offset value  */
 extern int __kmp_affinity_verbose; /* Was verbose specified for KMP_AFFINITY? */
 extern int __kmp_affinity_warnings; /* KMP_AFFINITY warnings enabled ? */
-extern int __kmp_affinity_respect_mask; /* Respect process' initial affinity mask? */
-extern char * __kmp_affinity_proclist; /* proc ID list */
+extern int __kmp_affinity_respect_mask; // Respect process' init affinity mask?
+extern char *__kmp_affinity_proclist; /* proc ID list */
 extern kmp_affin_mask_t *__kmp_affinity_masks;
 extern unsigned __kmp_affinity_num_masks;
 extern void __kmp_affinity_bind_thread(int which);
 
 extern kmp_affin_mask_t *__kmp_affin_fullMask;
-extern char const * __kmp_cpuinfo_file;
+extern char const *__kmp_cpuinfo_file;
 
 #endif /* KMP_AFFINITY_SUPPORTED */
 
 #if OMP_40_ENABLED
 
-//
 // This needs to be kept in sync with the values in omp.h !!!
-//
 typedef enum kmp_proc_bind_t {
-    proc_bind_false = 0,
-    proc_bind_true,
-    proc_bind_master,
-    proc_bind_close,
-    proc_bind_spread,
-    proc_bind_intel,    // use KMP_AFFINITY interface
-    proc_bind_default
+  proc_bind_false = 0,
+  proc_bind_true,
+  proc_bind_master,
+  proc_bind_close,
+  proc_bind_spread,
+  proc_bind_intel, // use KMP_AFFINITY interface
+  proc_bind_default
 } kmp_proc_bind_t;
 
 typedef struct kmp_nested_proc_bind_t {
-    kmp_proc_bind_t *bind_types;
-    int size;
-    int used;
+  kmp_proc_bind_t *bind_types;
+  int size;
+  int used;
 } kmp_nested_proc_bind_t;
 
 extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
 
 #endif /* OMP_40_ENABLED */
 
-# if KMP_AFFINITY_SUPPORTED
-#  define KMP_PLACE_ALL       (-1)
-#  define KMP_PLACE_UNDEFINED (-2)
-# endif /* KMP_AFFINITY_SUPPORTED */
+#if KMP_AFFINITY_SUPPORTED
+#define KMP_PLACE_ALL (-1)
+#define KMP_PLACE_UNDEFINED (-2)
+#endif /* KMP_AFFINITY_SUPPORTED */
 
 extern int __kmp_affinity_num_places;
 
-
 #if OMP_40_ENABLED
 typedef enum kmp_cancel_kind_t {
-    cancel_noreq = 0,
-    cancel_parallel = 1,
-    cancel_loop = 2,
-    cancel_sections = 3,
-    cancel_taskgroup = 4
+  cancel_noreq = 0,
+  cancel_parallel = 1,
+  cancel_loop = 2,
+  cancel_sections = 3,
+  cancel_taskgroup = 4
 } kmp_cancel_kind_t;
 #endif // OMP_40_ENABLED
 
@@ -795,167 +799,176 @@ extern int __kmp_hws_requested;
 extern int __kmp_hws_abs_flag; // absolute or per-item number requested
 
 /* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
 
-#define KMP_PAD(type, sz)     (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))
+#define KMP_PAD(type, sz)                                                      \
+  (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))
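KMP_PAD rounds sizeof(type) up to the next multiple of sz (e.g. a cache line). A quick standalone check of that arithmetic, using made-up struct sizes:

#include <cassert>

// Same definition as above.
#define KMP_PAD(type, sz)                                                      \
  (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))

struct five_bytes { char c[5]; };
struct eight_bytes { char c[8]; };

int main() {
  assert(KMP_PAD(five_bytes, 8) == 8);   // 5 rounds up to 8
  assert(KMP_PAD(eight_bytes, 8) == 8);  // already a multiple, unchanged
  assert(KMP_PAD(five_bytes, 64) == 64); // e.g. pad out to a 64-byte line
  return 0;
}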
 
-//
 // We need to avoid using -1 as a GTID as +1 is added to the gtid
 // when storing it in a lock, and the value 0 is reserved.
-//
-#define KMP_GTID_DNE            (-2)    /* Does not exist */
-#define KMP_GTID_SHUTDOWN       (-3)    /* Library is shutting down */
-#define KMP_GTID_MONITOR        (-4)    /* Monitor thread ID */
-#define KMP_GTID_UNKNOWN        (-5)    /* Is not known */
-#define KMP_GTID_MIN            (-6)    /* Minimal gtid for low bound check in DEBUG */
-
-#define __kmp_get_gtid()               __kmp_get_global_thread_id()
-#define __kmp_entry_gtid()             __kmp_get_global_thread_id_reg()
-
-#define __kmp_tid_from_gtid(gtid)     ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), \
-                                        __kmp_threads[ (gtid) ]->th.th_info.ds.ds_tid )
-
-#define __kmp_get_tid()               ( __kmp_tid_from_gtid( __kmp_get_gtid() ) )
-#define __kmp_gtid_from_tid(tid,team) ( KMP_DEBUG_ASSERT( (tid) >= 0 && (team) != NULL ), \
-                                        team -> t.t_threads[ (tid) ] -> th.th_info .ds.ds_gtid )
-
-#define __kmp_get_team()              ( __kmp_threads[ (__kmp_get_gtid()) ]-> th.th_team )
-#define __kmp_team_from_gtid(gtid)    ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), \
-                                        __kmp_threads[ (gtid) ]-> th.th_team )
-
-#define __kmp_thread_from_gtid(gtid)  ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), __kmp_threads[ (gtid) ] )
-#define __kmp_get_thread()            ( __kmp_thread_from_gtid( __kmp_get_gtid() ) )
-
-    // Returns current thread (pointer to kmp_info_t). In contrast to __kmp_get_thread(), it works
-    // with registered and not-yet-registered threads.
-#define __kmp_gtid_from_thread(thr)   ( KMP_DEBUG_ASSERT( (thr) != NULL ), \
-                                        (thr)->th.th_info.ds.ds_gtid )
+#define KMP_GTID_DNE (-2) /* Does not exist */
+#define KMP_GTID_SHUTDOWN (-3) /* Library is shutting down */
+#define KMP_GTID_MONITOR (-4) /* Monitor thread ID */
+#define KMP_GTID_UNKNOWN (-5) /* Is not known */
+#define KMP_GTID_MIN (-6) /* Minimal gtid for low bound check in DEBUG */
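A standalone illustration of the constraint stated above: if a lock word stores gtid + 1 so that 0 means "unowned", then a gtid of -1 would collide with the unowned value, which is why the reserved ids start at -2. The toy lock below only illustrates that convention; it is not the runtime's lock code.

#include <cassert>

// Toy lock word following the gtid+1 convention described in the comment above.
struct toy_lock {
  int owner_plus_one = 0; // 0 == unowned
};

static void toy_acquire(toy_lock *l, int gtid) { l->owner_plus_one = gtid + 1; }
static bool toy_is_unowned(const toy_lock *l) { return l->owner_plus_one == 0; }

int main() {
  toy_lock l;
  assert(toy_is_unowned(&l));
  toy_acquire(&l, 0); // gtid 0 is a valid owner, stored as 1
  assert(!toy_is_unowned(&l));
  // Storing gtid -1 would write 0 and look exactly like an unowned lock,
  // so -1 is never used as a gtid and the special values start at -2.
  return 0;
}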
+
+#define __kmp_get_gtid() __kmp_get_global_thread_id()
+#define __kmp_entry_gtid() __kmp_get_global_thread_id_reg()
+
+#define __kmp_tid_from_gtid(gtid)                                              \
+  (KMP_DEBUG_ASSERT((gtid) >= 0), __kmp_threads[(gtid)]->th.th_info.ds.ds_tid)
+
+#define __kmp_get_tid() (__kmp_tid_from_gtid(__kmp_get_gtid()))
+#define __kmp_gtid_from_tid(tid, team)                                         \
+  (KMP_DEBUG_ASSERT((tid) >= 0 && (team) != NULL),                             \
+   team->t.t_threads[(tid)]->th.th_info.ds.ds_gtid)
+
+#define __kmp_get_team() (__kmp_threads[(__kmp_get_gtid())]->th.th_team)
+#define __kmp_team_from_gtid(gtid)                                             \
+  (KMP_DEBUG_ASSERT((gtid) >= 0), __kmp_threads[(gtid)]->th.th_team)
+
+#define __kmp_thread_from_gtid(gtid)                                           \
+  (KMP_DEBUG_ASSERT((gtid) >= 0), __kmp_threads[(gtid)])
+#define __kmp_get_thread() (__kmp_thread_from_gtid(__kmp_get_gtid()))
+
+// Returns the gtid of the given thread (kmp_info_t *), read directly from the
+// thread structure, so it works with registered and not-yet-registered threads.
+#define __kmp_gtid_from_thread(thr)                                            \
+  (KMP_DEBUG_ASSERT((thr) != NULL), (thr)->th.th_info.ds.ds_gtid)
 
 // AT: Which way is correct?
 // AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc;
 // AT: 2. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc;
-#define __kmp_get_team_num_threads(gtid) ( __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc )
+#define __kmp_get_team_num_threads(gtid)                                       \
+  (__kmp_threads[(gtid)]->th.th_team->t.t_nproc)
 
-
-/* ------------------------------------------------------------------------ */
 /* ------------------------------------------------------------------------ */
 
-#define KMP_UINT64_MAX         (~((kmp_uint64)1<<((sizeof(kmp_uint64)*(1<<3))-1)))
+#define KMP_UINT64_MAX                                                         \
+  (~((kmp_uint64)1 << ((sizeof(kmp_uint64) * (1 << 3)) - 1)))
 
-#define KMP_MIN_NTH           1
+#define KMP_MIN_NTH 1
 
 #ifndef KMP_MAX_NTH
-#  if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX
-#    define KMP_MAX_NTH          PTHREAD_THREADS_MAX
-#  else
-#    define KMP_MAX_NTH          INT_MAX
-#  endif
+#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX
+#define KMP_MAX_NTH PTHREAD_THREADS_MAX
+#else
+#define KMP_MAX_NTH INT_MAX
+#endif
 #endif /* KMP_MAX_NTH */
 
 #ifdef PTHREAD_STACK_MIN
-# define KMP_MIN_STKSIZE         PTHREAD_STACK_MIN
+#define KMP_MIN_STKSIZE PTHREAD_STACK_MIN
 #else
-# define KMP_MIN_STKSIZE         ((size_t)(32 * 1024))
+#define KMP_MIN_STKSIZE ((size_t)(32 * 1024))
 #endif
 
-#define KMP_MAX_STKSIZE          (~((size_t)1<<((sizeof(size_t)*(1<<3))-1)))
+#define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))
 
 #if KMP_ARCH_X86
-# define KMP_DEFAULT_STKSIZE     ((size_t)(2 * 1024 * 1024))
+#define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024))
 #elif KMP_ARCH_X86_64
-# define KMP_DEFAULT_STKSIZE     ((size_t)(4 * 1024 * 1024))
-# define KMP_BACKUP_STKSIZE      ((size_t)(2 * 1024 * 1024))
+#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
+#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024))
 #else
-# define KMP_DEFAULT_STKSIZE     ((size_t)(1024 * 1024))
+#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024))
 #endif
 
-#define KMP_DEFAULT_MALLOC_POOL_INCR    ((size_t) (1024 * 1024))
-#define KMP_MIN_MALLOC_POOL_INCR        ((size_t) (4 * 1024))
-#define KMP_MAX_MALLOC_POOL_INCR        (~((size_t)1<<((sizeof(size_t)*(1<<3))-1)))
+#define KMP_DEFAULT_MALLOC_POOL_INCR ((size_t)(1024 * 1024))
+#define KMP_MIN_MALLOC_POOL_INCR ((size_t)(4 * 1024))
+#define KMP_MAX_MALLOC_POOL_INCR                                               \
+  (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))
 
-#define KMP_MIN_STKOFFSET       (0)
-#define KMP_MAX_STKOFFSET       KMP_MAX_STKSIZE
+#define KMP_MIN_STKOFFSET (0)
+#define KMP_MAX_STKOFFSET KMP_MAX_STKSIZE
 #if KMP_OS_DARWIN
-# define KMP_DEFAULT_STKOFFSET  KMP_MIN_STKOFFSET
+#define KMP_DEFAULT_STKOFFSET KMP_MIN_STKOFFSET
 #else
-# define KMP_DEFAULT_STKOFFSET  CACHE_LINE
+#define KMP_DEFAULT_STKOFFSET CACHE_LINE
 #endif
 
-#define KMP_MIN_STKPADDING      (0)
-#define KMP_MAX_STKPADDING      (2 * 1024 * 1024)
+#define KMP_MIN_STKPADDING (0)
+#define KMP_MAX_STKPADDING (2 * 1024 * 1024)
 
-#define KMP_BLOCKTIME_MULTIPLIER     (1000)    /* number of blocktime units per second */
-#define KMP_MIN_BLOCKTIME            (0)
-#define KMP_MAX_BLOCKTIME            (INT_MAX) /* Must be this for "infinite" setting the work */
-#define KMP_DEFAULT_BLOCKTIME        (200)     /*  __kmp_blocktime is in milliseconds  */
+#define KMP_BLOCKTIME_MULTIPLIER                                               \
+  (1000) /* number of blocktime units per second */
+#define KMP_MIN_BLOCKTIME (0)
+#define KMP_MAX_BLOCKTIME                                                      \
+  (INT_MAX) /* Must be this for the "infinite" setting to work */
+#define KMP_DEFAULT_BLOCKTIME (200) /*  __kmp_blocktime is in milliseconds  */
 
 #if KMP_USE_MONITOR
-#define KMP_DEFAULT_MONITOR_STKSIZE  ((size_t)(64 * 1024))
-#define KMP_MIN_MONITOR_WAKEUPS      (1)       /* min number of times monitor wakes up per second */
-#define KMP_MAX_MONITOR_WAKEUPS      (1000)    /* maximum number of times monitor can wake up per second */
-
-/* Calculate new number of monitor wakeups for a specific block time based on previous monitor_wakeups */
-/* Only allow increasing number of wakeups */
-#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \
-                                 ( ((blocktime) == KMP_MAX_BLOCKTIME) ? (monitor_wakeups) : \
-                                   ((blocktime) == KMP_MIN_BLOCKTIME) ? KMP_MAX_MONITOR_WAKEUPS : \
-                                   ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime))) ? (monitor_wakeups) : \
-                                       (KMP_BLOCKTIME_MULTIPLIER) / (blocktime) )
-
-/* Calculate number of intervals for a specific block time based on monitor_wakeups */
-#define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups)  \
-                                 ( ( (blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1 ) /  \
-                                   (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) )
-#else
-# if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
-   // HW TSC is used to reduce overhead (clock tick instead of nanosecond).
-   extern kmp_uint64 __kmp_ticks_per_msec;
-#  if KMP_COMPILER_ICC
-#   define KMP_NOW() _rdtsc()
-#  else
-#   define KMP_NOW() __kmp_hardware_timestamp()
-#  endif
-#  define KMP_NOW_MSEC() (KMP_NOW()/__kmp_ticks_per_msec)
-#  define KMP_BLOCKTIME_INTERVAL() (__kmp_dflt_blocktime * __kmp_ticks_per_msec)
-#  define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW())
-# else
-   // System time is retrieved sporadically while blocking.
-   extern kmp_uint64 __kmp_now_nsec();
-#  define KMP_NOW() __kmp_now_nsec()
-#  define KMP_NOW_MSEC() (KMP_NOW()/KMP_USEC_PER_SEC)
-#  define KMP_BLOCKTIME_INTERVAL() (__kmp_dflt_blocktime * KMP_USEC_PER_SEC)
-#  define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
-# endif
-# define KMP_YIELD_NOW() (KMP_NOW_MSEC() / KMP_MAX(__kmp_dflt_blocktime, 1)  \
-                         % (__kmp_yield_on_count + __kmp_yield_off_count) < (kmp_uint32)__kmp_yield_on_count)
+#define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024))
+#define KMP_MIN_MONITOR_WAKEUPS (1) // min times monitor wakes up per second
+#define KMP_MAX_MONITOR_WAKEUPS (1000) // max times monitor can wake up per sec
+
+/* Calculate new number of monitor wakeups for a specific block time based on
+   previous monitor_wakeups. Only allow increasing number of wakeups */
+#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups)                 \
+  (((blocktime) == KMP_MAX_BLOCKTIME)                                          \
+       ? (monitor_wakeups)                                                     \
+       : ((blocktime) == KMP_MIN_BLOCKTIME)                                    \
+             ? KMP_MAX_MONITOR_WAKEUPS                                         \
+             : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime)))  \
+                   ? (monitor_wakeups)                                         \
+                   : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))
+
+/* Calculate number of intervals for a specific block time based on
+   monitor_wakeups */
+#define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups)               \
+  (((blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1) /        \
+   (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)))
+#else
+#if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+// HW TSC is used to reduce overhead (clock tick instead of nanosecond).
+extern kmp_uint64 __kmp_ticks_per_msec;
+#if KMP_COMPILER_ICC
+#define KMP_NOW() _rdtsc()
+#else
+#define KMP_NOW() __kmp_hardware_timestamp()
+#endif
+#define KMP_NOW_MSEC() (KMP_NOW() / __kmp_ticks_per_msec)
+#define KMP_BLOCKTIME_INTERVAL() (__kmp_dflt_blocktime * __kmp_ticks_per_msec)
+#define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW())
+#else
+// System time is retrieved sporadically while blocking.
+extern kmp_uint64 __kmp_now_nsec();
+#define KMP_NOW() __kmp_now_nsec()
+#define KMP_NOW_MSEC() (KMP_NOW() / KMP_USEC_PER_SEC)
+#define KMP_BLOCKTIME_INTERVAL() (__kmp_dflt_blocktime * KMP_USEC_PER_SEC)
+#define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
+#endif
+#define KMP_YIELD_NOW()                                                        \
+  (KMP_NOW_MSEC() / KMP_MAX(__kmp_dflt_blocktime, 1) %                         \
+       (__kmp_yield_on_count + __kmp_yield_off_count) <                        \
+   (kmp_uint32)__kmp_yield_on_count)
 #endif // KMP_USE_MONITOR
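A standalone worked example of the two monitor macros in the KMP_USE_MONITOR branch above, using the default 200 ms blocktime. The constants and macro bodies are copied from the definitions in this header.

#include <cassert>
#include <climits>

// Constants and macros copied from the definitions above.
#define KMP_BLOCKTIME_MULTIPLIER (1000)
#define KMP_MIN_BLOCKTIME (0)
#define KMP_MAX_BLOCKTIME (INT_MAX)
#define KMP_MIN_MONITOR_WAKEUPS (1)
#define KMP_MAX_MONITOR_WAKEUPS (1000)

#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups)                 \
  (((blocktime) == KMP_MAX_BLOCKTIME)                                          \
       ? (monitor_wakeups)                                                     \
       : ((blocktime) == KMP_MIN_BLOCKTIME)                                    \
             ? KMP_MAX_MONITOR_WAKEUPS                                         \
             : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime)))  \
                   ? (monitor_wakeups)                                         \
                   : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))

#define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups)               \
  (((blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1) /        \
   (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)))

int main() {
  int wakeups = KMP_MIN_MONITOR_WAKEUPS;
  // Default 200 ms blocktime: wakeups may only increase, to 1000/200 = 5/sec.
  wakeups = KMP_WAKEUPS_FROM_BLOCKTIME(200, wakeups);
  assert(wakeups == 5);
  // 200 ms at 5 wakeups per second is exactly one monitor interval.
  assert(KMP_INTERVALS_FROM_BLOCKTIME(200, wakeups) == 1);
  return 0;
}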
 
-#define KMP_MIN_STATSCOLS       40
-#define KMP_MAX_STATSCOLS       4096
-#define KMP_DEFAULT_STATSCOLS   80
+#define KMP_MIN_STATSCOLS 40
+#define KMP_MAX_STATSCOLS 4096
+#define KMP_DEFAULT_STATSCOLS 80
 
-#define KMP_MIN_INTERVAL        0
-#define KMP_MAX_INTERVAL        (INT_MAX-1)
-#define KMP_DEFAULT_INTERVAL    0
+#define KMP_MIN_INTERVAL 0
+#define KMP_MAX_INTERVAL (INT_MAX - 1)
+#define KMP_DEFAULT_INTERVAL 0
 
-#define KMP_MIN_CHUNK           1
-#define KMP_MAX_CHUNK           (INT_MAX-1)
-#define KMP_DEFAULT_CHUNK       1
+#define KMP_MIN_CHUNK 1
+#define KMP_MAX_CHUNK (INT_MAX - 1)
+#define KMP_DEFAULT_CHUNK 1
 
-#define KMP_MIN_INIT_WAIT       1
-#define KMP_MAX_INIT_WAIT       (INT_MAX/2)
-#define KMP_DEFAULT_INIT_WAIT   2048U
+#define KMP_MIN_INIT_WAIT 1
+#define KMP_MAX_INIT_WAIT (INT_MAX / 2)
+#define KMP_DEFAULT_INIT_WAIT 2048U
 
-#define KMP_MIN_NEXT_WAIT       1
-#define KMP_MAX_NEXT_WAIT       (INT_MAX/2)
-#define KMP_DEFAULT_NEXT_WAIT   1024U
+#define KMP_MIN_NEXT_WAIT 1
+#define KMP_MAX_NEXT_WAIT (INT_MAX / 2)
+#define KMP_DEFAULT_NEXT_WAIT 1024U
 
-#define KMP_DFLT_DISP_NUM_BUFF  7
-#define KMP_MAX_ORDERED         8
+#define KMP_DFLT_DISP_NUM_BUFF 7
+#define KMP_MAX_ORDERED 8
 
-#define KMP_MAX_FIELDS          32
+#define KMP_MAX_FIELDS 32
 
-#define KMP_MAX_BRANCH_BITS     31
+#define KMP_MAX_BRANCH_BITS 31
 
 #define KMP_MAX_ACTIVE_LEVELS_LIMIT INT_MAX
 
@@ -963,204 +976,231 @@ extern int __kmp_hws_abs_flag; // absolu
 
 #define KMP_MAX_TASK_PRIORITY_LIMIT INT_MAX
 
-/* Minimum number of threads before switch to TLS gtid (experimentally determined) */
+/* Minimum number of threads before switch to TLS gtid (experimentally
+   determined) */
 /* josh TODO: what about OS X* tuning? */
-#if   KMP_ARCH_X86 || KMP_ARCH_X86_64
-# define KMP_TLS_GTID_MIN     5
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#define KMP_TLS_GTID_MIN 5
 #else
-# define KMP_TLS_GTID_MIN     INT_MAX
+#define KMP_TLS_GTID_MIN INT_MAX
 #endif
 
-#define KMP_MASTER_TID(tid)      ( (tid) == 0 )
-#define KMP_WORKER_TID(tid)      ( (tid) != 0 )
+#define KMP_MASTER_TID(tid) ((tid) == 0)
+#define KMP_WORKER_TID(tid) ((tid) != 0)
 
-#define KMP_MASTER_GTID(gtid)    ( __kmp_tid_from_gtid((gtid)) == 0 )
-#define KMP_WORKER_GTID(gtid)    ( __kmp_tid_from_gtid((gtid)) != 0 )
-#define KMP_UBER_GTID(gtid)                                           \
-    (                                                                 \
-        KMP_DEBUG_ASSERT( (gtid) >= KMP_GTID_MIN ),                   \
-        KMP_DEBUG_ASSERT( (gtid) < __kmp_threads_capacity ),          \
-        (gtid) >= 0 && __kmp_root[(gtid)] && __kmp_threads[(gtid)] && \
-        (__kmp_threads[(gtid)] == __kmp_root[(gtid)]->r.r_uber_thread)\
-    )
-#define KMP_INITIAL_GTID(gtid)   ( (gtid) == 0 )
+#define KMP_MASTER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) == 0)
+#define KMP_WORKER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) != 0)
+#define KMP_UBER_GTID(gtid)                                                    \
+  (KMP_DEBUG_ASSERT((gtid) >= KMP_GTID_MIN),                                   \
+   KMP_DEBUG_ASSERT((gtid) < __kmp_threads_capacity),                          \
+   (gtid) >= 0 && __kmp_root[(gtid)] && __kmp_threads[(gtid)] &&               \
+       (__kmp_threads[(gtid)] == __kmp_root[(gtid)]->r.r_uber_thread))
+#define KMP_INITIAL_GTID(gtid) ((gtid) == 0)
 
 #ifndef TRUE
-#define FALSE   0
-#define TRUE    (! FALSE)
+#define FALSE 0
+#define TRUE (!FALSE)
 #endif
 
 /* NOTE: all of the following constants must be even */
 
 #if KMP_OS_WINDOWS
-#  define KMP_INIT_WAIT    64U          /* initial number of spin-tests   */
-#  define KMP_NEXT_WAIT    32U          /* susequent number of spin-tests */
+#define KMP_INIT_WAIT 64U /* initial number of spin-tests   */
+#define KMP_NEXT_WAIT 32U /* subsequent number of spin-tests */
 #elif KMP_OS_CNK
-#  define KMP_INIT_WAIT    16U          /* initial number of spin-tests   */
-#  define KMP_NEXT_WAIT     8U          /* susequent number of spin-tests */
+#define KMP_INIT_WAIT 16U /* initial number of spin-tests   */
+#define KMP_NEXT_WAIT 8U /* subsequent number of spin-tests */
 #elif KMP_OS_LINUX
-#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
-#  define KMP_NEXT_WAIT   512U          /* susequent number of spin-tests */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests   */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
 #elif KMP_OS_DARWIN
 /* TODO: tune for KMP_OS_DARWIN */
-#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
-#  define KMP_NEXT_WAIT   512U          /* susequent number of spin-tests */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests   */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
 #elif KMP_OS_FREEBSD
 /* TODO: tune for KMP_OS_FREEBSD */
-#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
-#  define KMP_NEXT_WAIT   512U          /* susequent number of spin-tests */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests   */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
 #elif KMP_OS_NETBSD
 /* TODO: tune for KMP_OS_NETBSD */
-#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
-#  define KMP_NEXT_WAIT   512U          /* susequent number of spin-tests */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests   */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
 #endif
 
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 typedef struct kmp_cpuid {
-    kmp_uint32  eax;
-    kmp_uint32  ebx;
-    kmp_uint32  ecx;
-    kmp_uint32  edx;
+  kmp_uint32 eax;
+  kmp_uint32 ebx;
+  kmp_uint32 ecx;
+  kmp_uint32 edx;
 } kmp_cpuid_t;
-extern void __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p );
-# if KMP_ARCH_X86
-  extern void __kmp_x86_pause( void );
-# elif KMP_MIC
-  static void __kmp_x86_pause( void ) { _mm_delay_32( 100 ); }
-# else
-  static void __kmp_x86_pause( void ) { _mm_pause(); }
-# endif
-# define KMP_CPU_PAUSE() __kmp_x86_pause()
+extern void __kmp_x86_cpuid(int mode, int mode2, struct kmp_cpuid *p);
+#if KMP_ARCH_X86
+extern void __kmp_x86_pause(void);
+#elif KMP_MIC
+static void __kmp_x86_pause(void) { _mm_delay_32(100); }
+#else
+static void __kmp_x86_pause(void) { _mm_pause(); }
+#endif
+#define KMP_CPU_PAUSE() __kmp_x86_pause()
 #elif KMP_ARCH_PPC64
-# define KMP_PPC64_PRI_LOW() __asm__ volatile ("or 1, 1, 1")
-# define KMP_PPC64_PRI_MED() __asm__ volatile ("or 2, 2, 2")
-# define KMP_PPC64_PRI_LOC_MB() __asm__ volatile ("" : : : "memory")
-# define KMP_CPU_PAUSE() do { KMP_PPC64_PRI_LOW(); KMP_PPC64_PRI_MED(); KMP_PPC64_PRI_LOC_MB(); } while (0)
+#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
+#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
+#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
+#define KMP_CPU_PAUSE()                                                        \
+  do {                                                                         \
+    KMP_PPC64_PRI_LOW();                                                       \
+    KMP_PPC64_PRI_MED();                                                       \
+    KMP_PPC64_PRI_LOC_MB();                                                    \
+  } while (0)
 #else
-# define KMP_CPU_PAUSE()        /* nothing to do */
+#define KMP_CPU_PAUSE() /* nothing to do */
 #endif
 
-#define KMP_INIT_YIELD(count)           { (count) = __kmp_yield_init; }
+#define KMP_INIT_YIELD(count)                                                  \
+  { (count) = __kmp_yield_init; }
 
-#define KMP_YIELD(cond)                 { KMP_CPU_PAUSE(); __kmp_yield( (cond) ); }
-
-// Note the decrement of 2 in the following Macros.  With KMP_LIBRARY=turnaround,
-// there should be no yielding since the starting value from KMP_INIT_YIELD() is odd.
-
-#define KMP_YIELD_WHEN(cond,count)      { KMP_CPU_PAUSE(); (count) -= 2; \
-                                                if (!(count)) { KMP_YIELD(cond); (count) = __kmp_yield_next; } }
-#define KMP_YIELD_SPIN(count)           { KMP_CPU_PAUSE(); (count) -=2; \
-                                                if (!(count)) { KMP_YIELD(1); (count) = __kmp_yield_next; } }
+#define KMP_YIELD(cond)                                                        \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    __kmp_yield((cond));                                                       \
+  }
+
+// Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround,
+// there should be no yielding since initial value from KMP_INIT_YIELD() is odd.
+
+#define KMP_YIELD_WHEN(cond, count)                                            \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    (count) -= 2;                                                              \
+    if (!(count)) {                                                            \
+      KMP_YIELD(cond);                                                         \
+      (count) = __kmp_yield_next;                                              \
+    }                                                                          \
+  }
+#define KMP_YIELD_SPIN(count)                                                  \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    (count) -= 2;                                                              \
+    if (!(count)) {                                                            \
+      KMP_YIELD(1);                                                            \
+      (count) = __kmp_yield_next;                                              \
+    }                                                                          \
+  }
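A standalone illustration of the decrement-by-2 trick noted above: an even starting count periodically reaches zero and yields, while an odd starting count (as the turnaround setting uses, per the note above) steps over zero and never yields. The counter values and the fake yield function below are made up for the sketch.

#include <cassert>

static int yields = 0;
static void fake_yield() { ++yields; } // stands in for __kmp_yield()

// Same decrement-by-2 shape as KMP_YIELD_SPIN above, with a made-up reload.
#define TOY_YIELD_SPIN(count, reload)                                          \
  {                                                                            \
    (count) -= 2;                                                              \
    if (!(count)) {                                                            \
      fake_yield();                                                            \
      (count) = (reload);                                                      \
    }                                                                          \
  }

int main() {
  int count = 8; // even: reaches 0, so it yields every 4 spins
  for (int i = 0; i < 100; ++i)
    TOY_YIELD_SPIN(count, 8);
  assert(yields == 25);

  yields = 0;
  count = 9; // odd: steps from 1 to -1, never hits 0, never yields
  for (int i = 0; i < 100; ++i)
    TOY_YIELD_SPIN(count, 9);
  assert(yields == 0);
  return 0;
}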
 
 /* ------------------------------------------------------------------------ */
 /* Support datatypes for the orphaned construct nesting checks.             */
 /* ------------------------------------------------------------------------ */
 
 enum cons_type {
-    ct_none,
-    ct_parallel,
-    ct_pdo,
-    ct_pdo_ordered,
-    ct_psections,
-    ct_psingle,
-
-    /* the following must be left in order and not split up */
-    ct_taskq,
-    ct_task,                    /* really task inside non-ordered taskq, considered a worksharing type */
-    ct_task_ordered,            /* really task inside ordered taskq, considered a worksharing type */
-    /* the preceding must be left in order and not split up */
-
-    ct_critical,
-    ct_ordered_in_parallel,
-    ct_ordered_in_pdo,
-    ct_ordered_in_taskq,
-    ct_master,
-    ct_reduce,
-    ct_barrier
+  ct_none,
+  ct_parallel,
+  ct_pdo,
+  ct_pdo_ordered,
+  ct_psections,
+  ct_psingle,
+
+  /* the following must be left in order and not split up */
+  ct_taskq,
+  ct_task, // really task inside non-ordered taskq, considered worksharing type
+  ct_task_ordered, /* really task inside ordered taskq, considered a worksharing
+                      type */
+  /* the preceding must be left in order and not split up */
+
+  ct_critical,
+  ct_ordered_in_parallel,
+  ct_ordered_in_pdo,
+  ct_ordered_in_taskq,
+  ct_master,
+  ct_reduce,
+  ct_barrier
 };
 
 /* test to see if we are in a taskq construct */
-# define IS_CONS_TYPE_TASKQ( ct )       ( ((int)(ct)) >= ((int)ct_taskq) && ((int)(ct)) <= ((int)ct_task_ordered) )
-# define IS_CONS_TYPE_ORDERED( ct )     ((ct) == ct_pdo_ordered || (ct) == ct_task_ordered)
+#define IS_CONS_TYPE_TASKQ(ct)                                                 \
+  (((int)(ct)) >= ((int)ct_taskq) && ((int)(ct)) <= ((int)ct_task_ordered))
+#define IS_CONS_TYPE_ORDERED(ct)                                               \
+  ((ct) == ct_pdo_ordered || (ct) == ct_task_ordered)
 
 struct cons_data {
-    ident_t const     *ident;
-    enum cons_type     type;
-    int                prev;
-    kmp_user_lock_p    name;    /* address exclusively for critical section name comparison */
+  ident_t const *ident;
+  enum cons_type type;
+  int prev;
+  kmp_user_lock_p
+      name; /* address exclusively for critical section name comparison */
 };
 
 struct cons_header {
-    int                 p_top, w_top, s_top;
-    int                 stack_size, stack_top;
-    struct cons_data   *stack_data;
+  int p_top, w_top, s_top;
+  int stack_size, stack_top;
+  struct cons_data *stack_data;
 };
 
 struct kmp_region_info {
-    char                *text;
-    int                 offset[KMP_MAX_FIELDS];
-    int                 length[KMP_MAX_FIELDS];
+  char *text;
+  int offset[KMP_MAX_FIELDS];
+  int length[KMP_MAX_FIELDS];
 };
 
-
 /* ---------------------------------------------------------------------- */
 /* ---------------------------------------------------------------------- */
 
 #if KMP_OS_WINDOWS
-    typedef HANDLE              kmp_thread_t;
-    typedef DWORD               kmp_key_t;
+typedef HANDLE kmp_thread_t;
+typedef DWORD kmp_key_t;
 #endif /* KMP_OS_WINDOWS */
 
 #if KMP_OS_UNIX
-    typedef pthread_t           kmp_thread_t;
-    typedef pthread_key_t       kmp_key_t;
+typedef pthread_t kmp_thread_t;
+typedef pthread_key_t kmp_key_t;
 #endif
 
-extern kmp_key_t  __kmp_gtid_threadprivate_key;
+extern kmp_key_t __kmp_gtid_threadprivate_key;
 
 typedef struct kmp_sys_info {
-    long maxrss;          /* the maximum resident set size utilized (in kilobytes)     */
-    long minflt;          /* the number of page faults serviced without any I/O        */
-    long majflt;          /* the number of page faults serviced that required I/O      */
-    long nswap;           /* the number of times a process was "swapped" out of memory */
-    long inblock;         /* the number of times the file system had to perform input  */
-    long oublock;         /* the number of times the file system had to perform output */
-    long nvcsw;           /* the number of times a context switch was voluntarily      */
-    long nivcsw;          /* the number of times a context switch was forced           */
+  long maxrss; /* the maximum resident set size utilized (in kilobytes)     */
+  long minflt; /* the number of page faults serviced without any I/O        */
+  long majflt; /* the number of page faults serviced that required I/O      */
+  long nswap; /* the number of times a process was "swapped" out of memory */
+  long inblock; /* the number of times the file system had to perform input  */
+  long oublock; /* the number of times the file system had to perform output */
+  long nvcsw; /* the number of times a context switch was voluntary        */
+  long nivcsw; /* the number of times a context switch was forced           */
 } kmp_sys_info_t;
 
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 typedef struct kmp_cpuinfo {
-    int        initialized;  // If 0, other fields are not initialized.
-    int        signature;    // CPUID(1).EAX
-    int        family;       // CPUID(1).EAX[27:20] + CPUID(1).EAX[11:8] ( Extended Family + Family )
-    int        model;        // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended Model << 4 ) + Model)
-    int        stepping;     // CPUID(1).EAX[3:0] ( Stepping )
-    int        sse2;         // 0 if SSE2 instructions are not supported, 1 otherwise.
-    int        rtm;          // 0 if RTM instructions are not supported, 1 otherwise.
-    int        cpu_stackoffset;
-    int        apic_id;
-    int        physical_id;
-    int        logical_id;
-    kmp_uint64 frequency;    // Nominal CPU frequency in Hz.
-    char       name [3*sizeof (kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004)
+  int initialized; // If 0, other fields are not initialized.
+  int signature; // CPUID(1).EAX
+  int family; // CPUID(1).EAX[27:20]+CPUID(1).EAX[11:8] (Extended Family+Family)
+  int model; // (CPUID(1).EAX[19:16] << 4) + CPUID(1).EAX[7:4]
+  // ((Extended Model << 4) + Model)
+  int stepping; // CPUID(1).EAX[3:0] ( Stepping )
+  int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise.
+  int rtm; // 0 if RTM instructions are not supported, 1 otherwise.
+  int cpu_stackoffset;
+  int apic_id;
+  int physical_id;
+  int logical_id;
+  kmp_uint64 frequency; // Nominal CPU frequency in Hz.
+  char name[3 * sizeof(kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004)
 } kmp_cpuinfo_t;
 #endif
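A standalone decode of family/model/stepping from a CPUID(1).EAX value, following the bit fields noted in the comments above. The sample signature value is only an example.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t eax = 0x000306C3; // example CPUID(1).EAX signature
  // family = EAX[27:20] + EAX[11:8]  (Extended Family + Family)
  int family = (int)(((eax >> 20) & 0xFF) + ((eax >> 8) & 0xF));
  // model = (EAX[19:16] << 4) + EAX[7:4]  ((Extended Model << 4) + Model)
  int model = (int)((((eax >> 16) & 0xF) << 4) + ((eax >> 4) & 0xF));
  int stepping = (int)(eax & 0xF); // EAX[3:0]
  assert(family == 6 && model == 0x3C && stepping == 3);
  return 0;
}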
 
 #ifdef BUILD_TV
 
 struct tv_threadprivate {
-    /* Record type #1 */
-    void        *global_addr;
-    void        *thread_addr;
+  /* Record type #1 */
+  void *global_addr;
+  void *thread_addr;
 };
 
 struct tv_data {
-    struct tv_data      *next;
-    void                *type;
-    union tv_union {
-        struct tv_threadprivate tp;
-    } u;
+  struct tv_data *next;
+  void *type;
+  union tv_union {
+    struct tv_threadprivate tp;
+  } u;
 };
 
 extern kmp_key_t __kmp_tv_key;
@@ -1170,137 +1210,168 @@ extern kmp_key_t __kmp_tv_key;
 /* ------------------------------------------------------------------------ */
 
 #if USE_ITT_BUILD
-// We cannot include "kmp_itt.h" due to circular dependency. Declare the only required type here.
-// Later we will check the type meets requirements.
+// We cannot include "kmp_itt.h" due to circular dependency. Declare the only
+// required type here. Later we will check the type meets requirements.
 typedef int kmp_itt_mark_t;
 #define KMP_ITT_DEBUG 0
 #endif /* USE_ITT_BUILD */
 
-/* ------------------------------------------------------------------------ */
-
-/*
- * Taskq data structures
- */
+/* Taskq data structures */
 
-#define HIGH_WATER_MARK(nslots)         (((nslots) * 3) / 4)
-#define __KMP_TASKQ_THUNKS_PER_TH        1      /* num thunks that each thread can simultaneously execute from a task queue */
+#define HIGH_WATER_MARK(nslots) (((nslots)*3) / 4)
+// num thunks that each thread can simultaneously execute from a task queue
+#define __KMP_TASKQ_THUNKS_PER_TH 1
+
+/* flags for taskq_global_flags, kmp_task_queue_t tq_flags, kmpc_thunk_t
+   th_flags  */
+
+#define TQF_IS_ORDERED 0x0001 // __kmpc_taskq interface, taskq ordered
+//  __kmpc_taskq interface, taskq with lastprivate list
+#define TQF_IS_LASTPRIVATE 0x0002
+#define TQF_IS_NOWAIT 0x0004 // __kmpc_taskq interface, end taskq nowait
+// __kmpc_taskq interface, use heuristics to decide task queue size
+#define TQF_HEURISTICS 0x0008
+
+// __kmpc_taskq interface, reserved for future use
+#define TQF_INTERFACE_RESERVED1 0x0010
+// __kmpc_taskq interface, reserved for future use
+#define TQF_INTERFACE_RESERVED2 0x0020
+// __kmpc_taskq interface, reserved for future use
+#define TQF_INTERFACE_RESERVED3 0x0040
+// __kmpc_taskq interface, reserved for future use
+#define TQF_INTERFACE_RESERVED4 0x0080
+
+#define TQF_INTERFACE_FLAGS 0x00ff // all the __kmpc_taskq interface flags
+// internal/read by instrumentation; only used with TQF_IS_LASTPRIVATE
+#define TQF_IS_LAST_TASK 0x0100
+// internal use only; this thunk->th_task is the taskq_task
+#define TQF_TASKQ_TASK 0x0200
+// internal use only; must release worker threads once ANY queued task
+// exists (global)
+#define TQF_RELEASE_WORKERS 0x0400
+// internal use only; notify workers that master has finished enqueuing tasks
+#define TQF_ALL_TASKS_QUEUED 0x0800
+// internal use only: this queue encountered in parallel context: not serialized
+#define TQF_PARALLEL_CONTEXT 0x1000
+// internal use only; this queue is on the freelist and not in use
+#define TQF_DEALLOCATED 0x2000
 
-/*  flags for taskq_global_flags, kmp_task_queue_t tq_flags, kmpc_thunk_t th_flags  */
-
-#define TQF_IS_ORDERED          0x0001  /*  __kmpc_taskq interface, taskq ordered  */
-#define TQF_IS_LASTPRIVATE      0x0002  /*  __kmpc_taskq interface, taskq with lastprivate list  */
-#define TQF_IS_NOWAIT           0x0004  /*  __kmpc_taskq interface, end taskq nowait  */
-#define TQF_HEURISTICS          0x0008  /*  __kmpc_taskq interface, use heuristics to decide task queue size  */
-#define TQF_INTERFACE_RESERVED1 0x0010  /*  __kmpc_taskq interface, reserved for future use  */
-#define TQF_INTERFACE_RESERVED2 0x0020  /*  __kmpc_taskq interface, reserved for future use  */
-#define TQF_INTERFACE_RESERVED3 0x0040  /*  __kmpc_taskq interface, reserved for future use  */
-#define TQF_INTERFACE_RESERVED4 0x0080  /*  __kmpc_taskq interface, reserved for future use  */
-
-#define TQF_INTERFACE_FLAGS     0x00ff  /*  all the __kmpc_taskq interface flags  */
-
-#define TQF_IS_LAST_TASK        0x0100  /*  internal/read by instrumentation; only used with TQF_IS_LASTPRIVATE  */
-#define TQF_TASKQ_TASK          0x0200  /*  internal use only; this thunk->th_task is the taskq_task  */
-#define TQF_RELEASE_WORKERS     0x0400  /*  internal use only; must release worker threads once ANY queued task exists (global) */
-#define TQF_ALL_TASKS_QUEUED    0x0800  /*  internal use only; notify workers that master has finished enqueuing tasks */
-#define TQF_PARALLEL_CONTEXT    0x1000  /*  internal use only: this queue encountered in a parallel context: not serialized */
-#define TQF_DEALLOCATED         0x2000  /*  internal use only; this queue is on the freelist and not in use */
-
-#define TQF_INTERNAL_FLAGS      0x3f00  /*  all the internal use only flags  */
+#define TQF_INTERNAL_FLAGS 0x3f00 // all the internal use only flags
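
Illustrative sketch, not from the patch: the TQF_* values above are bit flags
that are OR-ed into a single tq_flags / th_flags word, and the two mask macros
split that word into its interface and internal halves. The standalone toy
program below only reuses the constant values shown in this hunk.

  #include <stdio.h>

  #define TQF_IS_ORDERED 0x0001      /* one interface flag */
  #define TQF_TASKQ_TASK 0x0200      /* one internal flag */
  #define TQF_INTERFACE_FLAGS 0x00ff /* mask for the interface half */
  #define TQF_INTERNAL_FLAGS 0x3f00  /* mask for the internal half */
  #define HIGH_WATER_MARK(nslots) (((nslots)*3) / 4)

  int main(void) {
    int th_flags = TQF_IS_ORDERED | TQF_TASKQ_TASK;
    printf("interface bits: 0x%04x\n", th_flags & TQF_INTERFACE_FLAGS); /* 0x0001 */
    printf("internal bits:  0x%04x\n", th_flags & TQF_INTERNAL_FLAGS);  /* 0x0200 */
    printf("HIGH_WATER_MARK(8) = %d\n", HIGH_WATER_MARK(8));            /* 6 */
    return 0;
  }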
 
 typedef struct KMP_ALIGN_CACHE kmpc_aligned_int32_t {
-    kmp_int32                      ai_data;
+  kmp_int32 ai_data;
 } kmpc_aligned_int32_t;
 
 typedef struct KMP_ALIGN_CACHE kmpc_aligned_queue_slot_t {
-    struct kmpc_thunk_t   *qs_thunk;
+  struct kmpc_thunk_t *qs_thunk;
 } kmpc_aligned_queue_slot_t;
 
 typedef struct kmpc_task_queue_t {
-        /* task queue linkage fields for n-ary tree of queues (locked with global taskq_tree_lck) */
-    kmp_lock_t                    tq_link_lck;          /*  lock for child link, child next/prev links and child ref counts */
-    union {
-        struct kmpc_task_queue_t *tq_parent;            /*  pointer to parent taskq, not locked */
-        struct kmpc_task_queue_t *tq_next_free;         /*  for taskq internal freelists, locked with global taskq_freelist_lck */
-    } tq;
-    volatile struct kmpc_task_queue_t *tq_first_child;  /*  pointer to linked-list of children, locked by tq's tq_link_lck */
-    struct kmpc_task_queue_t     *tq_next_child;        /*  next child in linked-list, locked by parent tq's tq_link_lck */
-    struct kmpc_task_queue_t     *tq_prev_child;        /*  previous child in linked-list, locked by parent tq's tq_link_lck */
-    volatile kmp_int32            tq_ref_count;         /*  reference count of threads with access to this task queue */
-                                                        /*  (other than the thread executing the kmpc_end_taskq call) */
-                                                        /*  locked by parent tq's tq_link_lck */
-
-        /* shared data for task queue */
-    struct kmpc_aligned_shared_vars_t    *tq_shareds;   /*  per-thread array of pointers to shared variable structures */
-                                                        /*  only one array element exists for all but outermost taskq */
-
-        /* bookkeeping for ordered task queue */
-    kmp_uint32                    tq_tasknum_queuing;   /*  ordered task number assigned while queuing tasks */
-    volatile kmp_uint32           tq_tasknum_serving;   /*  ordered number of next task to be served (executed) */
-
-        /* thunk storage management for task queue */
-    kmp_lock_t                    tq_free_thunks_lck;   /*  lock for thunk freelist manipulation */
-    struct kmpc_thunk_t          *tq_free_thunks;       /*  thunk freelist, chained via th.th_next_free  */
-    struct kmpc_thunk_t          *tq_thunk_space;       /*  space allocated for thunks for this task queue  */
-
-        /* data fields for queue itself */
-    kmp_lock_t                    tq_queue_lck;         /*  lock for [de]enqueue operations: tq_queue, tq_head, tq_tail, tq_nfull */
-    kmpc_aligned_queue_slot_t    *tq_queue;             /*  array of queue slots to hold thunks for tasks */
-    volatile struct kmpc_thunk_t *tq_taskq_slot;        /*  special slot for taskq task thunk, occupied if not NULL  */
-    kmp_int32                     tq_nslots;            /*  # of tq_thunk_space thunks alloc'd (not incl. tq_taskq_slot space)  */
-    kmp_int32                     tq_head;              /*  enqueue puts next item in here (index into tq_queue array) */
-    kmp_int32                     tq_tail;              /*  dequeue takes next item out of here (index into tq_queue array) */
-    volatile kmp_int32            tq_nfull;             /*  # of occupied entries in task queue right now  */
-    kmp_int32                     tq_hiwat;             /*  high-water mark for tq_nfull and queue scheduling  */
-    volatile kmp_int32            tq_flags;             /*  TQF_xxx  */
-
-        /* bookkeeping for outstanding thunks */
-    struct kmpc_aligned_int32_t  *tq_th_thunks;         /*  per-thread array for # of regular thunks currently being executed */
-    kmp_int32                     tq_nproc;             /*  number of thunks in the th_thunks array */
+  /* task queue linkage fields for n-ary tree of queues (locked with global
+     taskq_tree_lck) */
+  kmp_lock_t tq_link_lck; /* lock for child link, child next/prev links and
+                             child ref counts */
+  union {
+    struct kmpc_task_queue_t *tq_parent; // pointer to parent taskq, not locked
+    // for taskq internal freelists, locked with global taskq_freelist_lck
+    struct kmpc_task_queue_t *tq_next_free;
+  } tq;
+  // pointer to linked-list of children, locked by tq's tq_link_lck
+  volatile struct kmpc_task_queue_t *tq_first_child;
+  // next child in linked-list, locked by parent tq's tq_link_lck
+  struct kmpc_task_queue_t *tq_next_child;
+  // previous child in linked-list, locked by parent tq's tq_link_lck
+  struct kmpc_task_queue_t *tq_prev_child;
+  // reference count of threads with access to this task queue
+  volatile kmp_int32 tq_ref_count;
+  /* (other than the thread executing the kmpc_end_taskq call) */
+  /* locked by parent tq's tq_link_lck */
+
+  /* shared data for task queue */
+  /* per-thread array of pointers to shared variable structures */
+  struct kmpc_aligned_shared_vars_t *tq_shareds;
+  /* only one array element exists for all but outermost taskq */
+
+  /* bookkeeping for ordered task queue */
+  kmp_uint32 tq_tasknum_queuing; // ordered task # assigned while queuing tasks
+  // ordered number of next task to be served (executed)
+  volatile kmp_uint32 tq_tasknum_serving;
+
+  /* thunk storage management for task queue */
+  kmp_lock_t tq_free_thunks_lck; /* lock for thunk freelist manipulation */
+  // thunk freelist, chained via th.th_next_free
+  struct kmpc_thunk_t *tq_free_thunks;
+  // space allocated for thunks for this task queue
+  struct kmpc_thunk_t *tq_thunk_space;
+
+  /* data fields for queue itself */
+  kmp_lock_t tq_queue_lck; /* lock for [de]enqueue operations: tq_queue,
+                              tq_head, tq_tail, tq_nfull */
+  /* array of queue slots to hold thunks for tasks */
+  kmpc_aligned_queue_slot_t *tq_queue;
+  volatile struct kmpc_thunk_t *tq_taskq_slot; /* special slot for taskq task
+                                                  thunk, occupied if not NULL */
+  kmp_int32 tq_nslots; /* # of tq_thunk_space thunks alloc'd (not incl.
+                          tq_taskq_slot space)  */
+  kmp_int32 tq_head; // enqueue puts item here (index into tq_queue array)
+  kmp_int32 tq_tail; // dequeue takes item from here (index into tq_queue array)
+  volatile kmp_int32 tq_nfull; // # of occupied entries in task queue right now
+  kmp_int32 tq_hiwat; /* high-water mark for tq_nfull and queue scheduling  */
+  volatile kmp_int32 tq_flags; /*  TQF_xxx  */
+
+  /* bookkeeping for outstanding thunks */
+
+  /* per-thread array for # of regular thunks currently being executed */
+  struct kmpc_aligned_int32_t *tq_th_thunks;
+  kmp_int32 tq_nproc; /* number of thunks in the th_thunks array */
 
-        /* statistics library bookkeeping */
-    ident_t                       *tq_loc;              /*  source location information for taskq directive */
+  /* statistics library bookkeeping */
+  ident_t *tq_loc; /*  source location information for taskq directive */
 } kmpc_task_queue_t;
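
Illustrative sketch, not from the patch: the tq_queue / tq_head / tq_tail /
tq_nfull / tq_nslots fields above describe a bounded slot array with separate
enqueue and dequeue indices. The toy code below shows that bookkeeping under
the assumption of circular indexing; the real enqueue/dequeue paths live in
kmp_taskq.cpp and also take tq_queue_lck, which is omitted here.

  #include <stdio.h>

  typedef struct toy_queue { /* stand-in for kmpc_task_queue_t bookkeeping */
    int head, tail, nfull, nslots, hiwat;
    int slots[8]; /* stand-in for the kmpc_aligned_queue_slot_t array */
  } toy_queue_t;

  static int toy_enqueue(toy_queue_t *q, int v) {
    if (q->nfull == q->nslots)
      return 0; /* queue full */
    q->slots[q->head] = v;
    q->head = (q->head + 1) % q->nslots; /* assumed wrap-around */
    q->nfull++;
    return 1;
  }

  static int toy_dequeue(toy_queue_t *q, int *v) {
    if (q->nfull == 0)
      return 0; /* queue empty */
    *v = q->slots[q->tail];
    q->tail = (q->tail + 1) % q->nslots;
    q->nfull--;
    return 1;
  }

  int main(void) {
    toy_queue_t q = {0, 0, 0, 8, (8 * 3) / 4, {0}}; /* hiwat = HIGH_WATER_MARK(8) */
    int v;
    toy_enqueue(&q, 42);
    toy_dequeue(&q, &v);
    printf("dequeued %d, nfull=%d, hiwat=%d\n", v, q.nfull, q.hiwat);
    return 0;
  }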
 
-typedef void (*kmpc_task_t) (kmp_int32 global_tid, struct kmpc_thunk_t *thunk);
+typedef void (*kmpc_task_t)(kmp_int32 global_tid, struct kmpc_thunk_t *thunk);
 
 /*  sizeof_shareds passed as arg to __kmpc_taskq call  */
-typedef struct kmpc_shared_vars_t {             /*  aligned during dynamic allocation */
-    kmpc_task_queue_t         *sv_queue;
-    /*  (pointers to) shared vars  */
+typedef struct kmpc_shared_vars_t { /* aligned during dynamic allocation */
+  kmpc_task_queue_t *sv_queue; /* (pointers to) shared vars */
 } kmpc_shared_vars_t;
 
 typedef struct KMP_ALIGN_CACHE kmpc_aligned_shared_vars_t {
-    volatile struct kmpc_shared_vars_t     *ai_data;
+  volatile struct kmpc_shared_vars_t *ai_data;
 } kmpc_aligned_shared_vars_t;
 
-/*  sizeof_thunk passed as arg to kmpc_taskq call  */
-typedef struct kmpc_thunk_t {                   /*  aligned during dynamic allocation */
-    union {                                     /*  field used for internal freelists too  */
-        kmpc_shared_vars_t  *th_shareds;
-        struct kmpc_thunk_t *th_next_free;      /*  freelist of individual thunks within queue, head at tq_free_thunks  */
-    } th;
-    kmpc_task_t th_task;                        /*  taskq_task if flags & TQF_TASKQ_TASK  */
-    struct kmpc_thunk_t *th_encl_thunk;         /*  pointer to dynamically enclosing thunk on this thread's call stack */
-    kmp_int32 th_flags;                         /*  TQF_xxx (tq_flags interface plus possible internal flags)  */
-    kmp_int32 th_status;
-    kmp_uint32 th_tasknum;                      /*  task number assigned in order of queuing, used for ordered sections */
-    /*  private vars  */
+/* sizeof_thunk passed as arg to kmpc_taskq call */
+typedef struct kmpc_thunk_t { /* aligned during dynamic allocation */
+  union { /* field used for internal freelists too */
+    kmpc_shared_vars_t *th_shareds;
+    struct kmpc_thunk_t *th_next_free; /* freelist of individual thunks within
+                                          queue, head at tq_free_thunks */
+  } th;
+  kmpc_task_t th_task; /* taskq_task if flags & TQF_TASKQ_TASK */
+  struct kmpc_thunk_t *th_encl_thunk; /* pointer to dynamically enclosing thunk
+                                         on this thread's call stack */
+  // TQF_xxx (tq_flags interface plus possible internal flags)
+  kmp_int32 th_flags;
+
+  kmp_int32 th_status;
+  kmp_uint32 th_tasknum; /* task number assigned in order of queuing, used for
+                            ordered sections */
+  /* private vars */
 } kmpc_thunk_t;
 
 typedef struct KMP_ALIGN_CACHE kmp_taskq {
-    int                 tq_curr_thunk_capacity;
+  int tq_curr_thunk_capacity;
 
-    kmpc_task_queue_t  *tq_root;
-    kmp_int32           tq_global_flags;
+  kmpc_task_queue_t *tq_root;
+  kmp_int32 tq_global_flags;
 
-    kmp_lock_t          tq_freelist_lck;
-    kmpc_task_queue_t  *tq_freelist;
+  kmp_lock_t tq_freelist_lck;
+  kmpc_task_queue_t *tq_freelist;
 
-    kmpc_thunk_t      **tq_curr_thunk;
+  kmpc_thunk_t **tq_curr_thunk;
 } kmp_taskq_t;
 
 /* END Taskq data structures */
-/* --------------------------------------------------------------------------- */
 
 typedef kmp_int32 kmp_critical_name[8];
 
@@ -1308,18 +1379,21 @@ typedef kmp_int32 kmp_critical_name[8];
 @ingroup PARALLEL
 The type for a microtask which gets passed to @ref __kmpc_fork_call().
 The arguments to the outlined function are
-@param global_tid the global thread identity of the thread executing the function.
+@param global_tid the global thread identity of the thread executing the
+function.
 @param bound_tid  the local identity of the thread executing the function
 @param ... pointers to shared variables accessed by the function.
 */
-typedef void (*kmpc_micro)              ( kmp_int32 * global_tid, kmp_int32 * bound_tid, ... );
-typedef void (*kmpc_micro_bound)        ( kmp_int32 * bound_tid, kmp_int32 * bound_nth, ... );
+typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...);
+typedef void (*kmpc_micro_bound)(kmp_int32 *bound_tid, kmp_int32 *bound_nth,
+                                 ...);
 
 /*!
 @ingroup THREADPRIVATE
 @{
 */
-/* --------------------------------------------------------------------------- */
+/* ------------------------------------------------------------------------ */
 /* Threadprivate initialization/finalization function declarations */
 
 /*  for non-array objects:  __kmpc_threadprivate_register()  */
@@ -1328,487 +1402,505 @@ typedef void (*kmpc_micro_bound)
  Pointer to the constructor function.
  The first argument is the <tt>this</tt> pointer
 */
-typedef void *(*kmpc_ctor)    (void *);
+typedef void *(*kmpc_ctor)(void *);
 
 /*!
  Pointer to the destructor function.
  The first argument is the <tt>this</tt> pointer
 */
-typedef void (*kmpc_dtor)     (void * /*, size_t */); /* 2nd arg: magic number for KCC unused by Intel compiler */
+typedef void (*kmpc_dtor)(
+    void * /*, size_t */); /* 2nd arg: magic number for KCC unused by Intel
+                              compiler */
 /*!
  Pointer to an alternate constructor.
  The first argument is the <tt>this</tt> pointer.
 */
-typedef void *(*kmpc_cctor)   (void *, void *);
+typedef void *(*kmpc_cctor)(void *, void *);
 
-/*  for array objects: __kmpc_threadprivate_register_vec()  */
-                                /* First arg: "this" pointer */
-                                /* Last arg: number of array elements */
+/* for array objects: __kmpc_threadprivate_register_vec() */
+/* First arg: "this" pointer */
+/* Last arg: number of array elements */
 /*!
  Array constructor.
  First argument is the <tt>this</tt> pointer
  Second argument is the number of array elements.
 */
-typedef void *(*kmpc_ctor_vec)  (void *, size_t);
+typedef void *(*kmpc_ctor_vec)(void *, size_t);
 /*!
  Pointer to the array destructor function.
  The first argument is the <tt>this</tt> pointer
  Second argument is the number of array elements.
 */
-typedef void (*kmpc_dtor_vec)   (void *, size_t);
+typedef void (*kmpc_dtor_vec)(void *, size_t);
 /*!
  Array constructor.
  First argument is the <tt>this</tt> pointer
  Third argument is the number of array elements.
 */
-typedef void *(*kmpc_cctor_vec) (void *, void *, size_t); /* function unused by compiler */
+typedef void *(*kmpc_cctor_vec)(void *, void *,
+                                size_t); /* function unused by compiler */
 
 /*!
 @}
 */
 
-
-/* ------------------------------------------------------------------------ */
-
 /* keeps tracked of threadprivate cache allocations for cleanup later */
 typedef struct kmp_cached_addr {
-    void                      **addr;           /* address of allocated cache */
-    struct kmp_cached_addr     *next;           /* pointer to next cached address */
+  void **addr; /* address of allocated cache */
+  struct kmp_cached_addr *next; /* pointer to next cached address */
 } kmp_cached_addr_t;
 
 struct private_data {
-    struct private_data *next;          /* The next descriptor in the list      */
-    void                *data;          /* The data buffer for this descriptor  */
-    int                  more;          /* The repeat count for this descriptor */
-    size_t               size;          /* The data size for this descriptor    */
+  struct private_data *next; /* The next descriptor in the list      */
+  void *data; /* The data buffer for this descriptor  */
+  int more; /* The repeat count for this descriptor */
+  size_t size; /* The data size for this descriptor    */
 };
 
 struct private_common {
-    struct private_common     *next;
-    struct private_common     *link;
-    void                      *gbl_addr;
-    void                      *par_addr;        /* par_addr == gbl_addr for MASTER thread */
-    size_t                     cmn_size;
-};
-
-struct shared_common
-{
-    struct shared_common      *next;
-    struct private_data       *pod_init;
-    void                      *obj_init;
-    void                      *gbl_addr;
-    union {
-        kmpc_ctor              ctor;
-        kmpc_ctor_vec          ctorv;
-    } ct;
-    union {
-        kmpc_cctor             cctor;
-        kmpc_cctor_vec         cctorv;
-    } cct;
-    union {
-        kmpc_dtor              dtor;
-        kmpc_dtor_vec          dtorv;
-    } dt;
-    size_t                     vec_len;
-    int                        is_vec;
-    size_t                     cmn_size;
-};
-
-#define KMP_HASH_TABLE_LOG2     9                               /* log2 of the hash table size */
-#define KMP_HASH_TABLE_SIZE     (1 << KMP_HASH_TABLE_LOG2)      /* size of the hash table */
-#define KMP_HASH_SHIFT          3                               /* throw away this many low bits from the address */
-#define KMP_HASH(x)             ((((kmp_uintptr_t) x) >> KMP_HASH_SHIFT) & (KMP_HASH_TABLE_SIZE-1))
+  struct private_common *next;
+  struct private_common *link;
+  void *gbl_addr;
+  void *par_addr; /* par_addr == gbl_addr for MASTER thread */
+  size_t cmn_size;
+};
+
+struct shared_common {
+  struct shared_common *next;
+  struct private_data *pod_init;
+  void *obj_init;
+  void *gbl_addr;
+  union {
+    kmpc_ctor ctor;
+    kmpc_ctor_vec ctorv;
+  } ct;
+  union {
+    kmpc_cctor cctor;
+    kmpc_cctor_vec cctorv;
+  } cct;
+  union {
+    kmpc_dtor dtor;
+    kmpc_dtor_vec dtorv;
+  } dt;
+  size_t vec_len;
+  int is_vec;
+  size_t cmn_size;
+};
+
+#define KMP_HASH_TABLE_LOG2 9 /* log2 of the hash table size */
+#define KMP_HASH_TABLE_SIZE                                                    \
+  (1 << KMP_HASH_TABLE_LOG2) /* size of the hash table */
+#define KMP_HASH_SHIFT 3 /* throw away this many low bits from the address */
+#define KMP_HASH(x)                                                            \
+  ((((kmp_uintptr_t)x) >> KMP_HASH_SHIFT) & (KMP_HASH_TABLE_SIZE - 1))
 
 struct common_table {
-    struct  private_common      *data[ KMP_HASH_TABLE_SIZE ];
+  struct private_common *data[KMP_HASH_TABLE_SIZE];
 };
 
 struct shared_table {
-    struct  shared_common       *data[ KMP_HASH_TABLE_SIZE ];
+  struct shared_common *data[KMP_HASH_TABLE_SIZE];
 };
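
Quick standalone check, not from the patch, of the bucket math in KMP_HASH()
above (kmp_uintptr_t replaced by the standard uintptr_t): drop KMP_HASH_SHIFT
low bits of the address, then mask down to the 512-entry table.

  #include <stdint.h>
  #include <stdio.h>

  #define TABLE_LOG2 9                 /* KMP_HASH_TABLE_LOG2 */
  #define TABLE_SIZE (1 << TABLE_LOG2) /* KMP_HASH_TABLE_SIZE, 512 buckets */
  #define HASH_SHIFT 3                 /* KMP_HASH_SHIFT */
  #define HASH(x) ((((uintptr_t)(x)) >> HASH_SHIFT) & (TABLE_SIZE - 1))

  int main(void) {
    static int gbl_var; /* any global address works as a key */
    printf("bucket for &gbl_var: %u of %d\n", (unsigned)HASH(&gbl_var),
           TABLE_SIZE);
    return 0;
  }

Dropping the low bits discards the alignment bits that are identical for most
global addresses, which presumably spreads entries more evenly over the buckets.
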
-/* ------------------------------------------------------------------------ */
+
 /* ------------------------------------------------------------------------ */
 
 #if KMP_STATIC_STEAL_ENABLED
 typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
-    kmp_int32 count;
-    kmp_int32 ub;
-    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
-    kmp_int32 lb;
-    kmp_int32 st;
-    kmp_int32 tc;
-    kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put after ub */
-
-    // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on )
-    //    a) parm3 is properly aligned and
-    //    b) all parm1-4 are in the same cache line.
-    // Because of parm1-4 are used together, performance seems to be better
-    // if they are in the same line (not measured though).
-
-    struct KMP_ALIGN( 32 ) { // AC: changed 16 to 32 in order to simplify template
-        kmp_int32 parm1;     //     structures in kmp_dispatch.cpp. This should
-        kmp_int32 parm2;     //     make no real change at least while padding is off.
-        kmp_int32 parm3;
-        kmp_int32 parm4;
-    };
+  kmp_int32 count;
+  kmp_int32 ub;
+  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+  kmp_int32 lb;
+  kmp_int32 st;
+  kmp_int32 tc;
+  kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put
+                                     after ub */
+
+  // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on )
+  //    a) parm3 is properly aligned and
+  //    b) all parm1-4 are in the same cache line.
+  // Because parm1-4 are used together, performance seems to be better
+  // if they are in the same line (not measured though).
+
+  struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
+    kmp_int32 parm1; //     structures in kmp_dispatch.cpp. This should
+    kmp_int32 parm2; //     make no real change at least while padding is off.
+    kmp_int32 parm3;
+    kmp_int32 parm4;
+  };
 
-    kmp_uint32 ordered_lower;
-    kmp_uint32 ordered_upper;
+  kmp_uint32 ordered_lower;
+  kmp_uint32 ordered_upper;
 #if KMP_OS_WINDOWS
-    // This var can be placed in the hole between 'tc' and 'parm1', instead of 'static_steal_counter'.
-    // It would be nice to measure execution times.
-    // Conditional if/endif can be removed at all.
-    kmp_int32 last_upper;
+  // This var can be placed in the hole between 'tc' and 'parm1', instead of
+  // 'static_steal_counter'. It would be nice to measure execution times.
+  // The conditional if/endif could be removed entirely.
+  kmp_int32 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info32_t;
 
 typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
-    kmp_int64 count;   /* current chunk number for static and static-steal scheduling*/
-    kmp_int64 ub;      /* upper-bound */
-    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
-    kmp_int64 lb;      /* lower-bound */
-    kmp_int64 st;      /* stride */
-    kmp_int64 tc;      /* trip count (number of iterations) */
-    kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put after ub */
-
-    /* parm[1-4] are used in different ways by different scheduling algorithms */
-
-    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
-    //    a) parm3 is properly aligned and
-    //    b) all parm1-4 are in the same cache line.
-    // Because of parm1-4 are used together, performance seems to be better
-    // if they are in the same line (not measured though).
-
-    struct KMP_ALIGN( 32 ) {
-        kmp_int64 parm1;
-        kmp_int64 parm2;
-        kmp_int64 parm3;
-        kmp_int64 parm4;
-    };
+  kmp_int64 count; // current chunk number for static & static-steal scheduling
+  kmp_int64 ub; /* upper-bound */
+  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+  kmp_int64 lb; /* lower-bound */
+  kmp_int64 st; /* stride */
+  kmp_int64 tc; /* trip count (number of iterations) */
+  kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put
+                                     after ub */
+
+  /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  //    a) parm3 is properly aligned and
+  //    b) all parm1-4 are in the same cache line.
+  // Because parm1-4 are used together, performance seems to be better
+  // if they are in the same line (not measured though).
+
+  struct KMP_ALIGN(32) {
+    kmp_int64 parm1;
+    kmp_int64 parm2;
+    kmp_int64 parm3;
+    kmp_int64 parm4;
+  };
 
-    kmp_uint64 ordered_lower;
-    kmp_uint64 ordered_upper;
+  kmp_uint64 ordered_lower;
+  kmp_uint64 ordered_upper;
 #if KMP_OS_WINDOWS
-    // This var can be placed in the hole between 'tc' and 'parm1', instead of 'static_steal_counter'.
-    // It would be nice to measure execution times.
-    // Conditional if/endif can be removed at all.
-    kmp_int64 last_upper;
+  // This var can be placed in the hole between 'tc' and 'parm1', instead of
+  // 'static_steal_counter'. It would be nice to measure execution times.
+  // The conditional if/endif could be removed entirely.
+  kmp_int64 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info64_t;
 #else /* KMP_STATIC_STEAL_ENABLED */
 typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
-    kmp_int32 lb;
-    kmp_int32 ub;
-    kmp_int32 st;
-    kmp_int32 tc;
+  kmp_int32 lb;
+  kmp_int32 ub;
+  kmp_int32 st;
+  kmp_int32 tc;
+
+  kmp_int32 parm1;
+  kmp_int32 parm2;
+  kmp_int32 parm3;
+  kmp_int32 parm4;
 
-    kmp_int32 parm1;
-    kmp_int32 parm2;
-    kmp_int32 parm3;
-    kmp_int32 parm4;
-
-    kmp_int32 count;
+  kmp_int32 count;
 
-    kmp_uint32 ordered_lower;
-    kmp_uint32 ordered_upper;
+  kmp_uint32 ordered_lower;
+  kmp_uint32 ordered_upper;
 #if KMP_OS_WINDOWS
-    kmp_int32 last_upper;
+  kmp_int32 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info32_t;
 
 typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
-    kmp_int64 lb;      /* lower-bound */
-    kmp_int64 ub;      /* upper-bound */
-    kmp_int64 st;      /* stride */
-    kmp_int64 tc;      /* trip count (number of iterations) */
-
-    /* parm[1-4] are used in different ways by different scheduling algorithms */
-    kmp_int64 parm1;
-    kmp_int64 parm2;
-    kmp_int64 parm3;
-    kmp_int64 parm4;
+  kmp_int64 lb; /* lower-bound */
+  kmp_int64 ub; /* upper-bound */
+  kmp_int64 st; /* stride */
+  kmp_int64 tc; /* trip count (number of iterations) */
+
+  /* parm[1-4] are used in different ways by different scheduling algorithms */
+  kmp_int64 parm1;
+  kmp_int64 parm2;
+  kmp_int64 parm3;
+  kmp_int64 parm4;
 
-    kmp_int64 count;   /* current chunk number for static scheduling */
+  kmp_int64 count; /* current chunk number for static scheduling */
 
-    kmp_uint64 ordered_lower;
-    kmp_uint64 ordered_upper;
+  kmp_uint64 ordered_lower;
+  kmp_uint64 ordered_upper;
 #if KMP_OS_WINDOWS
-    kmp_int64 last_upper;
+  kmp_int64 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info64_t;
 #endif /* KMP_STATIC_STEAL_ENABLED */
 
 typedef struct KMP_ALIGN_CACHE dispatch_private_info {
-    union private_info {
-        dispatch_private_info32_t  p32;
-        dispatch_private_info64_t  p64;
-    } u;
-    enum sched_type schedule;  /* scheduling algorithm */
-    kmp_int32       ordered;   /* ordered clause specified */
-    kmp_int32       ordered_bumped;
-    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
-    struct dispatch_private_info * next; /* stack of buffers for nest of serial regions */
-    kmp_int32       nomerge;   /* don't merge iters if serialized */
-    kmp_int32       type_size; /* the size of types in private_info */
-    enum cons_type  pushed_ws;
+  union private_info {
+    dispatch_private_info32_t p32;
+    dispatch_private_info64_t p64;
+  } u;
+  enum sched_type schedule; /* scheduling algorithm */
+  kmp_int32 ordered; /* ordered clause specified */
+  kmp_int32 ordered_bumped;
+  // To retain the structure size after making ordered_iteration scalar
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
+  // Stack of buffers for nest of serial regions
+  struct dispatch_private_info *next;
+  kmp_int32 nomerge; /* don't merge iters if serialized */
+  kmp_int32 type_size; /* the size of types in private_info */
+  enum cons_type pushed_ws;
 } dispatch_private_info_t;
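
Hedged sketch, not from the patch: the union above carries either the 32-bit
or the 64-bit per-loop state, and type_size records the size of the types in
use. The selection logic below is an assumption for illustration only (the
real dispatch code is in kmp_dispatch.cpp); the toy types merely mimic the
p32/p64 split.

  #include <stdio.h>

  typedef struct { int lb, ub; } toy_info32;       /* mimics dispatch_private_info32_t */
  typedef struct { long long lb, ub; } toy_info64; /* mimics dispatch_private_info64_t */

  typedef struct {
    union {
      toy_info32 p32;
      toy_info64 p64;
    } u;
    int type_size; /* 4 -> p32 is active, 8 -> p64 is active (assumed) */
  } toy_private_info;

  static long long toy_upper_bound(const toy_private_info *pr) {
    return (pr->type_size == 4) ? (long long)pr->u.p32.ub : pr->u.p64.ub;
  }

  int main(void) {
    toy_private_info pr = {.u = {.p64 = {0, 99}}, .type_size = 8};
    printf("ub = %lld\n", toy_upper_bound(&pr));
    return 0;
  }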
 
 typedef struct dispatch_shared_info32 {
-    /* chunk index under dynamic, number of idle threads under static-steal;
-       iteration index otherwise */
-    volatile kmp_uint32      iteration;
-    volatile kmp_uint32      num_done;
-    volatile kmp_uint32      ordered_iteration;
-    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile kmp_uint32 iteration;
+  volatile kmp_uint32 num_done;
+  volatile kmp_uint32 ordered_iteration;
+  // Dummy to retain the structure size after making ordered_iteration scalar
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1];
 } dispatch_shared_info32_t;
 
 typedef struct dispatch_shared_info64 {
-    /* chunk index under dynamic, number of idle threads under static-steal;
-       iteration index otherwise */
-    volatile kmp_uint64      iteration;
-    volatile kmp_uint64      num_done;
-    volatile kmp_uint64      ordered_iteration;
-    kmp_int64   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile kmp_uint64 iteration;
+  volatile kmp_uint64 num_done;
+  volatile kmp_uint64 ordered_iteration;
+  // Dummy to retain the structure size after making ordered_iteration scalar
+  kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3];
 } dispatch_shared_info64_t;
 
 typedef struct dispatch_shared_info {
-    union shared_info {
-        dispatch_shared_info32_t  s32;
-        dispatch_shared_info64_t  s64;
-    } u;
-    volatile kmp_uint32     buffer_index;
+  union shared_info {
+    dispatch_shared_info32_t s32;
+    dispatch_shared_info64_t s64;
+  } u;
+  volatile kmp_uint32 buffer_index;
 #if OMP_45_ENABLED
-    volatile kmp_int32      doacross_buf_idx;  // teamwise index
-    volatile kmp_uint32    *doacross_flags;    // shared array of iteration flags (0/1)
-    kmp_int32               doacross_num_done; // count finished threads
+  volatile kmp_int32 doacross_buf_idx; // teamwise index
+  volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
+  kmp_int32 doacross_num_done; // count finished threads
 #endif
 #if KMP_USE_HWLOC
-    // When linking with libhwloc, the ORDERED EPCC test slows down on big
-    // machines (> 48 cores). Performance analysis showed that a cache thrash
-    // was occurring and this padding helps alleviate the problem.
-    char padding[64];
+  // When linking with libhwloc, the ORDERED EPCC test slows down on big
+  // machines (> 48 cores). Performance analysis showed that a cache thrash
+  // was occurring and this padding helps alleviate the problem.
+  char padding[64];
 #endif
 } dispatch_shared_info_t;
 
 typedef struct kmp_disp {
-    /* Vector for ORDERED SECTION */
-    void (*th_deo_fcn)( int * gtid, int * cid, ident_t *);
-    /* Vector for END ORDERED SECTION */
-    void (*th_dxo_fcn)( int * gtid, int * cid, ident_t *);
+  /* Vector for ORDERED SECTION */
+  void (*th_deo_fcn)(int *gtid, int *cid, ident_t *);
+  /* Vector for END ORDERED SECTION */
+  void (*th_dxo_fcn)(int *gtid, int *cid, ident_t *);
 
-    dispatch_shared_info_t  *th_dispatch_sh_current;
-    dispatch_private_info_t *th_dispatch_pr_current;
+  dispatch_shared_info_t *th_dispatch_sh_current;
+  dispatch_private_info_t *th_dispatch_pr_current;
 
-    dispatch_private_info_t *th_disp_buffer;
-    kmp_int32                th_disp_index;
+  dispatch_private_info_t *th_disp_buffer;
+  kmp_int32 th_disp_index;
 #if OMP_45_ENABLED
-    kmp_int32                th_doacross_buf_idx; // thread's doacross buffer index
-    volatile kmp_uint32     *th_doacross_flags;   // pointer to shared array of flags
-    union { // we can use union here because doacross cannot be used in nonmonotonic loops
-        kmp_int64           *th_doacross_info;    // info on loop bounds
-        kmp_lock_t          *th_steal_lock;       // lock used for chunk stealing (8-byte variable)
-    };
+  kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
+  volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
+  union { // we can use union here because doacross cannot be used in
+    // nonmonotonic loops
+    kmp_int64 *th_doacross_info; // info on loop bounds
+    kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable)
+  };
 #else
 #if KMP_STATIC_STEAL_ENABLED
-    kmp_lock_t              *th_steal_lock;       // lock used for chunk stealing (8-byte variable)
-    void* dummy_padding[1]; // make it 64 bytes on Intel(R) 64
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable)
+  void *dummy_padding[1]; // make it 64 bytes on Intel(R) 64
 #else
-    void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
+  void *dummy_padding[2]; // make it 64 bytes on Intel(R) 64
 #endif
 #endif
 #if KMP_USE_INTERNODE_ALIGNMENT
-    char more_padding[INTERNODE_CACHE_LINE];
+  char more_padding[INTERNODE_CACHE_LINE];
 #endif
 } kmp_disp_t;
 
 /* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-
 /* Barrier stuff */
 
 /* constants for barrier state update */
-#define KMP_INIT_BARRIER_STATE  0       /* should probably start from zero */
-#define KMP_BARRIER_SLEEP_BIT   0       /* bit used for suspend/sleep part of state */
-#define KMP_BARRIER_UNUSED_BIT  1       /* bit that must never be set for valid state */
-#define KMP_BARRIER_BUMP_BIT    2       /* lsb used for bump of go/arrived state */
-
-#define KMP_BARRIER_SLEEP_STATE         (1 << KMP_BARRIER_SLEEP_BIT)
-#define KMP_BARRIER_UNUSED_STATE        (1 << KMP_BARRIER_UNUSED_BIT)
-#define KMP_BARRIER_STATE_BUMP          (1 << KMP_BARRIER_BUMP_BIT)
+#define KMP_INIT_BARRIER_STATE 0 /* should probably start from zero */
+#define KMP_BARRIER_SLEEP_BIT 0 /* bit used for suspend/sleep part of state */
+#define KMP_BARRIER_UNUSED_BIT 1 // bit that must never be set for valid state
+#define KMP_BARRIER_BUMP_BIT 2 /* lsb used for bump of go/arrived state */
+
+#define KMP_BARRIER_SLEEP_STATE (1 << KMP_BARRIER_SLEEP_BIT)
+#define KMP_BARRIER_UNUSED_STATE (1 << KMP_BARRIER_UNUSED_BIT)
+#define KMP_BARRIER_STATE_BUMP (1 << KMP_BARRIER_BUMP_BIT)
 
 #if (KMP_BARRIER_SLEEP_BIT >= KMP_BARRIER_BUMP_BIT)
-# error "Barrier sleep bit must be smaller than barrier bump bit"
+#error "Barrier sleep bit must be smaller than barrier bump bit"
 #endif
 #if (KMP_BARRIER_UNUSED_BIT >= KMP_BARRIER_BUMP_BIT)
-# error "Barrier unused bit must be smaller than barrier bump bit"
+#error "Barrier unused bit must be smaller than barrier bump bit"
 #endif
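
Standalone sketch, not from the patch, of how the state bits above appear
intended to compose: the go/arrived counter advances in units of
KMP_BARRIER_STATE_BUMP (bit 2 and up), while the sleep bit stays in the low
flag bits that the two #error checks keep out of the counter.

  #include <stdio.h>

  #define KMP_INIT_BARRIER_STATE 0
  #define KMP_BARRIER_SLEEP_BIT 0
  #define KMP_BARRIER_BUMP_BIT 2
  #define KMP_BARRIER_SLEEP_STATE (1 << KMP_BARRIER_SLEEP_BIT)
  #define KMP_BARRIER_STATE_BUMP (1 << KMP_BARRIER_BUMP_BIT)

  int main(void) {
    unsigned long long b_go = KMP_INIT_BARRIER_STATE;
    b_go += KMP_BARRIER_STATE_BUMP;  /* one release: counter 0 -> 1, word 0 -> 4 */
    b_go |= KMP_BARRIER_SLEEP_STATE; /* a waiter went to sleep on this flag */
    printf("word 0x%llx, counter %llu, sleeping %d\n", b_go,
           b_go / KMP_BARRIER_STATE_BUMP, /* strip the low flag bits */
           (int)(b_go & KMP_BARRIER_SLEEP_STATE));
    return 0;
  }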
 
 // Constants for release barrier wait state: currently, hierarchical only
-#define KMP_BARRIER_NOT_WAITING        0  // Normal state; worker not in wait_sleep
-#define KMP_BARRIER_OWN_FLAG           1  // Normal state; worker waiting on own b_go flag in release
-#define KMP_BARRIER_PARENT_FLAG        2  // Special state; worker waiting on parent's b_go flag in release
-#define KMP_BARRIER_SWITCH_TO_OWN_FLAG 3  // Special state; tells worker to shift from parent to own b_go
-#define KMP_BARRIER_SWITCHING          4  // Special state; worker resets appropriate flag on wake-up
-
-#define KMP_NOT_SAFE_TO_REAP 0  // Thread th_reap_state: not safe to reap (tasking)
-#define KMP_SAFE_TO_REAP 1      // Thread th_reap_state: safe to reap (not tasking)
+#define KMP_BARRIER_NOT_WAITING 0 // Normal state; worker not in wait_sleep
+#define KMP_BARRIER_OWN_FLAG                                                   \
+  1 // Normal state; worker waiting on own b_go flag in release
+#define KMP_BARRIER_PARENT_FLAG                                                \
+  2 // Special state; worker waiting on parent's b_go flag in release
+#define KMP_BARRIER_SWITCH_TO_OWN_FLAG                                         \
+  3 // Special state; tells worker to shift from parent to own b_go
+#define KMP_BARRIER_SWITCHING                                                  \
+  4 // Special state; worker resets appropriate flag on wake-up
+
+#define KMP_NOT_SAFE_TO_REAP                                                   \
+  0 // Thread th_reap_state: not safe to reap (tasking)
+#define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking)
 
 enum barrier_type {
-    bs_plain_barrier = 0,       /* 0, All non-fork/join barriers (except reduction barriers if enabled) */
-    bs_forkjoin_barrier,        /* 1, All fork/join (parallel region) barriers */
-    #if KMP_FAST_REDUCTION_BARRIER
-        bs_reduction_barrier,   /* 2, All barriers that are used in reduction */
-    #endif // KMP_FAST_REDUCTION_BARRIER
-    bs_last_barrier             /* Just a placeholder to mark the end */
+  bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction
+                           barriers if enabled) */
+  bs_forkjoin_barrier, /* 1, All fork/join (parallel region) barriers */
+#if KMP_FAST_REDUCTION_BARRIER
+  bs_reduction_barrier, /* 2, All barriers that are used in reduction */
+#endif // KMP_FAST_REDUCTION_BARRIER
+  bs_last_barrier /* Just a placeholder to mark the end */
 };
 
 // to work with reduction barriers just like with plain barriers
 #if !KMP_FAST_REDUCTION_BARRIER
-    #define bs_reduction_barrier bs_plain_barrier
+#define bs_reduction_barrier bs_plain_barrier
 #endif // KMP_FAST_REDUCTION_BARRIER
 
-typedef enum kmp_bar_pat {      /* Barrier communication patterns */
-    bp_linear_bar = 0,          /* Single level (degenerate) tree */
-    bp_tree_bar = 1,            /* Balanced tree with branching factor 2^n */
-    bp_hyper_bar = 2,           /* Hypercube-embedded tree with min branching factor 2^n */
-    bp_hierarchical_bar = 3,    /* Machine hierarchy tree */
-    bp_last_bar = 4             /* Placeholder to mark the end */
+typedef enum kmp_bar_pat { /* Barrier communication patterns */
+  bp_linear_bar = 0, /* Single level (degenerate) tree */
+  bp_tree_bar = 1, /* Balanced tree with branching factor 2^n */
+  bp_hyper_bar = 2, /* Hypercube-embedded tree with min branching factor 2^n */
+  bp_hierarchical_bar = 3, /* Machine hierarchy tree */
+  bp_last_bar = 4 /* Placeholder to mark the end */
 } kmp_bar_pat_e;
 
-# define KMP_BARRIER_ICV_PUSH   1
+#define KMP_BARRIER_ICV_PUSH 1
 
 /* Record for holding the values of the internal controls stack records */
 typedef struct kmp_internal_control {
-    int           serial_nesting_level;  /* corresponds to the value of the th_team_serialized field */
-    kmp_int8      nested;                /* internal control for nested parallelism (per thread) */
-    kmp_int8      dynamic;               /* internal control for dynamic adjustment of threads (per thread) */
-    kmp_int8      bt_set;                /* internal control for whether blocktime is explicitly set */
-    int           blocktime;             /* internal control for blocktime */
+  int serial_nesting_level; /* corresponds to the value of the
+                               th_team_serialized field */
+  kmp_int8 nested; /* internal control for nested parallelism (per thread) */
+  kmp_int8 dynamic; /* internal control for dynamic adjustment of threads (per
+                       thread) */
+  kmp_int8
+      bt_set; /* internal control for whether blocktime is explicitly set */
+  int blocktime; /* internal control for blocktime */
 #if KMP_USE_MONITOR
-    int           bt_intervals;          /* internal control for blocktime intervals */
+  int bt_intervals; /* internal control for blocktime intervals */
 #endif
-    int           nproc;                 /* internal control for #threads for next parallel region (per thread) */
-    int           max_active_levels;     /* internal control for max_active_levels */
-    kmp_r_sched_t sched;                 /* internal control for runtime schedule {sched,chunk} pair */
+  int nproc; /* internal control for #threads for next parallel region (per
+                thread) */
+  int max_active_levels; /* internal control for max_active_levels */
+  kmp_r_sched_t
+      sched; /* internal control for runtime schedule {sched,chunk} pair */
 #if OMP_40_ENABLED
-    kmp_proc_bind_t proc_bind;           /* internal control for affinity  */
-    kmp_int32       default_device;      /* internal control for default device */
+  kmp_proc_bind_t proc_bind; /* internal control for affinity  */
+  kmp_int32 default_device; /* internal control for default device */
 #endif // OMP_40_ENABLED
-    struct kmp_internal_control *next;
+  struct kmp_internal_control *next;
 } kmp_internal_control_t;
 
-static inline void
-copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src ) {
-    *dst = *src;
+static inline void copy_icvs(kmp_internal_control_t *dst,
+                             kmp_internal_control_t *src) {
+  *dst = *src;
 }
 
 /* Thread barrier needs volatile barrier fields */
 typedef struct KMP_ALIGN_CACHE kmp_bstate {
-    // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all uses of it).
-    // It is not explicitly aligned below, because we *don't* want it to be padded -- instead,
-    // we fit b_go into the same cache line with th_fixed_icvs, enabling NGO cache lines
-    // stores in the hierarchical barrier.
-    kmp_internal_control_t th_fixed_icvs;          // Initial ICVs for the thread
-    // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with same NGO store
-    volatile kmp_uint64 b_go;                      // STATE => task should proceed (hierarchical)
-    KMP_ALIGN_CACHE volatile kmp_uint64 b_arrived; // STATE => task reached synch point.
-    kmp_uint32 *skip_per_level;
-    kmp_uint32 my_level;
-    kmp_int32 parent_tid;
-    kmp_int32 old_tid;
-    kmp_uint32 depth;
-    struct kmp_bstate *parent_bar;
-    kmp_team_t *team;
-    kmp_uint64 leaf_state;
-    kmp_uint32 nproc;
-    kmp_uint8 base_leaf_kids;
-    kmp_uint8 leaf_kids;
-    kmp_uint8 offset;
-    kmp_uint8 wait_flag;
-    kmp_uint8 use_oncore_barrier;
+  // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all
+  // uses of it). It is not explicitly aligned below, because we *don't* want
+  // it to be padded -- instead, we fit b_go into the same cache line with
+  // th_fixed_icvs, enabling NGO cache lines stores in the hierarchical barrier.
+  kmp_internal_control_t th_fixed_icvs; // Initial ICVs for the thread
+  // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with
+  // same NGO store
+  volatile kmp_uint64 b_go; // STATE => task should proceed (hierarchical)
+  KMP_ALIGN_CACHE volatile kmp_uint64
+      b_arrived; // STATE => task reached synch point.
+  kmp_uint32 *skip_per_level;
+  kmp_uint32 my_level;
+  kmp_int32 parent_tid;
+  kmp_int32 old_tid;
+  kmp_uint32 depth;
+  struct kmp_bstate *parent_bar;
+  kmp_team_t *team;
+  kmp_uint64 leaf_state;
+  kmp_uint32 nproc;
+  kmp_uint8 base_leaf_kids;
+  kmp_uint8 leaf_kids;
+  kmp_uint8 offset;
+  kmp_uint8 wait_flag;
+  kmp_uint8 use_oncore_barrier;
 #if USE_DEBUGGER
-    // The following field is intended for the debugger solely. Only the worker thread itself accesses this
-    // field: the worker increases it by 1 when it arrives to a barrier.
-    KMP_ALIGN_CACHE kmp_uint b_worker_arrived;
+  // The following field is intended for the debugger solely. Only the worker
+  // thread itself accesses this field: the worker increases it by 1 when it
+  // arrives at a barrier.
+  KMP_ALIGN_CACHE kmp_uint b_worker_arrived;
 #endif /* USE_DEBUGGER */
 } kmp_bstate_t;
 
 union KMP_ALIGN_CACHE kmp_barrier_union {
-    double       b_align;        /* use worst case alignment */
-    char         b_pad[ KMP_PAD(kmp_bstate_t, CACHE_LINE) ];
-    kmp_bstate_t bb;
+  double b_align; /* use worst case alignment */
+  char b_pad[KMP_PAD(kmp_bstate_t, CACHE_LINE)];
+  kmp_bstate_t bb;
 };
 
 typedef union kmp_barrier_union kmp_balign_t;
 
 /* Team barrier needs only non-volatile arrived counter */
 union KMP_ALIGN_CACHE kmp_barrier_team_union {
-    double       b_align;        /* use worst case alignment */
-    char         b_pad[ CACHE_LINE ];
-    struct {
-        kmp_uint64   b_arrived;       /* STATE => task reached synch point. */
+  double b_align; /* use worst case alignment */
+  char b_pad[CACHE_LINE];
+  struct {
+    kmp_uint64 b_arrived; /* STATE => task reached synch point. */
 #if USE_DEBUGGER
-        // The following two fields are indended for the debugger solely. Only master of the team accesses
-        // these fields: the first one is increased by 1 when master arrives to a barrier, the
-        // second one is increased by one when all the threads arrived.
-        kmp_uint     b_master_arrived;
-        kmp_uint     b_team_arrived;
+    // The following two fields are intended for the debugger solely. Only the
+    // master of the team accesses these fields: the first one is increased by
+    // 1 when the master arrives at a barrier, the second one is increased by
+    // one when all the threads have arrived.
+    kmp_uint b_master_arrived;
+    kmp_uint b_team_arrived;
 #endif
-    };
+  };
 };
 
 typedef union kmp_barrier_team_union kmp_balign_team_t;
 
-/*
- * Padding for Linux* OS pthreads condition variables and mutexes used to signal
- * threads when a condition changes.  This is to workaround an NPTL bug
- * where padding was added to pthread_cond_t which caused the initialization
- * routine to write outside of the structure if compiled on pre-NPTL threads.
- */
-
+/* Padding for Linux* OS pthreads condition variables and mutexes used to signal
+   threads when a condition changes.  This is to work around an NPTL bug where
+   padding was added to pthread_cond_t which caused the initialization routine
+   to write outside of the structure if compiled on pre-NPTL threads.  */
 #if KMP_OS_WINDOWS
-typedef struct kmp_win32_mutex
-{
-    /* The Lock */
-    CRITICAL_SECTION cs;
+typedef struct kmp_win32_mutex {
+  /* The Lock */
+  CRITICAL_SECTION cs;
 } kmp_win32_mutex_t;
 
-typedef struct kmp_win32_cond
-{
-    /* Count of the number of waiters. */
-    int waiters_count_;
-
-    /* Serialize access to <waiters_count_> */
-    kmp_win32_mutex_t waiters_count_lock_;
-
-    /* Number of threads to release via a <cond_broadcast> or a */
-    /* <cond_signal> */
-    int release_count_;
-
-    /* Keeps track of the current "generation" so that we don't allow */
-    /* one thread to steal all the "releases" from the broadcast. */
-    int wait_generation_count_;
-
-    /* A manual-reset event that's used to block and release waiting */
-    /* threads. */
-    HANDLE event_;
+typedef struct kmp_win32_cond {
+  /* Count of the number of waiters. */
+  int waiters_count_;
+
+  /* Serialize access to <waiters_count_> */
+  kmp_win32_mutex_t waiters_count_lock_;
+
+  /* Number of threads to release via a <cond_broadcast> or a <cond_signal> */
+  int release_count_;
+
+  /* Keeps track of the current "generation" so that we don't allow */
+  /* one thread to steal all the "releases" from the broadcast. */
+  int wait_generation_count_;
+
+  /* A manual-reset event that's used to block and release waiting threads. */
+  HANDLE event_;
 } kmp_win32_cond_t;
 #endif
 
 #if KMP_OS_UNIX
 
 union KMP_ALIGN_CACHE kmp_cond_union {
-    double              c_align;
-    char                c_pad[ CACHE_LINE ];
-    pthread_cond_t      c_cond;
+  double c_align;
+  char c_pad[CACHE_LINE];
+  pthread_cond_t c_cond;
 };
 
 typedef union kmp_cond_union kmp_cond_align_t;
 
 union KMP_ALIGN_CACHE kmp_mutex_union {
-    double              m_align;
-    char                m_pad[ CACHE_LINE ];
-    pthread_mutex_t     m_mutex;
+  double m_align;
+  char m_pad[CACHE_LINE];
+  pthread_mutex_t m_mutex;
 };
 
 typedef union kmp_mutex_union kmp_mutex_align_t;
@@ -1816,145 +1908,159 @@ typedef union kmp_mutex_union kmp_mutex_
 #endif /* KMP_OS_UNIX */
 
 typedef struct kmp_desc_base {
-    void    *ds_stackbase;
-    size_t            ds_stacksize;
-    int               ds_stackgrow;
-    kmp_thread_t      ds_thread;
-    volatile int      ds_tid;
-    int               ds_gtid;
+  void *ds_stackbase;
+  size_t ds_stacksize;
+  int ds_stackgrow;
+  kmp_thread_t ds_thread;
+  volatile int ds_tid;
+  int ds_gtid;
 #if KMP_OS_WINDOWS
-    volatile int      ds_alive;
-    DWORD             ds_thread_id;
-        /*
-            ds_thread keeps thread handle on Windows* OS. It is enough for RTL purposes. However,
-            debugger support (libomp_db) cannot work with handles, because they uncomparable. For
-            example, debugger requests info about thread with handle h. h is valid within debugger
-            process, and meaningless within debugee process. Even if h is duped by call to
-            DuplicateHandle(), so the result h' is valid within debugee process, but it is a *new*
-            handle which does *not* equal to any other handle in debugee... The only way to
-            compare handles is convert them to system-wide ids. GetThreadId() function is
-            available only in Longhorn and Server 2003. :-( In contrast, GetCurrentThreadId() is
-            available on all Windows* OS flavours (including Windows* 95). Thus, we have to get thread id by
-            call to GetCurrentThreadId() from within the thread and save it to let libomp_db
-            identify threads.
-        */
+  volatile int ds_alive;
+  DWORD ds_thread_id;
+/* ds_thread keeps the thread handle on Windows* OS. It is enough for RTL
+   purposes. However, debugger support (libomp_db) cannot work with handles,
+   because they are not comparable. For example, the debugger requests info
+   about a thread with handle h. h is valid within the debugger process, and
+   meaningless within the debuggee process. Even if h is duplicated by a call
+   to DuplicateHandle() so that the result h' is valid within the debuggee
+   process, it is a *new* handle which does *not* equal any other handle in
+   the debuggee... The only way to compare handles is to convert them to
+   system-wide ids. The GetThreadId() function is available only in Longhorn
+   and Server 2003. :-( In contrast, GetCurrentThreadId() is available on all
+   Windows* OS flavours (including Windows* 95). Thus, we have to get the
+   thread id by a call to GetCurrentThreadId() from within the thread and save
+   it to let libomp_db identify threads.  */
 #endif /* KMP_OS_WINDOWS */
 } kmp_desc_base_t;
 
 typedef union KMP_ALIGN_CACHE kmp_desc {
-    double           ds_align;        /* use worst case alignment */
-    char             ds_pad[ KMP_PAD(kmp_desc_base_t, CACHE_LINE) ];
-    kmp_desc_base_t  ds;
+  double ds_align; /* use worst case alignment */
+  char ds_pad[KMP_PAD(kmp_desc_base_t, CACHE_LINE)];
+  kmp_desc_base_t ds;
 } kmp_desc_t;
 
-
 typedef struct kmp_local {
-    volatile int           this_construct; /* count of single's encountered by thread */
-    void                  *reduce_data;
+  volatile int this_construct; /* count of single's encountered by thread */
+  void *reduce_data;
 #if KMP_USE_BGET
-    void                  *bget_data;
-    void                  *bget_list;
-#if ! USE_CMP_XCHG_FOR_BGET
+  void *bget_data;
+  void *bget_list;
+#if !USE_CMP_XCHG_FOR_BGET
 #ifdef USE_QUEUING_LOCK_FOR_BGET
-    kmp_lock_t             bget_lock;      /* Lock for accessing bget free list */
+  kmp_lock_t bget_lock; /* Lock for accessing bget free list */
 #else
-    kmp_bootstrap_lock_t   bget_lock;      /* Lock for accessing bget free list */
-                                           /* Must be bootstrap lock so we can use it at library shutdown */
+  kmp_bootstrap_lock_t bget_lock; // Lock for accessing bget free list. Must be
+                                  // bootstrap lock so we can use it at library
+                                  // shutdown.
 #endif /* USE_LOCK_FOR_BGET */
 #endif /* ! USE_CMP_XCHG_FOR_BGET */
 #endif /* KMP_USE_BGET */
 
 #ifdef BUILD_TV
-    struct tv_data        *tv_data;
+  struct tv_data *tv_data;
 #endif
 
-    PACKED_REDUCTION_METHOD_T packed_reduction_method; /* stored by __kmpc_reduce*(), used by __kmpc_end_reduce*() */
+  PACKED_REDUCTION_METHOD_T
+  packed_reduction_method; /* stored by __kmpc_reduce*(), used by
+                              __kmpc_end_reduce*() */
 
 } kmp_local_t;
 
-#define KMP_CHECK_UPDATE(a, b) if ((a) != (b)) (a) = (b)
-#define KMP_CHECK_UPDATE_SYNC(a, b) if ((a) != (b)) TCW_SYNC_PTR((a), (b))
-
-#define get__blocktime( xteam, xtid )     ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime)
-#define get__bt_set( xteam, xtid )        ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set)
+#define KMP_CHECK_UPDATE(a, b)                                                 \
+  if ((a) != (b))                                                              \
+  (a) = (b)
+#define KMP_CHECK_UPDATE_SYNC(a, b)                                            \
+  if ((a) != (b))                                                              \
+  TCW_SYNC_PTR((a), (b))
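
Small standalone illustration, not from the patch, of the two macros above:
they only issue a store when the value actually changes (presumably to avoid
dirtying shared cache lines with redundant writes), and because each expands
to a bare if statement they are brace-sensitive at the call site (an else
placed right after the macro would bind to the macro's own if).

  #include <stdio.h>

  #define KMP_CHECK_UPDATE(a, b)                                               \
    if ((a) != (b))                                                            \
    (a) = (b)

  int main(void) {
    int cached = 1, latest = 2;
    KMP_CHECK_UPDATE(cached, latest); /* values differ, so cached is written */
    printf("cached=%d\n", cached);    /* prints 2 */
    KMP_CHECK_UPDATE(cached, latest); /* values equal now, so no store occurs */
    return 0;
  }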
+
+#define get__blocktime(xteam, xtid)                                            \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime)
+#define get__bt_set(xteam, xtid)                                               \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set)
 #if KMP_USE_MONITOR
-#define get__bt_intervals( xteam, xtid )  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals)
+#define get__bt_intervals(xteam, xtid)                                         \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals)
 #endif
 
-#define get__nested_2(xteam,xtid)         ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested)
-#define get__dynamic_2(xteam,xtid)        ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic)
-#define get__nproc_2(xteam,xtid)          ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc)
-#define get__sched_2(xteam,xtid)          ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched)
-
-#define set__blocktime_team( xteam, xtid, xval ) \
-        ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime )    = (xval) )
+#define get__nested_2(xteam, xtid)                                             \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested)
+#define get__dynamic_2(xteam, xtid)                                            \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic)
+#define get__nproc_2(xteam, xtid)                                              \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc)
+#define get__sched_2(xteam, xtid)                                              \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched)
+
+#define set__blocktime_team(xteam, xtid, xval)                                 \
+  (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) =     \
+       (xval))
 
 #if KMP_USE_MONITOR
-#define set__bt_intervals_team( xteam, xtid, xval ) \
-        ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals ) = (xval) )
+#define set__bt_intervals_team(xteam, xtid, xval)                              \
+  (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) =  \
+       (xval))
 #endif
 
-#define set__bt_set_team( xteam, xtid, xval ) \
-        ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set )       = (xval) )
-
+#define set__bt_set_team(xteam, xtid, xval)                                    \
+  (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) = (xval))
 
-#define set__nested( xthread, xval )                            \
-        ( ( (xthread)->th.th_current_task->td_icvs.nested ) = (xval) )
-#define get__nested( xthread ) \
-        ( ( (xthread)->th.th_current_task->td_icvs.nested ) ? (FTN_TRUE) : (FTN_FALSE) )
+#define set__nested(xthread, xval)                                             \
+  (((xthread)->th.th_current_task->td_icvs.nested) = (xval))
+#define get__nested(xthread)                                                   \
+  (((xthread)->th.th_current_task->td_icvs.nested) ? (FTN_TRUE) : (FTN_FALSE))
 
-#define set__dynamic( xthread, xval )                            \
-        ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) = (xval) )
-#define get__dynamic( xthread ) \
-        ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) ? (FTN_TRUE) : (FTN_FALSE) )
+#define set__dynamic(xthread, xval)                                            \
+  (((xthread)->th.th_current_task->td_icvs.dynamic) = (xval))
+#define get__dynamic(xthread)                                                  \
+  (((xthread)->th.th_current_task->td_icvs.dynamic) ? (FTN_TRUE) : (FTN_FALSE))
 
-#define set__nproc( xthread, xval )                            \
-        ( ( (xthread)->th.th_current_task->td_icvs.nproc ) = (xval) )
+#define set__nproc(xthread, xval)                                              \
+  (((xthread)->th.th_current_task->td_icvs.nproc) = (xval))
 
-#define set__max_active_levels( xthread, xval )                            \
-        ( ( (xthread)->th.th_current_task->td_icvs.max_active_levels ) = (xval) )
+#define set__max_active_levels(xthread, xval)                                  \
+  (((xthread)->th.th_current_task->td_icvs.max_active_levels) = (xval))
 
-#define set__sched( xthread, xval )                            \
-        ( ( (xthread)->th.th_current_task->td_icvs.sched ) = (xval) )
+#define set__sched(xthread, xval)                                              \
+  (((xthread)->th.th_current_task->td_icvs.sched) = (xval))
 
 #if OMP_40_ENABLED
 
-#define set__proc_bind( xthread, xval )                          \
-        ( ( (xthread)->th.th_current_task->td_icvs.proc_bind ) = (xval) )
-#define get__proc_bind( xthread ) \
-        ( (xthread)->th.th_current_task->td_icvs.proc_bind )
+#define set__proc_bind(xthread, xval)                                          \
+  (((xthread)->th.th_current_task->td_icvs.proc_bind) = (xval))
+#define get__proc_bind(xthread)                                                \
+  ((xthread)->th.th_current_task->td_icvs.proc_bind)
 
 #endif /* OMP_40_ENABLED */
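
The ICV accessor macros above all walk the same chain: a thread's current task
carries its own copy of the internal control variables, so reads and writes go
through th.th_current_task->td_icvs. A minimal, self-contained sketch of that
pattern, using simplified stand-in types (the demo_* names are illustrative
only, not the real kmp_info_t/kmp_team_t):

  // Illustrative stand-ins for the ICV access pattern used by the macros above.
  // These are NOT the real runtime types; they only mirror the field chain
  // thread -> th.th_current_task -> td_icvs.<field> the macros expand to.
  #include <cstdio>

  struct icvs_t { int nproc; int dynamic; int nested; };
  struct taskdata_t { icvs_t td_icvs; };
  struct th_t { taskdata_t *th_current_task; };
  struct thread_t { th_t th; };

  #define demo_set__nproc(xthread, xval) \
    (((xthread)->th.th_current_task->td_icvs.nproc) = (xval))
  #define demo_get__dynamic(xthread) \
    (((xthread)->th.th_current_task->td_icvs.dynamic) ? 1 : 0)

  int main() {
    taskdata_t task = {};
    thread_t thread = {};
    thread.th.th_current_task = &task;
    demo_set__nproc(&thread, 4); // writes the task's ICV copy, not a global
    std::printf("nproc=%d dynamic=%d\n", task.td_icvs.nproc,
                demo_get__dynamic(&thread));
    return 0;
  }
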
 
-
-/* ------------------------------------------------------------------------ */
 // OpenMP tasking data structures
-//
 
 typedef enum kmp_tasking_mode {
-    tskm_immediate_exec = 0,
-    tskm_extra_barrier = 1,
-    tskm_task_teams = 2,
-    tskm_max = 2
+  tskm_immediate_exec = 0,
+  tskm_extra_barrier = 1,
+  tskm_task_teams = 2,
+  tskm_max = 2
 } kmp_tasking_mode_t;
 
-extern kmp_tasking_mode_t __kmp_tasking_mode;         /* determines how/when to execute tasks */
+extern kmp_tasking_mode_t
+    __kmp_tasking_mode; /* determines how/when to execute tasks */
 extern kmp_int32 __kmp_task_stealing_constraint;
 #if OMP_40_ENABLED
-    extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if specified, defaults to 0 otherwise
+extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if
+// specified, defaults to 0 otherwise
 #endif
 #if OMP_45_ENABLED
-    extern kmp_int32 __kmp_max_task_priority; // Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise
+extern kmp_int32 __kmp_max_task_priority; // Set via OMP_MAX_TASK_PRIORITY if
+// specified, defaults to 0 otherwise
 #endif
 
-/* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with taskdata first */
-#define KMP_TASK_TO_TASKDATA(task)     (((kmp_taskdata_t *) task) - 1)
-#define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *) (taskdata + 1)
-
-// The tt_found_tasks flag is a signal to all threads in the team that tasks were spawned and
-// queued since the previous barrier release.
-#define KMP_TASKING_ENABLED(task_team) \
-    (TCR_SYNC_4((task_team)->tt.tt_found_tasks) == TRUE)
+/* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with
+   taskdata first */
+#define KMP_TASK_TO_TASKDATA(task) (((kmp_taskdata_t *)task) - 1)
+#define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1)
+
+// The tt_found_tasks flag is a signal to all threads in the team that tasks
+// were spawned and queued since the previous barrier release.
+#define KMP_TASKING_ENABLED(task_team)                                         \
+  (TCR_SYNC_4((task_team)->tt.tt_found_tasks) == TRUE)
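
As the note says, kmp_taskdata_t and kmp_task_t live in one allocation with the
taskdata first, which is why KMP_TASK_TO_TASKDATA and KMP_TASKDATA_TO_TASK are
plain pointer arithmetic. A hedged sketch of that layout with simplified
stand-in structs (the DEMO_* names are illustrative only):

  #include <cassert>
  #include <cstdlib>

  struct demo_taskdata_t { void *td_parent; }; // stand-in header; kept
                                               // pointer-sized so the task that
                                               // follows stays aligned
  struct demo_task_t { void *shareds; };       // stand-in user-visible task

  // Same arithmetic as KMP_TASK_TO_TASKDATA / KMP_TASKDATA_TO_TASK:
  // the header sits directly in front of the task in one allocation.
  #define DEMO_TASK_TO_TASKDATA(task) (((demo_taskdata_t *)(task)) - 1)
  #define DEMO_TASKDATA_TO_TASK(td) ((demo_task_t *)((td) + 1))

  int main() {
    void *block = std::malloc(sizeof(demo_taskdata_t) + sizeof(demo_task_t));
    demo_taskdata_t *td = (demo_taskdata_t *)block; // taskdata first
    demo_task_t *task = DEMO_TASKDATA_TO_TASK(td);  // task right after it
    assert(DEMO_TASK_TO_TASKDATA(task) == td);      // round-trips by design
    std::free(block);
    return 0;
  }
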
 /*!
 @ingroup BASIC_TYPES
 @{
@@ -1962,33 +2068,37 @@ extern kmp_int32 __kmp_task_stealing_con
 
 /*!
  */
-typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, void * );
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, void *);
 
 #if OMP_40_ENABLED || OMP_45_ENABLED
 typedef union kmp_cmplrdata {
 #if OMP_45_ENABLED
-    kmp_int32           priority;           /**< priority specified by user for the task */
+  kmp_int32 priority; /**< priority specified by user for the task */
 #endif // OMP_45_ENABLED
 #if OMP_40_ENABLED
-    kmp_routine_entry_t destructors;        /* pointer to function to invoke deconstructors of firstprivate C++ objects */
+  kmp_routine_entry_t
+      destructors; /* pointer to function to invoke destructors of
+                      firstprivate C++ objects */
 #endif // OMP_40_ENABLED
-    /* future data */
+  /* future data */
 } kmp_cmplrdata_t;
 #endif
 
 /*  sizeof_kmp_task_t passed as arg to kmpc_omp_task call  */
 /*!
  */
-typedef struct kmp_task {                   /* GEH: Shouldn't this be aligned somehow? */
-    void *              shareds;            /**< pointer to block of pointers to shared vars   */
-    kmp_routine_entry_t routine;            /**< pointer to routine to call for executing task */
-    kmp_int32           part_id;            /**< part id for the task                          */
+typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */
+  void *shareds; /**< pointer to block of pointers to shared vars   */
+  kmp_routine_entry_t
+      routine; /**< pointer to routine to call for executing task */
+  kmp_int32 part_id; /**< part id for the task                          */
 #if OMP_40_ENABLED || OMP_45_ENABLED
-    kmp_cmplrdata_t data1;                  /* Two known optional additions: destructors and priority */
-    kmp_cmplrdata_t data2;                  /* Process destructors first, priority second */
-    /* future data */
+  kmp_cmplrdata_t
+      data1; /* Two known optional additions: destructors and priority */
+  kmp_cmplrdata_t data2; /* Process destructors first, priority second */
+/* future data */
 #endif
-    /*  private vars  */
+  /*  private vars  */
 } kmp_task_t;
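
For orientation, a hedged sketch of how an outlined task body typically
consumes this layout: the entry routine gets a gtid and a task pointer and
reaches shared variables through the shareds block. The demo_* types below only
mirror the first three kmp_task_t fields; they are not the runtime's real
interface.

  #include <cstdio>

  typedef int demo_int32;
  typedef demo_int32 (*demo_routine_entry_t)(demo_int32, void *);

  struct demo_task_t {            // mirrors the first three kmp_task_t fields
    void *shareds;                // block of pointers to shared variables
    demo_routine_entry_t routine; // outlined task body to invoke
    demo_int32 part_id;
  };

  struct demo_shareds { int *counter; }; // what this outlined body expects

  // Outlined task body: the compiler would generate something of this shape.
  static demo_int32 demo_task_entry(demo_int32 gtid, void *task) {
    demo_task_t *t = (demo_task_t *)task;
    demo_shareds *sh = (demo_shareds *)t->shareds;
    *sh->counter += 1;            // touch a shared variable through shareds
    std::printf("task part %d ran on gtid %d\n", t->part_id, gtid);
    return 0;
  }

  int main() {
    int counter = 0;
    demo_shareds sh = {&counter};
    demo_task_t task = {&sh, demo_task_entry, 0};
    task.routine(/*gtid=*/0, &task); // the runtime invokes it like this
    std::printf("counter=%d\n", counter);
    return 0;
  }
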
 
 /*!
@@ -1997,69 +2107,69 @@ typedef struct kmp_task {
 
 #if OMP_40_ENABLED
 typedef struct kmp_taskgroup {
-    kmp_uint32            count;   // number of allocated and not yet complete tasks
-    kmp_int32             cancel_request; // request for cancellation of this taskgroup
-    struct kmp_taskgroup *parent;  // parent taskgroup
+  kmp_uint32 count; // number of allocated and not yet complete tasks
+  kmp_int32 cancel_request; // request for cancellation of this taskgroup
+  struct kmp_taskgroup *parent; // parent taskgroup
 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
 #if OMP_45_ENABLED
-    // Block of data to perform task reduction
-    void                 *reduce_data; // reduction related info
-    kmp_int32             reduce_num_data; // number of data items to reduce
+  // Block of data to perform task reduction
+  void *reduce_data; // reduction related info
+  kmp_int32 reduce_num_data; // number of data items to reduce
 #endif
 } kmp_taskgroup_t;
 
 // forward declarations
-typedef union kmp_depnode       kmp_depnode_t;
-typedef struct kmp_depnode_list  kmp_depnode_list_t;
+typedef union kmp_depnode kmp_depnode_t;
+typedef struct kmp_depnode_list kmp_depnode_list_t;
 typedef struct kmp_dephash_entry kmp_dephash_entry_t;
 
 typedef struct kmp_depend_info {
-     kmp_intptr_t               base_addr;
-     size_t                     len;
-     struct {
-         bool                   in:1;
-         bool                   out:1;
-     } flags;
+  kmp_intptr_t base_addr;
+  size_t len;
+  struct {
+    bool in : 1;
+    bool out : 1;
+  } flags;
 } kmp_depend_info_t;
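
A hedged sketch of filling in a dependence descriptor of this shape (an inout
dependence sets both flag bits); the demo_depend_info mirror below is
illustrative, not the runtime's actual entry point for task dependences:

  #include <cstdint>
  #include <cstdio>

  struct demo_depend_info {   // mirrors kmp_depend_info_t
    intptr_t base_addr;       // address identifying the dependence object
    size_t len;               // length of the storage the dependence covers
    struct {
      bool in : 1;            // depend(in:...) contributes this bit
      bool out : 1;           // depend(out:...) / depend(inout:...) set this bit
    } flags;
  };

  int main() {
    int x = 0;
    demo_depend_info dep = {};
    dep.base_addr = (intptr_t)&x; // dependences are keyed by address
    dep.len = sizeof(x);
    dep.flags.in = true;          // inout: both in and out
    dep.flags.out = true;
    std::printf("addr=%p len=%zu in=%d out=%d\n", (void *)dep.base_addr,
                dep.len, (int)dep.flags.in, (int)dep.flags.out);
    return 0;
  }
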
 
 struct kmp_depnode_list {
-   kmp_depnode_t *              node;
-   kmp_depnode_list_t *         next;
+  kmp_depnode_t *node;
+  kmp_depnode_list_t *next;
 };
 
 typedef struct kmp_base_depnode {
-    kmp_depnode_list_t        * successors;
-    kmp_task_t                * task;
+  kmp_depnode_list_t *successors;
+  kmp_task_t *task;
 
-    kmp_lock_t                  lock;
+  kmp_lock_t lock;
 
 #if KMP_SUPPORT_GRAPH_OUTPUT
-    kmp_uint32                  id;
+  kmp_uint32 id;
 #endif
 
-    volatile kmp_int32          npredecessors;
-    volatile kmp_int32          nrefs;
+  volatile kmp_int32 npredecessors;
+  volatile kmp_int32 nrefs;
 } kmp_base_depnode_t;
 
 union KMP_ALIGN_CACHE kmp_depnode {
-    double          dn_align;        /* use worst case alignment */
-    char            dn_pad[ KMP_PAD(kmp_base_depnode_t, CACHE_LINE) ];
-    kmp_base_depnode_t dn;
+  double dn_align; /* use worst case alignment */
+  char dn_pad[KMP_PAD(kmp_base_depnode_t, CACHE_LINE)];
+  kmp_base_depnode_t dn;
 };
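
The union above is the header's recurring cache-line padding idiom: a double
member forces worst-case alignment and a char array sized by KMP_PAD rounds the
whole union up to a cache-line multiple. A standalone sketch of the same trick;
note that DEMO_PAD below is a plausible reconstruction of the helper, not the
macro's actual definition from the runtime headers:

  #include <cstdio>

  // Plausible reconstruction of the padding helper (assumption; the real macro
  // lives in the runtime's OS header): round sizeof(type) up to a multiple of sz.
  #define DEMO_PAD(type, sz) \
    (sizeof(type) + ((sz) - ((sizeof(type) - 1) % (sz)) - 1))

  #define DEMO_CACHE_LINE 64

  struct demo_base { void *a; int b; }; // stand-in payload

  union demo_padded {                   // same shape as kmp_depnode etc.
    double dn_align;                    // force worst-case alignment
    char dn_pad[DEMO_PAD(demo_base, DEMO_CACHE_LINE)];
    demo_base dn;                       // the actual data
  };

  int main() {
    std::printf("payload=%zu padded=%zu\n", sizeof(demo_base),
                sizeof(demo_padded));   // padded size is a cache-line multiple
    return 0;
  }
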
 
 struct kmp_dephash_entry {
-    kmp_intptr_t               addr;
-    kmp_depnode_t            * last_out;
-    kmp_depnode_list_t       * last_ins;
-    kmp_dephash_entry_t      * next_in_bucket;
+  kmp_intptr_t addr;
+  kmp_depnode_t *last_out;
+  kmp_depnode_list_t *last_ins;
+  kmp_dephash_entry_t *next_in_bucket;
 };
 
 typedef struct kmp_dephash {
-   kmp_dephash_entry_t     ** buckets;
-   size_t		      size;
+  kmp_dephash_entry_t **buckets;
+  size_t size;
 #ifdef KMP_DEBUG
-   kmp_uint32                 nelements;
-   kmp_uint32                 nconflicts;
+  kmp_uint32 nelements;
+  kmp_uint32 nconflicts;
 #endif
 } kmp_dephash_t;
 
@@ -2069,556 +2179,583 @@ typedef struct kmp_dephash {
 
 /* Tied Task stack definitions */
 typedef struct kmp_stack_block {
-    kmp_taskdata_t *          sb_block[ TASK_STACK_BLOCK_SIZE ];
-    struct kmp_stack_block *  sb_next;
-    struct kmp_stack_block *  sb_prev;
+  kmp_taskdata_t *sb_block[TASK_STACK_BLOCK_SIZE];
+  struct kmp_stack_block *sb_next;
+  struct kmp_stack_block *sb_prev;
 } kmp_stack_block_t;
 
 typedef struct kmp_task_stack {
-    kmp_stack_block_t         ts_first_block;  // first block of stack entries
-    kmp_taskdata_t **         ts_top;          // pointer to the top of stack
-    kmp_int32                 ts_entries;      // number of entries on the stack
+  kmp_stack_block_t ts_first_block; // first block of stack entries
+  kmp_taskdata_t **ts_top; // pointer to the top of stack
+  kmp_int32 ts_entries; // number of entries on the stack
 } kmp_task_stack_t;
 
 #endif // BUILD_TIED_TASK_STACK
 
-typedef struct kmp_tasking_flags {          /* Total struct must be exactly 32 bits */
-    /* Compiler flags */                    /* Total compiler flags must be 16 bits */
-    unsigned tiedness    : 1;               /* task is either tied (1) or untied (0) */
-    unsigned final       : 1;               /* task is final(1) so execute immediately */
-    unsigned merged_if0  : 1;               /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+  unsigned final : 1; /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+                              code path */
 #if OMP_40_ENABLED
-    unsigned destructors_thunk : 1;         /* set if the compiler creates a thunk to invoke destructors from the runtime */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+                                     invoke destructors from the runtime */
 #if OMP_45_ENABLED
-    unsigned proxy       : 1;               /* task is a proxy task (it will be executed outside the context of the RTL) */
-    unsigned priority_specified :1;         /* set if the compiler provides priority setting for the task */
-    unsigned reserved    : 10;              /* reserved for compiler use */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+                         context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+                                      setting for the task */
+  unsigned reserved : 10; /* reserved for compiler use */
 #else
-    unsigned reserved    : 12;              /* reserved for compiler use */
+  unsigned reserved : 12; /* reserved for compiler use */
 #endif
 #else // OMP_40_ENABLED
-    unsigned reserved    : 13;              /* reserved for compiler use */
+  unsigned reserved : 13; /* reserved for compiler use */
 #endif // OMP_40_ENABLED
 
-    /* Library flags */                     /* Total library flags must be 16 bits */
-    unsigned tasktype    : 1;               /* task is either explicit(1) or implicit (0) */
-    unsigned task_serial : 1;               /* this task is executed immediately (1) or deferred (0) */
-    unsigned tasking_ser : 1;               /* all tasks in team are either executed immediately (1) or may be deferred (0) */
-    unsigned team_serial : 1;               /* entire team is serial (1) [1 thread] or parallel (0) [>= 2 threads] */
-                                            /* If either team_serial or tasking_ser is set, task team may be NULL */
-    /* Task State Flags: */
-    unsigned started     : 1;               /* 1==started, 0==not started     */
-    unsigned executing   : 1;               /* 1==executing, 0==not executing */
-    unsigned complete    : 1;               /* 1==complete, 0==not complete   */
-    unsigned freed       : 1;               /* 1==freed, 0==allocateed        */
-    unsigned native      : 1;               /* 1==gcc-compiled task, 0==intel */
-    unsigned reserved31  : 7;               /* reserved for library use */
+  /* Library flags */ /* Total library flags must be 16 bits */
+  unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1; /* 1==started, 0==not started     */
+  unsigned executing : 1; /* 1==executing, 0==not executing */
+  unsigned complete : 1; /* 1==complete, 0==not complete   */
+  unsigned freed : 1; /* 1==freed, 0==allocated         */
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
 
 } kmp_tasking_flags_t;
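
The layout above is budgeted to exactly 32 bits (16 compiler bits plus 16
library bits). A small compile-time check in the same spirit as the
KMP_BUILD_ASSERT used elsewhere in this header, written against a simplified
stand-in that mirrors the OMP_40-disabled configuration:

  #include <cstdint>

  struct demo_tasking_flags {       // simplified stand-in, same budget: 32 bits
    unsigned tiedness : 1;
    unsigned final : 1;
    unsigned merged_if0 : 1;
    unsigned reserved_compiler : 13; // compiler half must total 16 bits
    unsigned tasktype : 1;
    unsigned task_serial : 1;
    unsigned tasking_ser : 1;
    unsigned team_serial : 1;
    unsigned started : 1;
    unsigned executing : 1;
    unsigned complete : 1;
    unsigned freed : 1;
    unsigned native : 1;
    unsigned reserved_library : 7;   // library half must total 16 bits
  };

  // If a new flag is added without shrinking a reserved field, this fires.
  static_assert(sizeof(demo_tasking_flags) == sizeof(uint32_t),
                "tasking flags must stay exactly 32 bits");

  int main() { return 0; }
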
 
-
-struct kmp_taskdata {                                 /* aligned during dynamic allocation       */
-    kmp_int32               td_task_id;               /* id, assigned by debugger                */
-    kmp_tasking_flags_t     td_flags;                 /* task flags                              */
-    kmp_team_t *            td_team;                  /* team for this task                      */
-    kmp_info_p *            td_alloc_thread;          /* thread that allocated data structures   */
-                                                      /* Currently not used except for perhaps IDB */
-    kmp_taskdata_t *        td_parent;                /* parent task                             */
-    kmp_int32               td_level;                 /* task nesting level                      */
-    kmp_int32               td_untied_count;          /* untied task active parts counter        */
-    ident_t *               td_ident;                 /* task identifier                         */
-                            // Taskwait data.
-    ident_t *               td_taskwait_ident;
-    kmp_uint32              td_taskwait_counter;
-    kmp_int32               td_taskwait_thread;       /* gtid + 1 of thread encountered taskwait */
-    KMP_ALIGN_CACHE kmp_internal_control_t  td_icvs;  /* Internal control variables for the task */
-    KMP_ALIGN_CACHE volatile kmp_uint32 td_allocated_child_tasks;  /* Child tasks (+ current task) not yet deallocated */
-    volatile kmp_uint32     td_incomplete_child_tasks; /* Child tasks not yet complete */
-#if OMP_40_ENABLED
-    kmp_taskgroup_t *       td_taskgroup;         // Each task keeps pointer to its current taskgroup
-    kmp_dephash_t *         td_dephash;           // Dependencies for children tasks are tracked from here
-    kmp_depnode_t *         td_depnode;           // Pointer to graph node if this task has dependencies
+struct kmp_taskdata { /* aligned during dynamic allocation       */
+  kmp_int32 td_task_id; /* id, assigned by debugger                */
+  kmp_tasking_flags_t td_flags; /* task flags                              */
+  kmp_team_t *td_team; /* team for this task                      */
+  kmp_info_p *td_alloc_thread; /* thread that allocated data structures   */
+  /* Currently not used except for perhaps IDB */
+  kmp_taskdata_t *td_parent; /* parent task                             */
+  kmp_int32 td_level; /* task nesting level                      */
+  kmp_int32 td_untied_count; /* untied task active parts counter        */
+  ident_t *td_ident; /* task identifier                         */
+  // Taskwait data.
+  ident_t *td_taskwait_ident;
+  kmp_uint32 td_taskwait_counter;
+  kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */
+  KMP_ALIGN_CACHE kmp_internal_control_t
+      td_icvs; /* Internal control variables for the task */
+  KMP_ALIGN_CACHE volatile kmp_uint32
+      td_allocated_child_tasks; /* Child tasks (+ current task) not yet
+                                   deallocated */
+  volatile kmp_uint32
+      td_incomplete_child_tasks; /* Child tasks not yet complete */
+#if OMP_40_ENABLED
+  kmp_taskgroup_t
+      *td_taskgroup; // Each task keeps pointer to its current taskgroup
+  kmp_dephash_t
+      *td_dephash; // Dependencies for children tasks are tracked from here
+  kmp_depnode_t
+      *td_depnode; // Pointer to graph node if this task has dependencies
 #endif
 #if OMPT_SUPPORT
-    ompt_task_info_t        ompt_task_info;
+  ompt_task_info_t ompt_task_info;
 #endif
 #if OMP_45_ENABLED
-    kmp_task_team_t *       td_task_team;
-    kmp_int32               td_size_alloc;        // The size of task structure, including shareds etc.
+  kmp_task_team_t *td_task_team;
+  kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
 #endif
 }; // struct kmp_taskdata
 
 // Make sure padding above worked
-KMP_BUILD_ASSERT( sizeof(kmp_taskdata_t) % sizeof(void *) == 0 );
+KMP_BUILD_ASSERT(sizeof(kmp_taskdata_t) % sizeof(void *) == 0);
 
 // Data for task team but per thread
 typedef struct kmp_base_thread_data {
-    kmp_info_p *            td_thr;                // Pointer back to thread info
-                                                   // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued?
-    kmp_bootstrap_lock_t    td_deque_lock;         // Lock for accessing deque
-    kmp_taskdata_t **       td_deque;              // Deque of tasks encountered by td_thr, dynamically allocated
-    kmp_int32               td_deque_size;         // Size of deck
-    kmp_uint32              td_deque_head;         // Head of deque (will wrap)
-    kmp_uint32              td_deque_tail;         // Tail of deque (will wrap)
-    kmp_int32               td_deque_ntasks;       // Number of tasks in deque
-                                                   // GEH: shouldn't this be volatile since used in while-spin?
-    kmp_int32               td_deque_last_stolen;  // Thread number of last successful steal
+  kmp_info_p *td_thr; // Pointer back to thread info
+  // Used only in __kmp_execute_tasks_template, maybe not avail until task is
+  // queued?
+  kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque
+  kmp_taskdata_t *
+      *td_deque; // Deque of tasks encountered by td_thr, dynamically allocated
+  kmp_int32 td_deque_size; // Size of deque
+  kmp_uint32 td_deque_head; // Head of deque (will wrap)
+  kmp_uint32 td_deque_tail; // Tail of deque (will wrap)
+  kmp_int32 td_deque_ntasks; // Number of tasks in deque
+  // GEH: shouldn't this be volatile since used in while-spin?
+  kmp_int32 td_deque_last_stolen; // Thread number of last successful steal
 #ifdef BUILD_TIED_TASK_STACK
-    kmp_task_stack_t        td_susp_tied_tasks;    // Stack of suspended tied tasks for task scheduling constraint
+  kmp_task_stack_t td_susp_tied_tasks; // Stack of suspended tied tasks for task
+// scheduling constraint
 #endif // BUILD_TIED_TASK_STACK
 } kmp_base_thread_data_t;
 
-#define TASK_DEQUE_BITS          8  // Used solely to define INITIAL_TASK_DEQUE_SIZE
-#define INITIAL_TASK_DEQUE_SIZE  ( 1 << TASK_DEQUE_BITS )
+#define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE
+#define INITIAL_TASK_DEQUE_SIZE (1 << TASK_DEQUE_BITS)
 
-#define TASK_DEQUE_SIZE(td)     ((td).td_deque_size)
-#define TASK_DEQUE_MASK(td)     ((td).td_deque_size - 1)
+#define TASK_DEQUE_SIZE(td) ((td).td_deque_size)
+#define TASK_DEQUE_MASK(td) ((td).td_deque_size - 1)
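
Because the deque size starts as a power of two (INITIAL_TASK_DEQUE_SIZE is
1 << TASK_DEQUE_BITS), head and tail indices can wrap with TASK_DEQUE_MASK
instead of a modulo. A minimal sketch of that indexing with local stand-in
variables:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t deque_size = 1u << 8;        // like INITIAL_TASK_DEQUE_SIZE
    const uint32_t deque_mask = deque_size - 1; // like TASK_DEQUE_MASK(td)

    uint32_t head = deque_size - 2; // indices are allowed to run past the size
    for (int i = 0; i < 4; ++i) {
      uint32_t slot = head & deque_mask; // wraps: 254, 255, 0, 1
      std::printf("head=%u -> slot=%u\n", head, slot);
      ++head;
    }
    return 0;
  }
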
 
 typedef union KMP_ALIGN_CACHE kmp_thread_data {
-    kmp_base_thread_data_t  td;
-    double                  td_align;       /* use worst case alignment */
-    char                    td_pad[ KMP_PAD(kmp_base_thread_data_t, CACHE_LINE) ];
+  kmp_base_thread_data_t td;
+  double td_align; /* use worst case alignment */
+  char td_pad[KMP_PAD(kmp_base_thread_data_t, CACHE_LINE)];
 } kmp_thread_data_t;
 
-
 // Data for task teams which are used when tasking is enabled for the team
 typedef struct kmp_base_task_team {
-    kmp_bootstrap_lock_t    tt_threads_lock;       /* Lock used to allocate per-thread part of task team */
-                                                   /* must be bootstrap lock since used at library shutdown*/
-    kmp_task_team_t *       tt_next;               /* For linking the task team free list */
-    kmp_thread_data_t *     tt_threads_data;       /* Array of per-thread structures for task team */
-                                                   /* Data survives task team deallocation */
-    kmp_int32               tt_found_tasks;        /* Have we found tasks and queued them while executing this team? */
-                                                   /* TRUE means tt_threads_data is set up and initialized */
-    kmp_int32               tt_nproc;              /* #threads in team           */
-    kmp_int32               tt_max_threads;        /* number of entries allocated for threads_data array */
+  kmp_bootstrap_lock_t
+      tt_threads_lock; /* Lock used to allocate per-thread part of task team */
+  /* must be bootstrap lock since used at library shutdown*/
+  kmp_task_team_t *tt_next; /* For linking the task team free list */
+  kmp_thread_data_t
+      *tt_threads_data; /* Array of per-thread structures for task team */
+  /* Data survives task team deallocation */
+  kmp_int32 tt_found_tasks; /* Have we found tasks and queued them while
+                               executing this team? */
+  /* TRUE means tt_threads_data is set up and initialized */
+  kmp_int32 tt_nproc; /* #threads in team           */
+  kmp_int32
+      tt_max_threads; /* number of entries allocated for threads_data array */
 #if OMP_45_ENABLED
-    kmp_int32               tt_found_proxy_tasks;  /* Have we found proxy tasks since last barrier */
+  kmp_int32
+      tt_found_proxy_tasks; /* Have we found proxy tasks since last barrier */
 #endif
 
-    KMP_ALIGN_CACHE
-    volatile kmp_uint32     tt_unfinished_threads; /* #threads still active      */
+  KMP_ALIGN_CACHE
+  volatile kmp_uint32 tt_unfinished_threads; /* #threads still active      */
 
-    KMP_ALIGN_CACHE
-    volatile kmp_uint32     tt_active;             /* is the team still actively executing tasks */
+  KMP_ALIGN_CACHE
+  volatile kmp_uint32
+      tt_active; /* is the team still actively executing tasks */
 } kmp_base_task_team_t;
 
 union KMP_ALIGN_CACHE kmp_task_team {
-    kmp_base_task_team_t tt;
-    double               tt_align;       /* use worst case alignment */
-    char                 tt_pad[ KMP_PAD(kmp_base_task_team_t, CACHE_LINE) ];
+  kmp_base_task_team_t tt;
+  double tt_align; /* use worst case alignment */
+  char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
 };
 
-#if ( USE_FAST_MEMORY == 3 ) || ( USE_FAST_MEMORY == 5 )
-// Free lists keep same-size free memory slots for fast memory allocation routines
+#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
+// Free lists keep same-size free memory slots for fast memory allocation
+// routines
 typedef struct kmp_free_list {
-    void             *th_free_list_self;   // Self-allocated tasks free list
-    void             *th_free_list_sync;   // Self-allocated tasks stolen/returned by other threads
-    void             *th_free_list_other;  // Non-self free list (to be returned to owner's sync list)
+  void *th_free_list_self; // Self-allocated tasks free list
+  void *th_free_list_sync; // Self-allocated tasks stolen/returned by other
+  // threads
+  void *th_free_list_other; // Non-self free list (to be returned to owner's
+  // sync list)
 } kmp_free_list_t;
 #endif
 #if KMP_NESTED_HOT_TEAMS
-// Hot teams array keeps hot teams and their sizes for given thread.
-// Hot teams are not put in teams pool, and they don't put threads in threads pool.
+// Hot teams array keeps hot teams and their sizes for given thread. Hot teams
+// are not put in teams pool, and they don't put threads in threads pool.
 typedef struct kmp_hot_team_ptr {
-    kmp_team_p *hot_team;      // pointer to hot_team of given nesting level
-    kmp_int32   hot_team_nth;  // number of threads allocated for the hot_team
+  kmp_team_p *hot_team; // pointer to hot_team of given nesting level
+  kmp_int32 hot_team_nth; // number of threads allocated for the hot_team
 } kmp_hot_team_ptr_t;
 #endif
 #if OMP_40_ENABLED
 typedef struct kmp_teams_size {
-    kmp_int32   nteams;        // number of teams in a league
-    kmp_int32   nth;           // number of threads in each team of the league
+  kmp_int32 nteams; // number of teams in a league
+  kmp_int32 nth; // number of threads in each team of the league
 } kmp_teams_size_t;
 #endif
 
-/* ------------------------------------------------------------------------ */
 // OpenMP thread data structures
-//
 
 typedef struct KMP_ALIGN_CACHE kmp_base_info {
-/*
- * Start with the readonly data which is cache aligned and padded.
- * this is written before the thread starts working by the master.
- * (uber masters may update themselves later)
- * (usage does not consider serialized regions)
- */
-    kmp_desc_t        th_info;
-    kmp_team_p       *th_team;       /* team we belong to */
-    kmp_root_p       *th_root;       /* pointer to root of task hierarchy */
-    kmp_info_p       *th_next_pool;  /* next available thread in the pool */
-    kmp_disp_t       *th_dispatch;   /* thread's dispatch data */
-    int               th_in_pool;    /* in thread pool (32 bits for TCR/TCW) */
-
-    /* The following are cached from the team info structure */
-    /* TODO use these in more places as determined to be needed via profiling */
-    int               th_team_nproc;      /* number of threads in a team */
-    kmp_info_p       *th_team_master;     /* the team's master thread */
-    int               th_team_serialized; /* team is serialized */
-#if OMP_40_ENABLED
-    microtask_t       th_teams_microtask; /* save entry address for teams construct */
-    int               th_teams_level;     /* save initial level of teams construct */
-                                          /* it is 0 on device but may be any on host */
-#endif
-
-    /* The blocktime info is copied from the team struct to the thread sruct */
-    /* at the start of a barrier, and the values stored in the team are used */
-    /* at points in the code where the team struct is no longer guaranteed   */
-    /* to exist (from the POV of worker threads).                            */
+  /* Start with the readonly data which is cache aligned and padded. This is
+     written before the thread starts working by the master. Uber masters may
+     update themselves later. Usage does not consider serialized regions.  */
+  kmp_desc_t th_info;
+  kmp_team_p *th_team; /* team we belong to */
+  kmp_root_p *th_root; /* pointer to root of task hierarchy */
+  kmp_info_p *th_next_pool; /* next available thread in the pool */
+  kmp_disp_t *th_dispatch; /* thread's dispatch data */
+  int th_in_pool; /* in thread pool (32 bits for TCR/TCW) */
+
+  /* The following are cached from the team info structure */
+  /* TODO use these in more places as determined to be needed via profiling */
+  int th_team_nproc; /* number of threads in a team */
+  kmp_info_p *th_team_master; /* the team's master thread */
+  int th_team_serialized; /* team is serialized */
+#if OMP_40_ENABLED
+  microtask_t th_teams_microtask; /* save entry address for teams construct */
+  int th_teams_level; /* save initial level of teams construct */
+/* it is 0 on device but may be any on host */
+#endif
+
+/* The blocktime info is copied from the team struct to the thread struct */
+/* at the start of a barrier, and the values stored in the team are used */
+/* at points in the code where the team struct is no longer guaranteed   */
+/* to exist (from the POV of worker threads).                            */
 #if KMP_USE_MONITOR
-    int               th_team_bt_intervals;
-    int               th_team_bt_set;
+  int th_team_bt_intervals;
+  int th_team_bt_set;
 #else
-    kmp_uint64        th_team_bt_intervals;
+  kmp_uint64 th_team_bt_intervals;
 #endif
 
 
 #if KMP_AFFINITY_SUPPORTED
-    kmp_affin_mask_t  *th_affin_mask; /* thread's current affinity mask */
+  kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */
 #endif
 
-/*
- * The data set by the master at reinit, then R/W by the worker
- */
-    KMP_ALIGN_CACHE int     th_set_nproc;  /* if > 0, then only use this request for the next fork */
+  /* The data set by the master at reinit, then R/W by the worker */
+  KMP_ALIGN_CACHE int
+      th_set_nproc; /* if > 0, then only use this request for the next fork */
 #if KMP_NESTED_HOT_TEAMS
-    kmp_hot_team_ptr_t     *th_hot_teams;     /* array of hot teams */
+  kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */
 #endif
 #if OMP_40_ENABLED
-    kmp_proc_bind_t         th_set_proc_bind; /* if != proc_bind_default, use request for next fork */
-    kmp_teams_size_t        th_teams_size;    /* number of teams/threads in teams construct */
-# if KMP_AFFINITY_SUPPORTED
-    int                     th_current_place; /* place currently bound to */
-    int                     th_new_place;     /* place to bind to in par reg */
-    int                     th_first_place;   /* first place in partition */
-    int                     th_last_place;    /* last place in partition */
-# endif
+  kmp_proc_bind_t
+      th_set_proc_bind; /* if != proc_bind_default, use request for next fork */
+  kmp_teams_size_t
+      th_teams_size; /* number of teams/threads in teams construct */
+#if KMP_AFFINITY_SUPPORTED
+  int th_current_place; /* place currently bound to */
+  int th_new_place; /* place to bind to in par reg */
+  int th_first_place; /* first place in partition */
+  int th_last_place; /* last place in partition */
+#endif
 #endif
 #if USE_ITT_BUILD
-    kmp_uint64              th_bar_arrive_time;           /* arrival to barrier timestamp */
-    kmp_uint64              th_bar_min_time;              /* minimum arrival time at the barrier */
-    kmp_uint64              th_frame_time;                /* frame timestamp */
+  kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */
+  kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */
+  kmp_uint64 th_frame_time; /* frame timestamp */
 #endif /* USE_ITT_BUILD */
-    kmp_local_t             th_local;
-    struct private_common  *th_pri_head;
+  kmp_local_t th_local;
+  struct private_common *th_pri_head;
 
-/*
- * Now the data only used by the worker (after initial allocation)
- */
-    /* TODO the first serial team should actually be stored in the info_t
-     * structure.  this will help reduce initial allocation overhead */
-    KMP_ALIGN_CACHE kmp_team_p *th_serial_team; /*serialized team held in reserve*/
+  /* Now the data only used by the worker (after initial allocation) */
+  /* TODO the first serial team should actually be stored in the info_t
+     structure.  this will help reduce initial allocation overhead */
+  KMP_ALIGN_CACHE kmp_team_p
+      *th_serial_team; /*serialized team held in reserve*/
 
 #if OMPT_SUPPORT
-    ompt_thread_info_t      ompt_thread_info;
+  ompt_thread_info_t ompt_thread_info;
 #endif
 
-/* The following are also read by the master during reinit */
-    struct common_table    *th_pri_common;
-
-    volatile kmp_uint32     th_spin_here;   /* thread-local location for spinning */
-                                            /* while awaiting queuing lock acquire */
-
-    volatile void          *th_sleep_loc;   // this points at a kmp_flag<T>
-
-    ident_t          *th_ident;
-    unsigned         th_x;                     // Random number generator data
-    unsigned         th_a;                     // Random number generator data
-
-/*
- * Tasking-related data for the thread
- */
-    kmp_task_team_t    * th_task_team;           // Task team struct
-    kmp_taskdata_t     * th_current_task;        // Innermost Task being executed
-    kmp_uint8            th_task_state;          // alternating 0/1 for task team identification
-    kmp_uint8          * th_task_state_memo_stack;  // Stack holding memos of th_task_state at nested levels
-    kmp_uint32           th_task_state_top;         // Top element of th_task_state_memo_stack
-    kmp_uint32           th_task_state_stack_sz;    // Size of th_task_state_memo_stack
-    kmp_uint32           th_reap_state;  // Non-zero indicates thread is not
-                                         // tasking, thus safe to reap
-
-    /*
-     * More stuff for keeping track of active/sleeping threads
-     * (this part is written by the worker thread)
-     */
-    kmp_uint8            th_active_in_pool;      // included in count of
-                                                 // #active threads in pool
-    int                  th_active;              // ! sleeping
-                                                 // 32 bits for TCR/TCW
-
-    struct cons_header * th_cons; // used for consistency check
-
-/*
- * Add the syncronizing data which is cache aligned and padded.
- */
-    KMP_ALIGN_CACHE kmp_balign_t      th_bar[ bs_last_barrier ];
+  /* The following are also read by the master during reinit */
+  struct common_table *th_pri_common;
 
-    KMP_ALIGN_CACHE volatile     kmp_int32    th_next_waiting;  /* gtid+1 of next thread on lock wait queue, 0 if none */
+  volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */
+  /* while awaiting queuing lock acquire */
 
-#if ( USE_FAST_MEMORY == 3 ) || ( USE_FAST_MEMORY == 5 )
-    #define NUM_LISTS 4
-    kmp_free_list_t   th_free_lists[NUM_LISTS];   // Free lists for fast memory allocation routines
+  volatile void *th_sleep_loc; // this points at a kmp_flag<T>
+
+  ident_t *th_ident;
+  unsigned th_x; // Random number generator data
+  unsigned th_a; // Random number generator data
+
+  /* Tasking-related data for the thread */
+  kmp_task_team_t *th_task_team; // Task team struct
+  kmp_taskdata_t *th_current_task; // Innermost Task being executed
+  kmp_uint8 th_task_state; // alternating 0/1 for task team identification
+  kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
+  // at nested levels
+  kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
+  kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
+  kmp_uint32 th_reap_state; // Non-zero indicates thread is not
+  // tasking, thus safe to reap
+
+  /* More stuff for keeping track of active/sleeping threads (this part is
+     written by the worker thread) */
+  kmp_uint8 th_active_in_pool; // included in count of #active threads in pool
+  int th_active; // ! sleeping; 32 bits for TCR/TCW
+  struct cons_header *th_cons; // used for consistency check
+
+  /* Add the synchronizing data which is cache aligned and padded. */
+  KMP_ALIGN_CACHE kmp_balign_t th_bar[bs_last_barrier];
+
+  KMP_ALIGN_CACHE volatile kmp_int32
+      th_next_waiting; /* gtid+1 of next thread on lock wait queue, 0 if none */
+
+#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
+#define NUM_LISTS 4
+  kmp_free_list_t th_free_lists[NUM_LISTS]; // Free lists for fast memory
+// allocation routines
 #endif
 
 #if KMP_OS_WINDOWS
-    kmp_win32_cond_t  th_suspend_cv;
-    kmp_win32_mutex_t th_suspend_mx;
-    int               th_suspend_init;
+  kmp_win32_cond_t th_suspend_cv;
+  kmp_win32_mutex_t th_suspend_mx;
+  int th_suspend_init;
 #endif
 #if KMP_OS_UNIX
-    kmp_cond_align_t  th_suspend_cv;
-    kmp_mutex_align_t th_suspend_mx;
-    int               th_suspend_init_count;
+  kmp_cond_align_t th_suspend_cv;
+  kmp_mutex_align_t th_suspend_mx;
+  int th_suspend_init_count;
 #endif
 
 #if USE_ITT_BUILD
-    kmp_itt_mark_t        th_itt_mark_single;
-    // alignment ???
+  kmp_itt_mark_t th_itt_mark_single;
+// alignment ???
 #endif /* USE_ITT_BUILD */
 #if KMP_STATS_ENABLED
-    kmp_stats_list* th_stats;
+  kmp_stats_list *th_stats;
 #endif
 } kmp_base_info_t;
 
 typedef union KMP_ALIGN_CACHE kmp_info {
-    double          th_align;        /* use worst case alignment */
-    char            th_pad[ KMP_PAD(kmp_base_info_t, CACHE_LINE) ];
-    kmp_base_info_t th;
+  double th_align; /* use worst case alignment */
+  char th_pad[KMP_PAD(kmp_base_info_t, CACHE_LINE)];
+  kmp_base_info_t th;
 } kmp_info_t;
 
-/* ------------------------------------------------------------------------ */
 // OpenMP thread team data structures
-//
-typedef struct kmp_base_data {
-    volatile kmp_uint32 t_value;
-} kmp_base_data_t;
+
+typedef struct kmp_base_data { volatile kmp_uint32 t_value; } kmp_base_data_t;
 
 typedef union KMP_ALIGN_CACHE kmp_sleep_team {
-    double              dt_align;        /* use worst case alignment */
-    char                dt_pad[ KMP_PAD(kmp_base_data_t, CACHE_LINE) ];
-    kmp_base_data_t     dt;
+  double dt_align; /* use worst case alignment */
+  char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)];
+  kmp_base_data_t dt;
 } kmp_sleep_team_t;
 
 typedef union KMP_ALIGN_CACHE kmp_ordered_team {
-    double              dt_align;        /* use worst case alignment */
-    char                dt_pad[ KMP_PAD(kmp_base_data_t, CACHE_LINE) ];
-    kmp_base_data_t     dt;
+  double dt_align; /* use worst case alignment */
+  char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)];
+  kmp_base_data_t dt;
 } kmp_ordered_team_t;
 
-typedef int     (*launch_t)( int gtid );
+typedef int (*launch_t)(int gtid);
 
 /* Minimum number of ARGV entries to malloc if necessary */
-#define KMP_MIN_MALLOC_ARGV_ENTRIES     100
+#define KMP_MIN_MALLOC_ARGV_ENTRIES 100
 
-// Set up how many argv pointers will fit in cache lines containing t_inline_argv. Historically, we
-// have supported at least 96 bytes. Using a larger value for more space between the master write/worker
-// read section and read/write by all section seems to buy more performance on EPCC PARALLEL.
+// Set up how many argv pointers will fit in cache lines containing
+// t_inline_argv. Historically, we have supported at least 96 bytes. Using a
+// larger value for more space between the master write/worker read section and
+// read/write by all section seems to buy more performance on EPCC PARALLEL.
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-# define KMP_INLINE_ARGV_BYTES         ( 4 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + sizeof(kmp_int16) + sizeof(kmp_uint32) ) % CACHE_LINE ) )
+#define KMP_INLINE_ARGV_BYTES                                                  \
+  (4 * CACHE_LINE -                                                            \
+   ((3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) +               \
+     sizeof(kmp_int16) + sizeof(kmp_uint32)) %                                 \
+    CACHE_LINE))
 #else
-# define KMP_INLINE_ARGV_BYTES         ( 2 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) ) % CACHE_LINE ) )
+#define KMP_INLINE_ARGV_BYTES                                                  \
+  (2 * CACHE_LINE - ((3 * KMP_PTR_SKIP + 2 * sizeof(int)) % CACHE_LINE))
 #endif
-#define KMP_INLINE_ARGV_ENTRIES        (int)( KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP )
+#define KMP_INLINE_ARGV_ENTRIES (int)(KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP)
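
A worked instance of the arithmetic above under assumed sizes for a 64-bit x86
build (KMP_PTR_SKIP == 8, sizeof(int) == 4, CACHE_LINE == 64): the modulo term
appears to account for the scalar team fields that share these cache lines with
t_inline_argv, so the group spans a whole number of cache lines while leaving
at least the historical 96 bytes of inline argv storage.

  #include <cstdio>

  int main() {
    // Assumed sizes for a 64-bit x86 build: pointers 8 bytes, int 4 bytes,
    // CACHE_LINE 64. The terms of KMP_INLINE_ARGV_BYTES (3 pointers, 2 ints,
    // 2 int8s, an int16 and a uint32) appear to match t_argv/t_pkfn/t_invoke,
    // t_argc/t_nproc and the FP-control fields around t_inline_argv.
    const int cache_line = 64, ptr_skip = 8;
    const int neighbors = 3 * 8 + 2 * 4 + 2 * 1 + 2 + 4;                  // 40
    const int inline_argv_bytes = 4 * cache_line - neighbors % cache_line; // 216
    const int inline_argv_entries = inline_argv_bytes / ptr_skip;          // 27
    std::printf("bytes=%d entries=%d total=%d\n", inline_argv_bytes,
                inline_argv_entries, neighbors + inline_argv_bytes); // total 256
    return 0;
  }
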
 
 typedef struct KMP_ALIGN_CACHE kmp_base_team {
-    // Synchronization Data ---------------------------------------------------------------------------------
-    KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered;
-    kmp_balign_team_t        t_bar[ bs_last_barrier ];
-    volatile int             t_construct;    // count of single directive encountered by team
-    kmp_lock_t               t_single_lock;  // team specific lock
-
-    // Master only -----------------------------------------------------------------------------------------
-    KMP_ALIGN_CACHE int      t_master_tid;   // tid of master in parent team
-    int                      t_master_this_cons; // "this_construct" single counter of master in parent team
-    ident_t                 *t_ident;        // if volatile, have to change too much other crud to volatile too
-    kmp_team_p              *t_parent;       // parent team
-    kmp_team_p              *t_next_pool;    // next free team in the team pool
-    kmp_disp_t              *t_dispatch;     // thread's dispatch data
-    kmp_task_team_t         *t_task_team[2]; // Task team struct; switch between 2
+  // Synchronization Data
+  // ---------------------------------------------------------------------------
+  KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered;
+  kmp_balign_team_t t_bar[bs_last_barrier];
+  volatile int t_construct; // count of single directive encountered by team
+  kmp_lock_t t_single_lock; // team specific lock
+
+  // Master only
+  // ---------------------------------------------------------------------------
+  KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
+  int t_master_this_cons; // "this_construct" single counter of master in parent
+  // team
+  ident_t *t_ident; // if volatile, have to change too much other crud to
+  // volatile too
+  kmp_team_p *t_parent; // parent team
+  kmp_team_p *t_next_pool; // next free team in the team pool
+  kmp_disp_t *t_dispatch; // thread's dispatch data
+  kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
 #if OMP_40_ENABLED
-    kmp_proc_bind_t          t_proc_bind;    // bind type for par region
+  kmp_proc_bind_t t_proc_bind; // bind type for par region
 #endif // OMP_40_ENABLED
 #if USE_ITT_BUILD
-    kmp_uint64               t_region_time;  // region begin timestamp
+  kmp_uint64 t_region_time; // region begin timestamp
 #endif /* USE_ITT_BUILD */
 
-    // Master write, workers read --------------------------------------------------------------------------
-    KMP_ALIGN_CACHE void   **t_argv;
-    int                      t_argc;
-    int                      t_nproc;        // number of threads in team
-    microtask_t              t_pkfn;
-    launch_t                 t_invoke;       // procedure to launch the microtask
+  // Master write, workers read
+  // --------------------------------------------------------------------------
+  KMP_ALIGN_CACHE void **t_argv;
+  int t_argc;
+  int t_nproc; // number of threads in team
+  microtask_t t_pkfn;
+  launch_t t_invoke; // procedure to launch the microtask
 
 #if OMPT_SUPPORT
-    ompt_team_info_t         ompt_team_info;
-    ompt_lw_taskteam_t      *ompt_serialized_team_info;
+  ompt_team_info_t ompt_team_info;
+  ompt_lw_taskteam_t *ompt_serialized_team_info;
 #endif
 
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-    kmp_int8                 t_fp_control_saved;
-    kmp_int8                 t_pad2b;
-    kmp_int16                t_x87_fpu_control_word; // FP control regs
-    kmp_uint32               t_mxcsr;
+  kmp_int8 t_fp_control_saved;
+  kmp_int8 t_pad2b;
+  kmp_int16 t_x87_fpu_control_word; // FP control regs
+  kmp_uint32 t_mxcsr;
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
-    void                    *t_inline_argv[ KMP_INLINE_ARGV_ENTRIES ];
+  void *t_inline_argv[KMP_INLINE_ARGV_ENTRIES];
 
-    KMP_ALIGN_CACHE kmp_info_t **t_threads;
-    kmp_taskdata_t *t_implicit_task_taskdata;  // Taskdata for the thread's implicit task
-    int                      t_level;          // nested parallel level
-
-    KMP_ALIGN_CACHE int      t_max_argc;
-    int                      t_max_nproc;    // maximum threads this team can handle (dynamicly expandable)
-    int                      t_serialized;   // levels deep of serialized teams
-    dispatch_shared_info_t  *t_disp_buffer;  // buffers for dispatch system
-    int                      t_id;           // team's id, assigned by debugger.
-    int                      t_active_level; // nested active parallel level
-    kmp_r_sched_t            t_sched;        // run-time schedule for the team
+  KMP_ALIGN_CACHE kmp_info_t **t_threads;
+  kmp_taskdata_t
+      *t_implicit_task_taskdata; // Taskdata for the thread's implicit task
+  int t_level; // nested parallel level
+
+  KMP_ALIGN_CACHE int t_max_argc;
+  int t_max_nproc; // max threads this team can handle (dynamically expandable)
+  int t_serialized; // levels deep of serialized teams
+  dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system
+  int t_id; // team's id, assigned by debugger.
+  int t_active_level; // nested active parallel level
+  kmp_r_sched_t t_sched; // run-time schedule for the team
 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
-    int                      t_first_place;  // first & last place in parent thread's partition.
-    int                      t_last_place;   // Restore these values to master after par region.
+  int t_first_place; // first & last place in parent thread's partition.
+  int t_last_place; // Restore these values to master after par region.
 #endif // OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
-    int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call
+  int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via
+// omp_set_num_threads() call
 
-    // Read/write by workers as well -----------------------------------------------------------------------
+// Read/write by workers as well
 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
-    // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel'
-    // and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel'
-    // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid if this padding.
-    char dummy_padding[1024];
-#endif
-    KMP_ALIGN_CACHE kmp_internal_control_t *t_control_stack_top;  // internal control stack for additional nested teams.
-                                                   // for SERIALIZED teams nested 2 or more levels deep
-#if OMP_40_ENABLED
-    kmp_int32                t_cancel_request; // typed flag to store request state of cancellation
-#endif
-    int                      t_master_active;  // save on fork, restore on join
-    kmp_taskq_t              t_taskq;          // this team's task queue
-    void                    *t_copypriv_data;  // team specific pointer to copyprivate data array
-    kmp_uint32               t_copyin_counter;
+  // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf
+  // regression of epcc 'parallel' and 'barrier' on fxe256lin01. This extra
+  // padding serves to fix the performance of epcc 'parallel' and 'barrier' when
+  // CACHE_LINE=64. TODO: investigate more and get rid of this padding.
+  char dummy_padding[1024];
+#endif
+  // Internal control stack for additional nested teams.
+  KMP_ALIGN_CACHE kmp_internal_control_t *t_control_stack_top;
+// for SERIALIZED teams nested 2 or more levels deep
+#if OMP_40_ENABLED
+  // typed flag to store request state of cancellation
+  kmp_int32 t_cancel_request;
+#endif
+  int t_master_active; // save on fork, restore on join
+  kmp_taskq_t t_taskq; // this team's task queue
+  void *t_copypriv_data; // team specific pointer to copyprivate data array
+  kmp_uint32 t_copyin_counter;
 #if USE_ITT_BUILD
-    void                    *t_stack_id;       // team specific stack stitching id (for ittnotify)
+  void *t_stack_id; // team specific stack stitching id (for ittnotify)
 #endif /* USE_ITT_BUILD */
 } kmp_base_team_t;
 
 union KMP_ALIGN_CACHE kmp_team {
-    kmp_base_team_t     t;
-    double              t_align;       /* use worst case alignment */
-    char                t_pad[ KMP_PAD(kmp_base_team_t, CACHE_LINE) ];
+  kmp_base_team_t t;
+  double t_align; /* use worst case alignment */
+  char t_pad[KMP_PAD(kmp_base_team_t, CACHE_LINE)];
 };
 
-
 typedef union KMP_ALIGN_CACHE kmp_time_global {
-    double              dt_align;        /* use worst case alignment */
-    char                dt_pad[ KMP_PAD(kmp_base_data_t, CACHE_LINE) ];
-    kmp_base_data_t     dt;
+  double dt_align; /* use worst case alignment */
+  char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)];
+  kmp_base_data_t dt;
 } kmp_time_global_t;
 
 typedef struct kmp_base_global {
-    /* cache-aligned */
-    kmp_time_global_t   g_time;
+  /* cache-aligned */
+  kmp_time_global_t g_time;
 
-    /* non cache-aligned */
-    volatile int        g_abort;
-    volatile int        g_done;
+  /* non cache-aligned */
+  volatile int g_abort;
+  volatile int g_done;
 
-    int                 g_dynamic;
-    enum dynamic_mode   g_dynamic_mode;
+  int g_dynamic;
+  enum dynamic_mode g_dynamic_mode;
 } kmp_base_global_t;
 
 typedef union KMP_ALIGN_CACHE kmp_global {
-    kmp_base_global_t   g;
-    double              g_align;        /* use worst case alignment */
-    char                g_pad[ KMP_PAD(kmp_base_global_t, CACHE_LINE) ];
+  kmp_base_global_t g;
+  double g_align; /* use worst case alignment */
+  char g_pad[KMP_PAD(kmp_base_global_t, CACHE_LINE)];
 } kmp_global_t;
 
-
 typedef struct kmp_base_root {
-    // TODO: GEH - combine r_active with r_in_parallel then r_active == (r_in_parallel>= 0)
-    // TODO: GEH - then replace r_active with t_active_levels if we can to reduce the synch
-    //             overhead or keeping r_active
-
-    volatile int        r_active;       /* TRUE if some region in a nest has > 1 thread */
-                                        // GEH: This is misnamed, should be r_in_parallel
-    volatile int        r_nested;       // TODO: GEH - This is unused, just remove it entirely.
-    int                 r_in_parallel;  /* keeps a count of active parallel regions per root */
-                                        // GEH: This is misnamed, should be r_active_levels
-    kmp_team_t         *r_root_team;
-    kmp_team_t         *r_hot_team;
-    kmp_info_t         *r_uber_thread;
-    kmp_lock_t          r_begin_lock;
-    volatile int        r_begin;
-    int                 r_blocktime; /* blocktime for this root and descendants */
+  // TODO: GEH - combine r_active with r_in_parallel then r_active ==
+  // (r_in_parallel>= 0)
+  // TODO: GEH - then replace r_active with t_active_levels if we can to reduce
+  // the synch overhead or keeping r_active
+  volatile int r_active; /* TRUE if some region in a nest has > 1 thread */
+  // GEH: This is misnamed, should be r_in_parallel
+  volatile int r_nested; // TODO: GEH - This is unused, just remove it entirely.
+  int r_in_parallel; /* keeps a count of active parallel regions per root */
+  // GEH: This is misnamed, should be r_active_levels
+  kmp_team_t *r_root_team;
+  kmp_team_t *r_hot_team;
+  kmp_info_t *r_uber_thread;
+  kmp_lock_t r_begin_lock;
+  volatile int r_begin;
+  int r_blocktime; /* blocktime for this root and descendants */
 } kmp_base_root_t;
 
 typedef union KMP_ALIGN_CACHE kmp_root {
-    kmp_base_root_t     r;
-    double              r_align;        /* use worst case alignment */
-    char                r_pad[ KMP_PAD(kmp_base_root_t, CACHE_LINE) ];
+  kmp_base_root_t r;
+  double r_align; /* use worst case alignment */
+  char r_pad[KMP_PAD(kmp_base_root_t, CACHE_LINE)];
 } kmp_root_t;
 
 struct fortran_inx_info {
-    kmp_int32   data;
+  kmp_int32 data;
 };
 
 /* ------------------------------------------------------------------------ */
 
-/* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-
-extern int      __kmp_settings;
-extern int      __kmp_duplicate_library_ok;
+extern int __kmp_settings;
+extern int __kmp_duplicate_library_ok;
 #if USE_ITT_BUILD
-extern int      __kmp_forkjoin_frames;
-extern int      __kmp_forkjoin_frames_mode;
+extern int __kmp_forkjoin_frames;
+extern int __kmp_forkjoin_frames_mode;
 #endif
 extern PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method;
-extern int      __kmp_determ_red;
+extern int __kmp_determ_red;
 
 #ifdef KMP_DEBUG
-extern int      kmp_a_debug;
-extern int      kmp_b_debug;
-extern int      kmp_c_debug;
-extern int      kmp_d_debug;
-extern int      kmp_e_debug;
-extern int      kmp_f_debug;
+extern int kmp_a_debug;
+extern int kmp_b_debug;
+extern int kmp_c_debug;
+extern int kmp_d_debug;
+extern int kmp_e_debug;
+extern int kmp_f_debug;
 #endif /* KMP_DEBUG */
 
 /* For debug information logging using rotating buffer */
-#define KMP_DEBUG_BUF_LINES_INIT        512
-#define KMP_DEBUG_BUF_LINES_MIN         1
+#define KMP_DEBUG_BUF_LINES_INIT 512
+#define KMP_DEBUG_BUF_LINES_MIN 1
 
-#define KMP_DEBUG_BUF_CHARS_INIT        128
-#define KMP_DEBUG_BUF_CHARS_MIN         2
+#define KMP_DEBUG_BUF_CHARS_INIT 128
+#define KMP_DEBUG_BUF_CHARS_MIN 2
 
-extern int     __kmp_debug_buf;            /* TRUE means use buffer, FALSE means print to stderr */
-extern int     __kmp_debug_buf_lines;      /* How many lines of debug stored in buffer */
-extern int     __kmp_debug_buf_chars;      /* How many characters allowed per line in buffer */
-extern int     __kmp_debug_buf_atomic;     /* TRUE means use atomic update of buffer entry pointer */
-
-extern char   *__kmp_debug_buffer;         /* Debug buffer itself */
-extern int     __kmp_debug_count;          /* Counter for number of lines printed in buffer so far */
-extern int     __kmp_debug_buf_warn_chars; /* Keep track of char increase recommended in warnings */
+extern int
+    __kmp_debug_buf; /* TRUE means use buffer, FALSE means print to stderr */
+extern int __kmp_debug_buf_lines; /* How many lines of debug stored in buffer */
+extern int
+    __kmp_debug_buf_chars; /* How many characters allowed per line in buffer */
+extern int __kmp_debug_buf_atomic; /* TRUE means use atomic update of buffer
+                                      entry pointer */
+
+extern char *__kmp_debug_buffer; /* Debug buffer itself */
+extern int __kmp_debug_count; /* Counter for number of lines printed in buffer
+                                 so far */
+extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase
+                                          recommended in warnings */
 /* end rotating debug buffer */
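
A hedged sketch of how a rotating line buffer parameterized this way can work:
each print claims the next counter value and wraps into one of
__kmp_debug_buf_lines slots of __kmp_debug_buf_chars characters, so only the
newest lines survive. The names below are local stand-ins; the runtime's actual
implementation lives elsewhere and may differ.

  #include <cstdio>
  #include <vector>

  int main() {
    const int buf_lines = 512; // like KMP_DEBUG_BUF_LINES_INIT
    const int buf_chars = 128; // like KMP_DEBUG_BUF_CHARS_INIT
    std::vector<char> buffer(buf_lines * buf_chars, '\0');
    int count = 0;             // like __kmp_debug_count

    for (int i = 0; i < 1000; ++i) {
      int slot = count % buf_lines; // rotation: old lines get overwritten
      std::snprintf(&buffer[slot * buf_chars], buf_chars, "debug line %d", i);
      ++count;
    }
    // Only the newest buf_lines entries remain; older ones were overwritten.
    std::printf("last slot holds: %s\n",
                &buffer[((count - 1) % buf_lines) * buf_chars]);
    return 0;
  }
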
 
 #ifdef KMP_DEBUG
-extern int      __kmp_par_range;           /* +1 => only go par for constructs in range */
+extern int __kmp_par_range; /* +1 => only go par for constructs in range */
 
-#define KMP_PAR_RANGE_ROUTINE_LEN       1024
-extern char     __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN];
-#define KMP_PAR_RANGE_FILENAME_LEN      1024
-extern char     __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN];
-extern int      __kmp_par_range_lb;
-extern int      __kmp_par_range_ub;
+#define KMP_PAR_RANGE_ROUTINE_LEN 1024
+extern char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN];
+#define KMP_PAR_RANGE_FILENAME_LEN 1024
+extern char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN];
+extern int __kmp_par_range_lb;
+extern int __kmp_par_range_ub;
 #endif
 
 /* For printing out dynamic storage map for threads and teams */
-extern int      __kmp_storage_map;         /* True means print storage map for threads and teams */
-extern int      __kmp_storage_map_verbose; /* True means storage map includes placement info */
-extern int      __kmp_storage_map_verbose_specified;
+extern int
+    __kmp_storage_map; /* True means print storage map for threads and teams */
+extern int __kmp_storage_map_verbose; /* True means storage map includes
+                                         placement info */
+extern int __kmp_storage_map_verbose_specified;
 
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-extern kmp_cpuinfo_t    __kmp_cpuinfo;
+extern kmp_cpuinfo_t __kmp_cpuinfo;
 #endif
 
 extern volatile int __kmp_init_serial;
@@ -2638,65 +2775,72 @@ extern int __kmp_version;
 extern kmp_cached_addr_t *__kmp_threadpriv_cache_list;
 
 /* Barrier algorithm types and options */
-extern kmp_uint32    __kmp_barrier_gather_bb_dflt;
-extern kmp_uint32    __kmp_barrier_release_bb_dflt;
+extern kmp_uint32 __kmp_barrier_gather_bb_dflt;
+extern kmp_uint32 __kmp_barrier_release_bb_dflt;
 extern kmp_bar_pat_e __kmp_barrier_gather_pat_dflt;
 extern kmp_bar_pat_e __kmp_barrier_release_pat_dflt;
-extern kmp_uint32    __kmp_barrier_gather_branch_bits  [ bs_last_barrier ];
-extern kmp_uint32    __kmp_barrier_release_branch_bits [ bs_last_barrier ];
-extern kmp_bar_pat_e __kmp_barrier_gather_pattern      [ bs_last_barrier ];
-extern kmp_bar_pat_e __kmp_barrier_release_pattern     [ bs_last_barrier ];
-extern char const   *__kmp_barrier_branch_bit_env_name [ bs_last_barrier ];
-extern char const   *__kmp_barrier_pattern_env_name    [ bs_last_barrier ];
-extern char const   *__kmp_barrier_type_name           [ bs_last_barrier ];
-extern char const   *__kmp_barrier_pattern_name        [ bp_last_bar ];
+extern kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier];
+extern kmp_uint32 __kmp_barrier_release_branch_bits[bs_last_barrier];
+extern kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier];
+extern kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier];
+extern char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier];
+extern char const *__kmp_barrier_pattern_env_name[bs_last_barrier];
+extern char const *__kmp_barrier_type_name[bs_last_barrier];
+extern char const *__kmp_barrier_pattern_name[bp_last_bar];
 
 /* Global Locks */
-extern kmp_bootstrap_lock_t __kmp_initz_lock;     /* control initialization */
-extern kmp_bootstrap_lock_t __kmp_forkjoin_lock;  /* control fork/join access */
-extern kmp_bootstrap_lock_t __kmp_exit_lock;      /* exit() is not always thread-safe */
+extern kmp_bootstrap_lock_t __kmp_initz_lock; /* control initialization */
+extern kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
+extern kmp_bootstrap_lock_t
+    __kmp_exit_lock; /* exit() is not always thread-safe */
 #if KMP_USE_MONITOR
-extern kmp_bootstrap_lock_t __kmp_monitor_lock;   /* control monitor thread creation */
+extern kmp_bootstrap_lock_t
+    __kmp_monitor_lock; /* control monitor thread creation */
 #endif
-extern kmp_bootstrap_lock_t __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */
-
-extern kmp_lock_t __kmp_global_lock;    /* control OS/global access  */
-extern kmp_queuing_lock_t __kmp_dispatch_lock;  /* control dispatch access  */
-extern kmp_lock_t __kmp_debug_lock;     /* control I/O access for KMP_DEBUG */
+extern kmp_bootstrap_lock_t
+    __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and
+                             __kmp_threads expansion to co-exist */
+
+extern kmp_lock_t __kmp_global_lock; /* control OS/global access  */
+extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access  */
+extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */
 
 /* used for yielding spin-waits */
-extern unsigned int __kmp_init_wait;    /* initial number of spin-tests   */
-extern unsigned int __kmp_next_wait;    /* susequent number of spin-tests */
+extern unsigned int __kmp_init_wait; /* initial number of spin-tests   */
+extern unsigned int __kmp_next_wait; /* subsequent number of spin-tests */
 
 extern enum library_type __kmp_library;
 
-extern enum sched_type  __kmp_sched;    /* default runtime scheduling */
-extern enum sched_type  __kmp_static;   /* default static scheduling method */
-extern enum sched_type  __kmp_guided;   /* default guided scheduling method */
-extern enum sched_type  __kmp_auto;     /* default auto scheduling method */
-extern int              __kmp_chunk;    /* default runtime chunk size */
+extern enum sched_type __kmp_sched; /* default runtime scheduling */
+extern enum sched_type __kmp_static; /* default static scheduling method */
+extern enum sched_type __kmp_guided; /* default guided scheduling method */
+extern enum sched_type __kmp_auto; /* default auto scheduling method */
+extern int __kmp_chunk; /* default runtime chunk size */
 
-extern size_t     __kmp_stksize;        /* stack size per thread         */
+extern size_t __kmp_stksize; /* stack size per thread         */
 #if KMP_USE_MONITOR
-extern size_t     __kmp_monitor_stksize;/* stack size for monitor thread */
+extern size_t __kmp_monitor_stksize; /* stack size for monitor thread */
 #endif
-extern size_t     __kmp_stkoffset;      /* stack offset per thread       */
-extern int        __kmp_stkpadding;     /* Should we pad root thread(s) stack */
+extern size_t __kmp_stkoffset; /* stack offset per thread       */
+extern int __kmp_stkpadding; /* Should we pad root thread(s) stack */
 
-extern size_t     __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */
-extern int        __kmp_env_chunk;      /* was KMP_CHUNK specified?     */
-extern int        __kmp_env_stksize;    /* was KMP_STACKSIZE specified? */
-extern int        __kmp_env_omp_stksize;/* was OMP_STACKSIZE specified? */
-extern int        __kmp_env_all_threads;    /* was KMP_ALL_THREADS or KMP_MAX_THREADS specified? */
-extern int        __kmp_env_omp_all_threads;/* was OMP_THREAD_LIMIT specified? */
-extern int        __kmp_env_blocktime;  /* was KMP_BLOCKTIME specified? */
-extern int        __kmp_env_checks;     /* was KMP_CHECKS specified?    */
-extern int        __kmp_env_consistency_check;     /* was KMP_CONSISTENCY_CHECK specified?    */
-extern int        __kmp_generate_warnings; /* should we issue warnings? */
-extern int        __kmp_reserve_warn;   /* have we issued reserve_threads warning? */
+extern size_t
+    __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */
+extern int __kmp_env_chunk; /* was KMP_CHUNK specified?     */
+extern int __kmp_env_stksize; /* was KMP_STACKSIZE specified? */
+extern int __kmp_env_omp_stksize; /* was OMP_STACKSIZE specified? */
+extern int __kmp_env_all_threads; /* was KMP_ALL_THREADS or KMP_MAX_THREADS
+                                     specified? */
+extern int __kmp_env_omp_all_threads; /* was OMP_THREAD_LIMIT specified? */
+extern int __kmp_env_blocktime; /* was KMP_BLOCKTIME specified? */
+extern int __kmp_env_checks; /* was KMP_CHECKS specified?    */
+extern int
+    __kmp_env_consistency_check; /* was KMP_CONSISTENCY_CHECK specified?    */
+extern int __kmp_generate_warnings; /* should we issue warnings? */
+extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? */
 
 #ifdef DEBUG_SUSPEND
-extern int        __kmp_suspend_count;  /* count inside __kmp_suspend_template() */
+extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */
 #endif
 
 extern kmp_uint32 __kmp_yield_init;
@@ -2706,87 +2850,107 @@ extern kmp_uint32 __kmp_yield_next;
 extern kmp_uint32 __kmp_yielding_on;
 #endif
 extern kmp_uint32 __kmp_yield_cycle;
-extern kmp_int32  __kmp_yield_on_count;
-extern kmp_int32  __kmp_yield_off_count;
+extern kmp_int32 __kmp_yield_on_count;
+extern kmp_int32 __kmp_yield_off_count;
 
 /* ------------------------------------------------------------------------- */
-extern int        __kmp_allThreadsSpecified;
+extern int __kmp_allThreadsSpecified;
 
-extern size_t     __kmp_align_alloc;
+extern size_t __kmp_align_alloc;
 /* following data protected by initialization routines */
-extern int        __kmp_xproc;          /* number of processors in the system */
-extern int        __kmp_avail_proc;      /* number of processors available to the process */
-extern size_t     __kmp_sys_min_stksize; /* system-defined minimum stack size */
-extern int        __kmp_sys_max_nth;    /* system-imposed maximum number of threads */
-extern int        __kmp_max_nth;        /* maximum total number of concurrently-existing threads */
-extern int        __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and __kmp_root */
-extern int        __kmp_dflt_team_nth;  /* default number of threads in a parallel region a la OMP_NUM_THREADS */
-extern int        __kmp_dflt_team_nth_ub; /* upper bound on "" determined at serial initialization */
-extern int        __kmp_tp_capacity;    /* capacity of __kmp_threads if threadprivate is used (fixed) */
-extern int        __kmp_tp_cached;      /* whether threadprivate cache has been created (__kmpc_threadprivate_cached()) */
-extern int        __kmp_dflt_nested;    /* nested parallelism enabled by default a la OMP_NESTED */
-extern int        __kmp_dflt_blocktime; /* number of milliseconds to wait before blocking (env setting) */
+extern int __kmp_xproc; /* number of processors in the system */
+extern int __kmp_avail_proc; /* number of processors available to the process */
+extern size_t __kmp_sys_min_stksize; /* system-defined minimum stack size */
+extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */
+extern int
+    __kmp_max_nth; /* maximum total number of concurrently-existing threads */
+extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and
+                                      __kmp_root */
+extern int __kmp_dflt_team_nth; /* default number of threads in a parallel
+                                   region a la OMP_NUM_THREADS */
+extern int __kmp_dflt_team_nth_ub; /* upper bound on __kmp_dflt_team_nth
+                                      determined at serial initialization */
+extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is
+                                 used (fixed) */
+extern int __kmp_tp_cached; /* whether threadprivate cache has been created
+                               (__kmpc_threadprivate_cached()) */
+extern int __kmp_dflt_nested; /* nested parallelism enabled by default a la
+                                 OMP_NESTED */
+extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before
+                                    blocking (env setting) */
 #if KMP_USE_MONITOR
-extern int        __kmp_monitor_wakeups;/* number of times monitor wakes up per second */
-extern int        __kmp_bt_intervals;   /* number of monitor timestamp intervals before blocking */
+extern int
+    __kmp_monitor_wakeups; /* number of times monitor wakes up per second */
+extern int __kmp_bt_intervals; /* number of monitor timestamp intervals before
+                                  blocking */
 #endif
 #ifdef KMP_ADJUST_BLOCKTIME
-extern int        __kmp_zero_bt;        /* whether blocktime has been forced to zero */
+extern int __kmp_zero_bt; /* whether blocktime has been forced to zero */
 #endif /* KMP_ADJUST_BLOCKTIME */
 #ifdef KMP_DFLT_NTH_CORES
-extern int        __kmp_ncores;         /* Total number of cores for threads placement */
+extern int __kmp_ncores; /* Total number of cores for threads placement */
 #endif
-extern int        __kmp_abort_delay;    /* Number of millisecs to delay on abort for VTune */
+extern int
+    __kmp_abort_delay; /* Number of millisecs to delay on abort for VTune */
 
-extern int        __kmp_need_register_atfork_specified;
-extern int        __kmp_need_register_atfork;/* At initialization, call pthread_atfork to install fork handler */
-extern int        __kmp_gtid_mode;      /* Method of getting gtid, values:
-                                           0 - not set, will be set at runtime
-                                           1 - using stack search
-                                           2 - dynamic TLS (pthread_getspecific(Linux* OS/OS X*) or TlsGetValue(Windows* OS))
-                                           3 - static TLS (__declspec(thread) __kmp_gtid), Linux* OS .so only.
-                                         */
-extern int        __kmp_adjust_gtid_mode; /* If true, adjust method based on #threads */
+extern int __kmp_need_register_atfork_specified;
+extern int
+    __kmp_need_register_atfork; /* At initialization, call pthread_atfork to
+                                   install fork handler */
+extern int __kmp_gtid_mode; /* Method of getting gtid, values:
+                               0 - not set, will be set at runtime
+                               1 - using stack search
+                               2 - dynamic TLS (pthread_getspecific(Linux* OS/OS
+                                   X*) or TlsGetValue(Windows* OS))
+                               3 - static TLS (__declspec(thread) __kmp_gtid),
+                                   Linux* OS .so only.  */
+extern int
+    __kmp_adjust_gtid_mode; /* If true, adjust method based on #threads */
 #ifdef KMP_TDATA_GTID
 #if KMP_OS_WINDOWS
-extern __declspec(thread) int __kmp_gtid; /* This thread's gtid, if __kmp_gtid_mode == 3 */
+extern __declspec(
+    thread) int __kmp_gtid; /* This thread's gtid, if __kmp_gtid_mode == 3 */
 #else
 extern __thread int __kmp_gtid;
-#endif /* KMP_OS_WINDOWS - workaround because Intel(R) Many Integrated Core compiler 20110316 doesn't accept __declspec */
+#endif /* KMP_OS_WINDOWS - workaround because Intel(R) Many Integrated Core    \
+          compiler 20110316 doesn't accept __declspec */
 #endif
-extern int        __kmp_tls_gtid_min;   /* #threads below which use sp search for gtid */
-extern int        __kmp_foreign_tp;     /* If true, separate TP var for each foreign thread */
+extern int __kmp_tls_gtid_min; /* #threads below which use sp search for gtid */
+extern int __kmp_foreign_tp; // If true, separate TP var for each foreign thread
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-extern int        __kmp_inherit_fp_control; /* copy fp creg(s) parent->workers at fork */
-extern kmp_int16  __kmp_init_x87_fpu_control_word; /* init thread's FP control reg */
-extern kmp_uint32 __kmp_init_mxcsr;      /* init thread's mxscr */
+extern int __kmp_inherit_fp_control; // copy fp creg(s) parent->workers at fork
+extern kmp_int16 __kmp_init_x87_fpu_control_word; // init thread's FP ctrl reg
+extern kmp_uint32 __kmp_init_mxcsr; /* init thread's mxcsr */
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
-extern int        __kmp_dflt_max_active_levels; /* max_active_levels for nested parallelism enabled by default a la OMP_MAX_ACTIVE_LEVELS */
-extern int        __kmp_dispatch_num_buffers; /* max possible dynamic loops in concurrent execution per team */
+extern int __kmp_dflt_max_active_levels; /* max_active_levels for nested
+                                            parallelism enabled by default via
+                                            OMP_MAX_ACTIVE_LEVELS */
+extern int __kmp_dispatch_num_buffers; /* max possible dynamic loops in
+                                          concurrent execution per team */
 #if KMP_NESTED_HOT_TEAMS
-extern int        __kmp_hot_teams_mode;
-extern int        __kmp_hot_teams_max_level;
+extern int __kmp_hot_teams_mode;
+extern int __kmp_hot_teams_max_level;
 #endif
 
-# if KMP_OS_LINUX
+#if KMP_OS_LINUX
 extern enum clock_function_type __kmp_clock_function;
 extern int __kmp_clock_function_param;
-# endif /* KMP_OS_LINUX */
+#endif /* KMP_OS_LINUX */
 
 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
 extern enum mic_type __kmp_mic_type;
 #endif
 
-# ifdef USE_LOAD_BALANCE
-extern double      __kmp_load_balance_interval;   /* Interval for the load balance algorithm */
-# endif /* USE_LOAD_BALANCE */
+#ifdef USE_LOAD_BALANCE
+extern double __kmp_load_balance_interval; // load balance algorithm interval
+#endif /* USE_LOAD_BALANCE */
 
 // OpenMP 3.1 - Nested num threads array
 typedef struct kmp_nested_nthreads_t {
-    int * nth;
-    int   size;
-    int   used;
+  int *nth;
+  int size;
+  int used;
 } kmp_nested_nthreads_t;
 
 extern kmp_nested_nthreads_t __kmp_nested_nth;
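
To make the nested num-threads array concrete: a hedged example of the state
__kmp_nested_nth is expected to hold after the settings code (kmp_settings.cpp)
parses a list such as OMP_NUM_THREADS=4,3,2 -- the literal values here are only
an illustration, not output captured from the runtime:

    /* OMP_NUM_THREADS=4,3,2  ->  nth  points at {4, 3, 2}
                                  used = 3   (entries filled in)
                                  size = allocated capacity, size >= used */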
@@ -2795,290 +2959,313 @@ extern kmp_nested_nthreads_t __kmp_neste
 
 // Parameters for the speculative lock backoff system.
 struct kmp_adaptive_backoff_params_t {
-    // Number of soft retries before it counts as a hard retry.
-    kmp_uint32 max_soft_retries;
-    // Badness is a bit mask : 0,1,3,7,15,... on each hard failure we move one to the right
-    kmp_uint32 max_badness;
+  // Number of soft retries before it counts as a hard retry.
+  kmp_uint32 max_soft_retries;
+  // Badness is a bit mask: 0,1,3,7,15,... on each hard failure we shift in
+  // one more bit from the right
+  kmp_uint32 max_badness;
 };
 
 extern kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params;
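
To make the max_badness comment concrete: the sequence 0, 1, 3, 7, 15, ... is
obtained by shifting one more 1 bit in from the right after each hard failure.
A hedged sketch of that update, capped by the tunable above (the real update
lives in kmp_lock.cpp; example_next_badness is a hypothetical helper, not part
of the runtime):

    /* Advance a badness mask through 0,1,3,7,15,..., saturating at the
       configured maximum. */
    static kmp_uint32 example_next_badness(kmp_uint32 badness) {
      kmp_uint32 next = (badness << 1) | 1;
      kmp_uint32 cap = __kmp_adaptive_backoff_params.max_badness;
      return (next > cap) ? cap : next;
    }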
 
 #if KMP_DEBUG_ADAPTIVE_LOCKS
-extern char * __kmp_speculative_statsfile;
+extern char *__kmp_speculative_statsfile;
 #endif
 
 #endif // KMP_USE_ADAPTIVE_LOCKS
 
 #if OMP_40_ENABLED
-extern int __kmp_display_env;           /* TRUE or FALSE */
-extern int __kmp_display_env_verbose;   /* TRUE if OMP_DISPLAY_ENV=VERBOSE */
-extern int __kmp_omp_cancellation;      /* TRUE or FALSE */
+extern int __kmp_display_env; /* TRUE or FALSE */
+extern int __kmp_display_env_verbose; /* TRUE if OMP_DISPLAY_ENV=VERBOSE */
+extern int __kmp_omp_cancellation; /* TRUE or FALSE */
 #endif
 
 /* ------------------------------------------------------------------------- */
 
-/* --------------------------------------------------------------------------- */
 /* the following are protected by the fork/join lock */
 /* write: lock  read: anytime */
-extern          kmp_info_t **__kmp_threads;      /* Descriptors for the threads */
+extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */
 /* read/write: lock */
-extern volatile kmp_team_t  *     __kmp_team_pool;
-extern volatile kmp_info_t  *     __kmp_thread_pool;
+extern volatile kmp_team_t *__kmp_team_pool;
+extern volatile kmp_info_t *__kmp_thread_pool;
 
-/* total number of threads reachable from some root thread including all root threads*/
+// total num threads reachable from some root thread including all root threads
 extern volatile int __kmp_nth;
-/* total number of threads reachable from some root thread including all root threads,
-   and those in the thread pool */
+/* total number of threads reachable from some root thread including all root
+   threads, and those in the thread pool */
 extern volatile int __kmp_all_nth;
 extern int __kmp_thread_pool_nth;
 extern volatile int __kmp_thread_pool_active_nth;
 
-extern kmp_root_t **__kmp_root;         /* root of thread hierarchy */
+extern kmp_root_t **__kmp_root; /* root of thread hierarchy */
 /* end data protected by fork/join lock */
-/* --------------------------------------------------------------------------- */
+/* ------------------------------------------------------------------------- */
 
-extern kmp_global_t  __kmp_global;         /* global status */
+extern kmp_global_t __kmp_global; /* global status */
 
 extern kmp_info_t __kmp_monitor;
-extern volatile kmp_uint32 __kmp_team_counter;      // Used by Debugging Support Library.
-extern volatile kmp_uint32 __kmp_task_counter;      // Used by Debugging Support Library.
+extern volatile kmp_uint32 __kmp_team_counter; // For Debugging Support Library
+extern volatile kmp_uint32 __kmp_task_counter; // For Debugging Support Library
 
 #if USE_DEBUGGER
 
-#define _KMP_GEN_ID( counter )                                         \
-    (                                                                  \
-        __kmp_debugging                                                \
-        ?                                                              \
-        KMP_TEST_THEN_INC32( (volatile kmp_int32 *) & counter ) + 1    \
-        :                                                              \
-        ~ 0                                                            \
-    )
-#else
-#define _KMP_GEN_ID( counter )                                         \
-    (                                                                  \
-        ~ 0                                                            \
-    )
+#define _KMP_GEN_ID(counter)                                                   \
+  (__kmp_debugging ? KMP_TEST_THEN_INC32((volatile kmp_int32 *)&counter) + 1   \
+                   : ~0)
+#else
+#define _KMP_GEN_ID(counter) (~0)
 #endif /* USE_DEBUGGER */
 
-#define KMP_GEN_TASK_ID()    _KMP_GEN_ID( __kmp_task_counter )
-#define KMP_GEN_TEAM_ID()    _KMP_GEN_ID( __kmp_team_counter )
+#define KMP_GEN_TASK_ID() _KMP_GEN_ID(__kmp_task_counter)
+#define KMP_GEN_TEAM_ID() _KMP_GEN_ID(__kmp_team_counter)
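
A short usage note for the ID macros above: with a debugger attached
(__kmp_debugging true) each call atomically hands out the next counter value
plus one; otherwise it evaluates to ~0. Hedged, illustrative use (the local
variables are hypothetical):

    kmp_uint32 team_id = KMP_GEN_TEAM_ID(); /* bumps __kmp_team_counter or ~0 */
    kmp_uint32 task_id = KMP_GEN_TASK_ID(); /* bumps __kmp_task_counter or ~0 */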
 
 /* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
 
-extern void __kmp_print_storage_map_gtid( int gtid, void *p1, void* p2, size_t size, char const *format, ... );
+extern void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2,
+                                         size_t size, char const *format, ...);
+
+extern void __kmp_serial_initialize(void);
+extern void __kmp_middle_initialize(void);
+extern void __kmp_parallel_initialize(void);
+
+extern void __kmp_internal_begin(void);
+extern void __kmp_internal_end_library(int gtid);
+extern void __kmp_internal_end_thread(int gtid);
+extern void __kmp_internal_end_atexit(void);
+extern void __kmp_internal_end_fini(void);
+extern void __kmp_internal_end_dtor(void);
+extern void __kmp_internal_end_dest(void *);
 
-extern void __kmp_serial_initialize( void );
-extern void __kmp_middle_initialize( void );
-extern void __kmp_parallel_initialize( void );
-
-extern void __kmp_internal_begin( void );
-extern void __kmp_internal_end_library( int gtid );
-extern void __kmp_internal_end_thread( int gtid );
-extern void __kmp_internal_end_atexit( void );
-extern void __kmp_internal_end_fini( void );
-extern void __kmp_internal_end_dtor( void );
-extern void __kmp_internal_end_dest( void* );
-
-extern int  __kmp_register_root( int initial_thread );
-extern void __kmp_unregister_root( int gtid );
+extern int __kmp_register_root(int initial_thread);
+extern void __kmp_unregister_root(int gtid);
 
-extern int  __kmp_ignore_mppbeg( void );
-extern int  __kmp_ignore_mppend( void );
+extern int __kmp_ignore_mppbeg(void);
+extern int __kmp_ignore_mppend(void);
 
-extern int  __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws );
-extern void __kmp_exit_single( int gtid );
+extern int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws);
+extern void __kmp_exit_single(int gtid);
 
-extern void __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref );
-extern void __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref );
+extern void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+extern void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
 
 #ifdef USE_LOAD_BALANCE
-extern int  __kmp_get_load_balance( int );
+extern int __kmp_get_load_balance(int);
 #endif
 
 #ifdef BUILD_TV
-extern void __kmp_tv_threadprivate_store( kmp_info_t *th, void *global_addr, void *thread_addr );
+extern void __kmp_tv_threadprivate_store(kmp_info_t *th, void *global_addr,
+                                         void *thread_addr);
 #endif
 
-extern int  __kmp_get_global_thread_id( void );
-extern int  __kmp_get_global_thread_id_reg( void );
-extern void __kmp_exit_thread( int exit_status );
-extern void __kmp_abort( char const * format, ... );
-extern void __kmp_abort_thread( void );
-extern void __kmp_abort_process( void );
-extern void __kmp_warn( char const * format, ... );
+extern int __kmp_get_global_thread_id(void);
+extern int __kmp_get_global_thread_id_reg(void);
+extern void __kmp_exit_thread(int exit_status);
+extern void __kmp_abort(char const *format, ...);
+extern void __kmp_abort_thread(void);
+extern void __kmp_abort_process(void);
+extern void __kmp_warn(char const *format, ...);
+
+extern void __kmp_set_num_threads(int new_nth, int gtid);
+
+// Returns current thread (pointer to kmp_info_t). Current thread *must* be
+// registered.
+static inline kmp_info_t *__kmp_entry_thread() {
+  int gtid = __kmp_entry_gtid();
 
-extern void __kmp_set_num_threads( int new_nth, int gtid );
-
-// Returns current thread (pointer to kmp_info_t). Current thread *must* be registered.
-static inline kmp_info_t * __kmp_entry_thread()
-{
-      int gtid = __kmp_entry_gtid();
-
-      return __kmp_threads[gtid];
+  return __kmp_threads[gtid];
 }
 
-extern void __kmp_set_max_active_levels( int gtid, int new_max_active_levels );
-extern int  __kmp_get_max_active_levels( int gtid );
-extern int  __kmp_get_ancestor_thread_num( int gtid, int level );
-extern int  __kmp_get_team_size( int gtid, int level );
-extern void __kmp_set_schedule( int gtid, kmp_sched_t new_sched, int chunk );
-extern void __kmp_get_schedule( int gtid, kmp_sched_t * sched, int * chunk );
-
-extern unsigned short __kmp_get_random( kmp_info_t * thread );
-extern void __kmp_init_random( kmp_info_t * thread );
-
-extern kmp_r_sched_t __kmp_get_schedule_global( void );
-extern void __kmp_adjust_num_threads( int new_nproc );
-
-extern void * ___kmp_allocate( size_t size KMP_SRC_LOC_DECL );
-extern void * ___kmp_page_allocate( size_t size KMP_SRC_LOC_DECL );
-extern void   ___kmp_free( void * ptr KMP_SRC_LOC_DECL );
-#define __kmp_allocate( size )      ___kmp_allocate( (size) KMP_SRC_LOC_CURR )
-#define __kmp_page_allocate( size ) ___kmp_page_allocate( (size) KMP_SRC_LOC_CURR )
-#define __kmp_free( ptr )           ___kmp_free( (ptr) KMP_SRC_LOC_CURR )
+extern void __kmp_set_max_active_levels(int gtid, int new_max_active_levels);
+extern int __kmp_get_max_active_levels(int gtid);
+extern int __kmp_get_ancestor_thread_num(int gtid, int level);
+extern int __kmp_get_team_size(int gtid, int level);
+extern void __kmp_set_schedule(int gtid, kmp_sched_t new_sched, int chunk);
+extern void __kmp_get_schedule(int gtid, kmp_sched_t *sched, int *chunk);
+
+extern unsigned short __kmp_get_random(kmp_info_t *thread);
+extern void __kmp_init_random(kmp_info_t *thread);
+
+extern kmp_r_sched_t __kmp_get_schedule_global(void);
+extern void __kmp_adjust_num_threads(int new_nproc);
+
+extern void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL);
+extern void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL);
+extern void ___kmp_free(void *ptr KMP_SRC_LOC_DECL);
+#define __kmp_allocate(size) ___kmp_allocate((size)KMP_SRC_LOC_CURR)
+#define __kmp_page_allocate(size) ___kmp_page_allocate((size)KMP_SRC_LOC_CURR)
+#define __kmp_free(ptr) ___kmp_free((ptr)KMP_SRC_LOC_CURR)
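
The wrappers above route the caller's source location through KMP_SRC_LOC_CURR
so debug builds can report which call site owns an allocation. A minimal,
hedged usage sketch (assumes the usual runtime build where kmp.h provides the
types; the 16-pointer scratch array is just an example):

    /* Grab a block from the internal allocator and hand it back. */
    kmp_info_t **scratch =
        (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * 16);
    /* ... use scratch ... */
    __kmp_free(scratch);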
 
 #if USE_FAST_MEMORY
-extern void * ___kmp_fast_allocate( kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL );
-extern void   ___kmp_fast_free( kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL );
-extern void   __kmp_free_fast_memory( kmp_info_t *this_thr );
-extern void   __kmp_initialize_fast_memory( kmp_info_t *this_thr );
-#define __kmp_fast_allocate( this_thr, size ) ___kmp_fast_allocate( (this_thr), (size) KMP_SRC_LOC_CURR )
-#define __kmp_fast_free( this_thr, ptr )      ___kmp_fast_free( (this_thr), (ptr) KMP_SRC_LOC_CURR )
-#endif
-
-extern void * ___kmp_thread_malloc( kmp_info_t *th, size_t size KMP_SRC_LOC_DECL );
-extern void * ___kmp_thread_calloc( kmp_info_t *th, size_t nelem, size_t elsize KMP_SRC_LOC_DECL );
-extern void * ___kmp_thread_realloc( kmp_info_t *th, void *ptr, size_t size KMP_SRC_LOC_DECL );
-extern void   ___kmp_thread_free( kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL );
-#define __kmp_thread_malloc(  th, size )          ___kmp_thread_malloc(  (th), (size)            KMP_SRC_LOC_CURR )
-#define __kmp_thread_calloc(  th, nelem, elsize ) ___kmp_thread_calloc(  (th), (nelem), (elsize) KMP_SRC_LOC_CURR )
-#define __kmp_thread_realloc( th, ptr, size )     ___kmp_thread_realloc( (th), (ptr), (size)     KMP_SRC_LOC_CURR )
-#define __kmp_thread_free(    th, ptr )           ___kmp_thread_free(    (th), (ptr)             KMP_SRC_LOC_CURR )
-
-#define KMP_INTERNAL_MALLOC(sz)    malloc(sz)
-#define KMP_INTERNAL_FREE(p)       free(p)
-#define KMP_INTERNAL_REALLOC(p,sz) realloc((p),(sz))
-#define KMP_INTERNAL_CALLOC(n,sz)  calloc((n),(sz))
-
-extern void __kmp_push_num_threads( ident_t *loc, int gtid, int num_threads );
-
-#if OMP_40_ENABLED
-extern void __kmp_push_proc_bind( ident_t *loc, int gtid, kmp_proc_bind_t proc_bind );
-extern void __kmp_push_num_teams( ident_t *loc, int gtid, int num_teams, int num_threads );
-#endif
-
-extern void __kmp_yield( int cond );
-
-extern void __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
-    kmp_int32 chunk );
-extern void __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
-    kmp_int32 chunk );
-extern void __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
-    kmp_int64 chunk );
-extern void __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
-    kmp_int64 chunk );
-
-extern int __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid,
-    kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st );
-extern int __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid,
-    kmp_int32 *p_last, kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st );
-extern int __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid,
-    kmp_int32 *p_last, kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st );
-extern int __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid,
-    kmp_int32 *p_last, kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st );
-
-extern void __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid );
-extern void __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid );
-extern void __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid );
-extern void __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid );
-
+extern void *___kmp_fast_allocate(kmp_info_t *this_thr,
+                                  size_t size KMP_SRC_LOC_DECL);
+extern void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL);
+extern void __kmp_free_fast_memory(kmp_info_t *this_thr);
+extern void __kmp_initialize_fast_memory(kmp_info_t *this_thr);
+#define __kmp_fast_allocate(this_thr, size)                                    \
+  ___kmp_fast_allocate((this_thr), (size)KMP_SRC_LOC_CURR)
+#define __kmp_fast_free(this_thr, ptr)                                         \
+  ___kmp_fast_free((this_thr), (ptr)KMP_SRC_LOC_CURR)
+#endif
+
+extern void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL);
+extern void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
+                                  size_t elsize KMP_SRC_LOC_DECL);
+extern void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
+                                   size_t size KMP_SRC_LOC_DECL);
+extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL);
+#define __kmp_thread_malloc(th, size)                                          \
+  ___kmp_thread_malloc((th), (size)KMP_SRC_LOC_CURR)
+#define __kmp_thread_calloc(th, nelem, elsize)                                 \
+  ___kmp_thread_calloc((th), (nelem), (elsize)KMP_SRC_LOC_CURR)
+#define __kmp_thread_realloc(th, ptr, size)                                    \
+  ___kmp_thread_realloc((th), (ptr), (size)KMP_SRC_LOC_CURR)
+#define __kmp_thread_free(th, ptr)                                             \
+  ___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR)
+
+#define KMP_INTERNAL_MALLOC(sz) malloc(sz)
+#define KMP_INTERNAL_FREE(p) free(p)
+#define KMP_INTERNAL_REALLOC(p, sz) realloc((p), (sz))
+#define KMP_INTERNAL_CALLOC(n, sz) calloc((n), (sz))
+
+extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads);
+
+#if OMP_40_ENABLED
+extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
+                                 kmp_proc_bind_t proc_bind);
+extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams,
+                                 int num_threads);
+#endif
+
+extern void __kmp_yield(int cond);
+
+extern void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                                   enum sched_type schedule, kmp_int32 lb,
+                                   kmp_int32 ub, kmp_int32 st, kmp_int32 chunk);
+extern void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                                    enum sched_type schedule, kmp_uint32 lb,
+                                    kmp_uint32 ub, kmp_int32 st,
+                                    kmp_int32 chunk);
+extern void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                                   enum sched_type schedule, kmp_int64 lb,
+                                   kmp_int64 ub, kmp_int64 st, kmp_int64 chunk);
+extern void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                                    enum sched_type schedule, kmp_uint64 lb,
+                                    kmp_uint64 ub, kmp_int64 st,
+                                    kmp_int64 chunk);
+
+extern int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid,
+                                  kmp_int32 *p_last, kmp_int32 *p_lb,
+                                  kmp_int32 *p_ub, kmp_int32 *p_st);
+extern int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 *p_last, kmp_uint32 *p_lb,
+                                   kmp_uint32 *p_ub, kmp_int32 *p_st);
+extern int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid,
+                                  kmp_int32 *p_last, kmp_int64 *p_lb,
+                                  kmp_int64 *p_ub, kmp_int64 *p_st);
+extern int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 *p_last, kmp_uint64 *p_lb,
+                                   kmp_uint64 *p_ub, kmp_int64 *p_st);
+
+extern void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid);
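
For orientation, the dispatch entry points above follow the usual init / next /
fini pattern that compiler-generated code uses to drive a dynamically scheduled
loop. A hedged sketch for the 32-bit signed flavor (loc, gtid and the loop body
are placeholders; real callers are compiler-emitted, and this sketch assumes
the fini calls are only needed for the ordered variants):

    void example_dynamic_loop(ident_t *loc, kmp_int32 gtid) {
      kmp_int32 lb, ub, st, last;
      __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                             /*lb=*/0, /*ub=*/999, /*st=*/1, /*chunk=*/8);
      while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
        for (kmp_int32 i = lb; i <= ub; i += st) {
          /* ... loop body for iteration i ... */
        }
      }
    }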
 
 #ifdef KMP_GOMP_COMPAT
 
-extern void __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
-    kmp_int32 chunk, int push_ws );
-extern void __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
-    kmp_int32 chunk, int push_ws );
-extern void __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
-    kmp_int64 chunk, int push_ws );
-extern void __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid,
-    enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
-    kmp_int64 chunk, int push_ws );
-extern void __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid );
-extern void __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid );
-extern void __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid );
-extern void __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid );
+extern void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                                      enum sched_type schedule, kmp_int32 lb,
+                                      kmp_int32 ub, kmp_int32 st,
+                                      kmp_int32 chunk, int push_ws);
+extern void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                                       enum sched_type schedule, kmp_uint32 lb,
+                                       kmp_uint32 ub, kmp_int32 st,
+                                       kmp_int32 chunk, int push_ws);
+extern void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                                      enum sched_type schedule, kmp_int64 lb,
+                                      kmp_int64 ub, kmp_int64 st,
+                                      kmp_int64 chunk, int push_ws);
+extern void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                                       enum sched_type schedule, kmp_uint64 lb,
+                                       kmp_uint64 ub, kmp_int64 st,
+                                       kmp_int64 chunk, int push_ws);
+extern void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid);
+extern void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid);
+extern void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid);
+extern void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid);
 
 #endif /* KMP_GOMP_COMPAT */
 
-
-extern kmp_uint32 __kmp_eq_4(  kmp_uint32 value, kmp_uint32 checker );
-extern kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker );
-extern kmp_uint32 __kmp_lt_4(  kmp_uint32 value, kmp_uint32 checker );
-extern kmp_uint32 __kmp_ge_4(  kmp_uint32 value, kmp_uint32 checker );
-extern kmp_uint32 __kmp_le_4(  kmp_uint32 value, kmp_uint32 checker );
-extern kmp_uint32 __kmp_wait_yield_4( kmp_uint32 volatile * spinner, kmp_uint32 checker, kmp_uint32 (*pred) (kmp_uint32, kmp_uint32), void * obj );
-extern void __kmp_wait_yield_4_ptr( void * spinner, kmp_uint32 checker, kmp_uint32 (* pred)( void *, kmp_uint32 ), void * obj );
+extern kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_wait_yield_4(kmp_uint32 volatile *spinner,
+                                     kmp_uint32 checker,
+                                     kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+                                     void *obj);
+extern void __kmp_wait_yield_4_ptr(void *spinner, kmp_uint32 checker,
+                                   kmp_uint32 (*pred)(void *, kmp_uint32),
+                                   void *obj);
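
The spin-wait helpers just above combine a volatile 32-bit location with one of
the small predicate functions (__kmp_eq_4, __kmp_ge_4, ...). A hedged sketch
(example_go is a hypothetical flag released by a peer thread; passing NULL for
obj is assumed to be acceptable when no ITT sync object is involved):

    static volatile kmp_uint32 example_go = 0;

    void example_wait_for_go(void) {
      /* Spin, yielding as configured, until example_go == 1. */
      __kmp_wait_yield_4(&example_go, 1, __kmp_eq_4, NULL);
    }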
 
 class kmp_flag_32;
 class kmp_flag_64;
 class kmp_flag_oncore;
-extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin
+extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag,
+                          int final_spin
 #if USE_ITT_BUILD
-                   , void * itt_sync_obj
+                          ,
+                          void *itt_sync_obj
 #endif
-                   );
+                          );
 extern void __kmp_release_64(kmp_flag_64 *flag);
 
-extern void __kmp_infinite_loop( void );
+extern void __kmp_infinite_loop(void);
 
-extern void __kmp_cleanup( void );
+extern void __kmp_cleanup(void);
 
 #if KMP_HANDLE_SIGNALS
-    extern int  __kmp_handle_signals;
-    extern void __kmp_install_signals( int parallel_init );
-    extern void __kmp_remove_signals( void );
+extern int __kmp_handle_signals;
+extern void __kmp_install_signals(int parallel_init);
+extern void __kmp_remove_signals(void);
 #endif
 
-extern void __kmp_clear_system_time( void );
-extern void __kmp_read_system_time( double *delta );
+extern void __kmp_clear_system_time(void);
+extern void __kmp_read_system_time(double *delta);
 
-extern void __kmp_check_stack_overlap( kmp_info_t *thr );
+extern void __kmp_check_stack_overlap(kmp_info_t *thr);
 
-extern void __kmp_expand_host_name( char *buffer, size_t size );
-extern void __kmp_expand_file_name( char *result, size_t rlen, char *pattern );
+extern void __kmp_expand_host_name(char *buffer, size_t size);
+extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);
 
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-extern void __kmp_initialize_system_tick( void );  /* Initialize timer tick value */
+extern void
+__kmp_initialize_system_tick(void); /* Initialize timer tick value */
 #endif
 
-extern void __kmp_runtime_initialize( void );  /* machine specific initialization */
-extern void __kmp_runtime_destroy( void );
+extern void
+__kmp_runtime_initialize(void); /* machine specific initialization */
+extern void __kmp_runtime_destroy(void);
 
 #if KMP_AFFINITY_SUPPORTED
-extern char *__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask);
+extern char *__kmp_affinity_print_mask(char *buf, int buf_len,
+                                       kmp_affin_mask_t *mask);
 extern void __kmp_affinity_initialize(void);
 extern void __kmp_affinity_uninitialize(void);
-extern void __kmp_affinity_set_init_mask(int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */
+extern void __kmp_affinity_set_init_mask(
+    int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */
 #if OMP_40_ENABLED
 extern void __kmp_affinity_set_place(int gtid);
 #endif
-extern void __kmp_affinity_determine_capable( const char *env_var );
+extern void __kmp_affinity_determine_capable(const char *env_var);
 extern int __kmp_aux_set_affinity(void **mask);
 extern int __kmp_aux_get_affinity(void **mask);
 extern int __kmp_aux_get_affinity_max_proc();
 extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
-extern void __kmp_balanced_affinity( int tid, int team_size );
+extern void __kmp_balanced_affinity(int tid, int team_size);
 #endif /* KMP_AFFINITY_SUPPORTED */
 
 extern void __kmp_cleanup_hierarchy();
@@ -3086,208 +3273,226 @@ extern void __kmp_get_hierarchy(kmp_uint
 
 #if KMP_USE_FUTEX
 
-extern int __kmp_futex_determine_capable( void );
+extern int __kmp_futex_determine_capable(void);
 
 #endif // KMP_USE_FUTEX
 
-extern void __kmp_gtid_set_specific( int gtid );
-extern int  __kmp_gtid_get_specific( void );
+extern void __kmp_gtid_set_specific(int gtid);
+extern int __kmp_gtid_get_specific(void);
 
-extern double __kmp_read_cpu_time( void );
+extern double __kmp_read_cpu_time(void);
 
-extern int  __kmp_read_system_info( struct kmp_sys_info *info );
+extern int __kmp_read_system_info(struct kmp_sys_info *info);
 
 #if KMP_USE_MONITOR
-extern void __kmp_create_monitor( kmp_info_t *th );
+extern void __kmp_create_monitor(kmp_info_t *th);
 #endif
 
-extern void *__kmp_launch_thread( kmp_info_t *thr );
+extern void *__kmp_launch_thread(kmp_info_t *thr);
 
-extern void __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size );
+extern void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size);
 
 #if KMP_OS_WINDOWS
-extern int  __kmp_still_running(kmp_info_t *th);
-extern int  __kmp_is_thread_alive( kmp_info_t * th, DWORD *exit_val );
-extern void __kmp_free_handle( kmp_thread_t tHandle );
+extern int __kmp_still_running(kmp_info_t *th);
+extern int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val);
+extern void __kmp_free_handle(kmp_thread_t tHandle);
 #endif
 
 #if KMP_USE_MONITOR
-extern void __kmp_reap_monitor( kmp_info_t *th );
+extern void __kmp_reap_monitor(kmp_info_t *th);
 #endif
-extern void __kmp_reap_worker( kmp_info_t *th );
-extern void __kmp_terminate_thread( int gtid );
+extern void __kmp_reap_worker(kmp_info_t *th);
+extern void __kmp_terminate_thread(int gtid);
 
-extern void __kmp_suspend_32( int th_gtid, kmp_flag_32 *flag );
-extern void __kmp_suspend_64( int th_gtid, kmp_flag_64 *flag );
-extern void __kmp_suspend_oncore( int th_gtid, kmp_flag_oncore *flag );
-extern void __kmp_resume_32( int target_gtid, kmp_flag_32 *flag );
-extern void __kmp_resume_64( int target_gtid, kmp_flag_64 *flag );
-extern void __kmp_resume_oncore( int target_gtid, kmp_flag_oncore *flag );
+extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag);
+extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag);
+extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag);
+extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag);
+extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag);
+extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag);
 
-extern void __kmp_elapsed( double * );
-extern void __kmp_elapsed_tick( double * );
+extern void __kmp_elapsed(double *);
+extern void __kmp_elapsed_tick(double *);
 
-extern void __kmp_enable( int old_state );
-extern void __kmp_disable( int *old_state );
+extern void __kmp_enable(int old_state);
+extern void __kmp_disable(int *old_state);
 
-extern void __kmp_thread_sleep( int millis );
+extern void __kmp_thread_sleep(int millis);
 
-extern void __kmp_common_initialize( void );
-extern void __kmp_common_destroy( void );
-extern void __kmp_common_destroy_gtid( int gtid );
+extern void __kmp_common_initialize(void);
+extern void __kmp_common_destroy(void);
+extern void __kmp_common_destroy_gtid(int gtid);
 
 #if KMP_OS_UNIX
-extern void __kmp_register_atfork( void );
+extern void __kmp_register_atfork(void);
 #endif
-extern void __kmp_suspend_initialize( void );
-extern void __kmp_suspend_uninitialize_thread( kmp_info_t *th );
+extern void __kmp_suspend_initialize(void);
+extern void __kmp_suspend_uninitialize_thread(kmp_info_t *th);
 
-extern kmp_info_t * __kmp_allocate_thread( kmp_root_t *root,
-                                           kmp_team_t *team, int tid);
+extern kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
+                                         int tid);
 #if OMP_40_ENABLED
-extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+extern kmp_team_t *
+__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 #if OMPT_SUPPORT
-                                         ompt_parallel_id_t ompt_parallel_id,
+                    ompt_parallel_id_t ompt_parallel_id,
 #endif
-                                         kmp_proc_bind_t proc_bind,
-                                         kmp_internal_control_t *new_icvs,
-                                         int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
+                    kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs,
+                    int argc USE_NESTED_HOT_ARG(kmp_info_t *thr));
 #else
-extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+extern kmp_team_t *
+__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 #if OMPT_SUPPORT
-                                         ompt_parallel_id_t ompt_parallel_id,
+                    ompt_parallel_id_t ompt_parallel_id,
 #endif
-                                         kmp_internal_control_t *new_icvs,
-                                         int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
+                    kmp_internal_control_t *new_icvs,
+                    int argc USE_NESTED_HOT_ARG(kmp_info_t *thr));
 #endif // OMP_40_ENABLED
-extern void __kmp_free_thread( kmp_info_t * );
-extern void __kmp_free_team( kmp_root_t *, kmp_team_t *  USE_NESTED_HOT_ARG(kmp_info_t *) );
-extern kmp_team_t * __kmp_reap_team( kmp_team_t * );
+extern void __kmp_free_thread(kmp_info_t *);
+extern void __kmp_free_team(kmp_root_t *,
+                            kmp_team_t *USE_NESTED_HOT_ARG(kmp_info_t *));
+extern kmp_team_t *__kmp_reap_team(kmp_team_t *);
 
 /* ------------------------------------------------------------------------ */
 
-extern void __kmp_initialize_bget( kmp_info_t *th );
-extern void __kmp_finalize_bget( kmp_info_t *th );
+extern void __kmp_initialize_bget(kmp_info_t *th);
+extern void __kmp_finalize_bget(kmp_info_t *th);
 
-KMP_EXPORT void *kmpc_malloc( size_t size );
-KMP_EXPORT void *kmpc_aligned_malloc( size_t size, size_t alignment );
-KMP_EXPORT void *kmpc_calloc( size_t nelem, size_t elsize );
-KMP_EXPORT void *kmpc_realloc( void *ptr, size_t size );
-KMP_EXPORT void  kmpc_free( void *ptr );
+KMP_EXPORT void *kmpc_malloc(size_t size);
+KMP_EXPORT void *kmpc_aligned_malloc(size_t size, size_t alignment);
+KMP_EXPORT void *kmpc_calloc(size_t nelem, size_t elsize);
+KMP_EXPORT void *kmpc_realloc(void *ptr, size_t size);
+KMP_EXPORT void kmpc_free(void *ptr);
 
-/* ------------------------------------------------------------------------ */
 /* declarations for internal use */
 
-extern int  __kmp_barrier( enum barrier_type bt, int gtid, int is_split,
-                           size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) );
-extern void __kmp_end_split_barrier ( enum barrier_type bt, int gtid );
+extern int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
+                         size_t reduce_size, void *reduce_data,
+                         void (*reduce)(void *, void *));
+extern void __kmp_end_split_barrier(enum barrier_type bt, int gtid);
 
 /*!
- * Tell the fork call which compiler generated the fork call, and therefore how to deal with the call.
+ * Tell the fork call which compiler generated the fork call, and therefore how
+ * to deal with the call.
  */
-enum fork_context_e
-{
-    fork_context_gnu,                           /**< Called from GNU generated code, so must not invoke the microtask internally. */
-    fork_context_intel,                         /**< Called from Intel generated code.  */
-    fork_context_last
+enum fork_context_e {
+  fork_context_gnu, /**< Called from GNU generated code, so must not invoke the
+                       microtask internally. */
+  fork_context_intel, /**< Called from Intel generated code.  */
+  fork_context_last
 };
-extern int __kmp_fork_call( ident_t *loc, int gtid, enum fork_context_e fork_context,
-  kmp_int32 argc,
+extern int __kmp_fork_call(ident_t *loc, int gtid,
+                           enum fork_context_e fork_context, kmp_int32 argc,
 #if OMPT_SUPPORT
-  void *unwrapped_task,
+                           void *unwrapped_task,
 #endif
-  microtask_t microtask, launch_t invoker,
+                           microtask_t microtask, launch_t invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX
-                             va_list *ap
+                           va_list *ap
 #else
-                             va_list ap
+                           va_list ap
 #endif
-                             );
+                           );
 
-extern void __kmp_join_call( ident_t *loc, int gtid
+extern void __kmp_join_call(ident_t *loc, int gtid
 #if OMPT_SUPPORT
-                           , enum fork_context_e fork_context
+                            ,
+                            enum fork_context_e fork_context
 #endif
 #if OMP_40_ENABLED
-                           , int exit_teams = 0
+                            ,
+                            int exit_teams = 0
 #endif
-                           );
+                            );
 
 extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid);
-extern void __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team );
-extern void __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team );
-extern int __kmp_invoke_task_func( int gtid );
-extern void __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team );
-extern void __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team );
+extern void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team);
+extern void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team);
+extern int __kmp_invoke_task_func(int gtid);
+extern void __kmp_run_before_invoked_task(int gtid, int tid,
+                                          kmp_info_t *this_thr,
+                                          kmp_team_t *team);
+extern void __kmp_run_after_invoked_task(int gtid, int tid,
+                                         kmp_info_t *this_thr,
+                                         kmp_team_t *team);
 
 // should never have been exported
-KMP_EXPORT int __kmpc_invoke_task_func( int gtid );
+KMP_EXPORT int __kmpc_invoke_task_func(int gtid);
 #if OMP_40_ENABLED
-extern int  __kmp_invoke_teams_master( int gtid );
-extern void __kmp_teams_master( int gtid );
+extern int __kmp_invoke_teams_master(int gtid);
+extern void __kmp_teams_master(int gtid);
 #endif
-extern void __kmp_save_internal_controls( kmp_info_t * thread );
-extern void __kmp_user_set_library (enum library_type arg);
-extern void __kmp_aux_set_library (enum library_type arg);
-extern void __kmp_aux_set_stacksize( size_t arg);
-extern void __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid);
-extern void __kmp_aux_set_defaults( char const * str, int len );
+extern void __kmp_save_internal_controls(kmp_info_t *thread);
+extern void __kmp_user_set_library(enum library_type arg);
+extern void __kmp_aux_set_library(enum library_type arg);
+extern void __kmp_aux_set_stacksize(size_t arg);
+extern void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid);
+extern void __kmp_aux_set_defaults(char const *str, int len);
 
 /* Functions called from __kmp_aux_env_initialize() in kmp_settings.cpp */
-void kmpc_set_blocktime (int arg);
-void ompc_set_nested( int flag );
-void ompc_set_dynamic( int flag );
-void ompc_set_num_threads( int arg );
-
-extern void __kmp_push_current_task_to_thread( kmp_info_t *this_thr,
-                  kmp_team_t *team, int tid );
-extern void __kmp_pop_current_task_from_thread( kmp_info_t *this_thr );
-extern kmp_task_t* __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid,
-  kmp_tasking_flags_t *flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
-  kmp_routine_entry_t task_entry );
-extern void __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr,
-                  kmp_team_t *team, int tid, int set_curr_task );
+void kmpc_set_blocktime(int arg);
+void ompc_set_nested(int flag);
+void ompc_set_dynamic(int flag);
+void ompc_set_num_threads(int arg);
+
+extern void __kmp_push_current_task_to_thread(kmp_info_t *this_thr,
+                                              kmp_team_t *team, int tid);
+extern void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr);
+extern kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_tasking_flags_t *flags,
+                                    size_t sizeof_kmp_task_t,
+                                    size_t sizeof_shareds,
+                                    kmp_routine_entry_t task_entry);
+extern void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
+                                     kmp_team_t *team, int tid,
+                                     int set_curr_task);
 extern void __kmp_finish_implicit_task(kmp_info_t *this_thr);
 extern void __kmp_free_implicit_task(kmp_info_t *this_thr);
-
-int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
+int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid,
+                           kmp_flag_32 *flag, int final_spin,
                            int *thread_finished,
 #if USE_ITT_BUILD
-                           void * itt_sync_obj,
+                           void *itt_sync_obj,
 #endif /* USE_ITT_BUILD */
                            kmp_int32 is_constrained);
-int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
+int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
+                           kmp_flag_64 *flag, int final_spin,
                            int *thread_finished,
 #if USE_ITT_BUILD
-                           void * itt_sync_obj,
+                           void *itt_sync_obj,
 #endif /* USE_ITT_BUILD */
                            kmp_int32 is_constrained);
-int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
+int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
+                               kmp_flag_oncore *flag, int final_spin,
                                int *thread_finished,
 #if USE_ITT_BUILD
-                               void * itt_sync_obj,
+                               void *itt_sync_obj,
 #endif /* USE_ITT_BUILD */
                                kmp_int32 is_constrained);
 
-extern void __kmp_free_task_team( kmp_info_t *thread, kmp_task_team_t *task_team );
-extern void __kmp_reap_task_teams( void );
-extern void __kmp_wait_to_unref_task_teams( void );
-extern void __kmp_task_team_setup ( kmp_info_t *this_thr, kmp_team_t *team, int always );
-extern void __kmp_task_team_sync  ( kmp_info_t *this_thr, kmp_team_t *team );
-extern void __kmp_task_team_wait  ( kmp_info_t *this_thr, kmp_team_t *team
+extern void __kmp_free_task_team(kmp_info_t *thread,
+                                 kmp_task_team_t *task_team);
+extern void __kmp_reap_task_teams(void);
+extern void __kmp_wait_to_unref_task_teams(void);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
+                                  int always);
+extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
+extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
 #if USE_ITT_BUILD
-                                    , void * itt_sync_obj
+                                 ,
+                                 void *itt_sync_obj
 #endif /* USE_ITT_BUILD */
-                                    , int wait=1
-);
-extern void __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid );
+                                 ,
+                                 int wait = 1);
+extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
+                                  int gtid);
 
-extern int  __kmp_is_address_mapped( void *addr );
+extern int __kmp_is_address_mapped(void *addr);
 extern kmp_uint64 __kmp_hardware_timestamp(void);
 
 #if KMP_OS_UNIX
-extern int  __kmp_read_from_file( char const *path, char const *format, ... );
+extern int __kmp_read_from_file(char const *path, char const *format, ...);
 #endif
 
 /* ------------------------------------------------------------------------ */
@@ -3297,127 +3502,145 @@ extern int  __kmp_read_from_file( char c
 
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 
-extern void       __kmp_query_cpuid( kmp_cpuinfo_t *p );
+extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
 
 #define __kmp_load_mxcsr(p) _mm_setcsr(*(p))
-static inline void __kmp_store_mxcsr( kmp_uint32 *p ) { *p = _mm_getcsr(); }
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
 
-extern void __kmp_load_x87_fpu_control_word( kmp_int16 *p );
-extern void __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+extern void __kmp_load_x87_fpu_control_word(kmp_int16 *p);
+extern void __kmp_store_x87_fpu_control_word(kmp_int16 *p);
 extern void __kmp_clear_x87_fpu_status_word();
-# define KMP_X86_MXCSR_MASK      0xffffffc0   /* ignore status flags (6 lsb) */
+#define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */
 
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
-extern int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr, int argc, void *argv[]
+extern int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int npr, int argc,
+                                  void *argv[]
 #if OMPT_SUPPORT
-                                   , void **exit_frame_ptr
+                                  ,
+                                  void **exit_frame_ptr
 #endif
-);
-
+                                  );
 
 /* ------------------------------------------------------------------------ */
 
-KMP_EXPORT void   __kmpc_begin                ( ident_t *, kmp_int32 flags );
-KMP_EXPORT void   __kmpc_end                  ( ident_t * );
-
-KMP_EXPORT void   __kmpc_threadprivate_register_vec ( ident_t *, void * data, kmpc_ctor_vec ctor,
-                                                  kmpc_cctor_vec cctor, kmpc_dtor_vec dtor, size_t vector_length );
-KMP_EXPORT void   __kmpc_threadprivate_register     ( ident_t *, void * data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor );
-KMP_EXPORT void * __kmpc_threadprivate              ( ident_t *, kmp_int32 global_tid, void * data, size_t size );
-
-KMP_EXPORT kmp_int32  __kmpc_global_thread_num  ( ident_t * );
-KMP_EXPORT kmp_int32  __kmpc_global_num_threads ( ident_t * );
-KMP_EXPORT kmp_int32  __kmpc_bound_thread_num   ( ident_t * );
-KMP_EXPORT kmp_int32  __kmpc_bound_num_threads  ( ident_t * );
-
-KMP_EXPORT kmp_int32  __kmpc_ok_to_fork     ( ident_t * );
-KMP_EXPORT void   __kmpc_fork_call          ( ident_t *, kmp_int32 nargs, kmpc_micro microtask, ... );
-
-KMP_EXPORT void   __kmpc_serialized_parallel     ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_end_serialized_parallel ( ident_t *, kmp_int32 global_tid );
-
-KMP_EXPORT void   __kmpc_flush              ( ident_t *);
-KMP_EXPORT void   __kmpc_barrier            ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT kmp_int32  __kmpc_master         ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_end_master         ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_ordered            ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_end_ordered        ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_critical           ( ident_t *, kmp_int32 global_tid, kmp_critical_name * );
-KMP_EXPORT void   __kmpc_end_critical       ( ident_t *, kmp_int32 global_tid, kmp_critical_name * );
+KMP_EXPORT void __kmpc_begin(ident_t *, kmp_int32 flags);
+KMP_EXPORT void __kmpc_end(ident_t *);
+
+KMP_EXPORT void __kmpc_threadprivate_register_vec(ident_t *, void *data,
+                                                  kmpc_ctor_vec ctor,
+                                                  kmpc_cctor_vec cctor,
+                                                  kmpc_dtor_vec dtor,
+                                                  size_t vector_length);
+KMP_EXPORT void __kmpc_threadprivate_register(ident_t *, void *data,
+                                              kmpc_ctor ctor, kmpc_cctor cctor,
+                                              kmpc_dtor dtor);
+KMP_EXPORT void *__kmpc_threadprivate(ident_t *, kmp_int32 global_tid,
+                                      void *data, size_t size);
+
+KMP_EXPORT kmp_int32 __kmpc_global_thread_num(ident_t *);
+KMP_EXPORT kmp_int32 __kmpc_global_num_threads(ident_t *);
+KMP_EXPORT kmp_int32 __kmpc_bound_thread_num(ident_t *);
+KMP_EXPORT kmp_int32 __kmpc_bound_num_threads(ident_t *);
+
+KMP_EXPORT kmp_int32 __kmpc_ok_to_fork(ident_t *);
+KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs,
+                                 kmpc_micro microtask, ...);
+
+KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid);
+
+KMP_EXPORT void __kmpc_flush(ident_t *);
+KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid,
+                                kmp_critical_name *);
+KMP_EXPORT void __kmpc_end_critical(ident_t *, kmp_int32 global_tid,
+                                    kmp_critical_name *);
 
 #if OMP_45_ENABLED
-KMP_EXPORT void   __kmpc_critical_with_hint ( ident_t *, kmp_int32 global_tid, kmp_critical_name *, uintptr_t hint );
+KMP_EXPORT void __kmpc_critical_with_hint(ident_t *, kmp_int32 global_tid,
+                                          kmp_critical_name *, uintptr_t hint);
 #endif
 
-KMP_EXPORT kmp_int32  __kmpc_barrier_master ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_end_barrier_master ( ident_t *, kmp_int32 global_tid );
-
-KMP_EXPORT kmp_int32  __kmpc_barrier_master_nowait ( ident_t *, kmp_int32 global_tid );
-
-KMP_EXPORT kmp_int32  __kmpc_single         ( ident_t *, kmp_int32 global_tid );
-KMP_EXPORT void   __kmpc_end_single         ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT kmp_int32 __kmpc_barrier_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_barrier_master(ident_t *, kmp_int32 global_tid);
 
-KMP_EXPORT void KMPC_FOR_STATIC_INIT    ( ident_t *loc, kmp_int32 global_tid, kmp_int32 schedtype, kmp_int32 *plastiter,
-                                          kmp_int *plower, kmp_int *pupper, kmp_int *pstride, kmp_int incr, kmp_int chunk );
-
-KMP_EXPORT void __kmpc_for_static_fini  ( ident_t *loc, kmp_int32 global_tid );
-
-KMP_EXPORT void __kmpc_copyprivate( ident_t *loc, kmp_int32 global_tid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit );
-
-extern void KMPC_SET_NUM_THREADS        ( int arg );
-extern void KMPC_SET_DYNAMIC            ( int flag );
-extern void KMPC_SET_NESTED             ( int flag );
-
-/* --------------------------------------------------------------------------- */
-
-/*
- * Taskq interface routines
- */
+KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *,
+                                                  kmp_int32 global_tid);
+
+KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
+
+KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid,
+                                     kmp_int32 schedtype, kmp_int32 *plastiter,
+                                     kmp_int *plower, kmp_int *pupper,
+                                     kmp_int *pstride, kmp_int incr,
+                                     kmp_int chunk);
+
+KMP_EXPORT void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
+
+KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
+                                   size_t cpy_size, void *cpy_data,
+                                   void (*cpy_func)(void *, void *),
+                                   kmp_int32 didit);
+
+extern void KMPC_SET_NUM_THREADS(int arg);
+extern void KMPC_SET_DYNAMIC(int flag);
+extern void KMPC_SET_NESTED(int flag);
+
+/* Taskq interface routines */
+KMP_EXPORT kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
+                                      kmpc_task_t taskq_task,
+                                      size_t sizeof_thunk,
+                                      size_t sizeof_shareds, kmp_int32 flags,
+                                      kmpc_shared_vars_t **shareds);
+KMP_EXPORT void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
+                                 kmpc_thunk_t *thunk);
+KMP_EXPORT kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid,
+                                 kmpc_thunk_t *thunk);
+KMP_EXPORT void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid,
+                                  kmpc_thunk_t *thunk, kmp_int32 status);
+KMP_EXPORT void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
+                                      kmpc_thunk_t *thunk);
+KMP_EXPORT kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
+                                            kmpc_thunk_t *taskq_thunk,
+                                            kmpc_task_t task);
+
+/* OMP 3.0 tasking interface routines */
+KMP_EXPORT kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
+                                     kmp_task_t *new_task);
+KMP_EXPORT kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                             kmp_int32 flags,
+                                             size_t sizeof_kmp_task_t,
+                                             size_t sizeof_shareds,
+                                             kmp_routine_entry_t task_entry);
+KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
+                                          kmp_task_t *task);
+KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
+                                             kmp_task_t *task);
+KMP_EXPORT kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
+                                           kmp_task_t *new_task);
+KMP_EXPORT kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid);
 
-KMP_EXPORT kmpc_thunk_t * __kmpc_taskq (ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task, size_t sizeof_thunk,
-                                        size_t sizeof_shareds, kmp_int32 flags, kmpc_shared_vars_t **shareds);
-KMP_EXPORT void __kmpc_end_taskq (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk);
-KMP_EXPORT kmp_int32 __kmpc_task (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk);
-KMP_EXPORT void __kmpc_taskq_task (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status);
-KMP_EXPORT void __kmpc_end_taskq_task (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk);
-KMP_EXPORT kmpc_thunk_t * __kmpc_task_buffer (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task);
-
-/* ------------------------------------------------------------------------ */
-
-/*
- * OMP 3.0 tasking interface routines
- */
-
-KMP_EXPORT kmp_int32
-__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
-KMP_EXPORT kmp_task_t*
-__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
-                       size_t sizeof_kmp_task_t, size_t sizeof_shareds,
-                       kmp_routine_entry_t task_entry );
-KMP_EXPORT void
-__kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task );
-KMP_EXPORT void
-__kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task );
-KMP_EXPORT kmp_int32
-__kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
-KMP_EXPORT kmp_int32
-__kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid );
-
-KMP_EXPORT kmp_int32
-__kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part );
+KMP_EXPORT kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid,
+                                          int end_part);
 
 #if TASK_UNUSED
-void __kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task );
-void __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task );
+void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task);
+void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
+                              kmp_task_t *task);
 #endif // TASK_UNUSED
 
 /* ------------------------------------------------------------------------ */
 
 #if OMP_40_ENABLED
 
-KMP_EXPORT void __kmpc_taskgroup( ident_t * loc, int gtid );
-KMP_EXPORT void __kmpc_end_taskgroup( ident_t * loc, int gtid );
+KMP_EXPORT void __kmpc_taskgroup(ident_t *loc, int gtid);
+KMP_EXPORT void __kmpc_end_taskgroup(ident_t *loc, int gtid);
 
 KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps,
@@ -3432,154 +3655,169 @@ extern void __kmp_release_deps(kmp_int32
 extern void __kmp_dephash_free_entries(kmp_info_t *thread, kmp_dephash_t *h);
 extern void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h);
 
-extern kmp_int32 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate );
+extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
+                                bool serialize_immediate);
 
-KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind);
-KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind);
-KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t* loc_ref, kmp_int32 gtid);
+KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid,
+                                   kmp_int32 cncl_kind);
+KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid,
+                                              kmp_int32 cncl_kind);
+KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t *loc_ref, kmp_int32 gtid);
 KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
 
 #if OMP_45_ENABLED
 
-KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
-KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
-KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val,
-                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup );
+KMP_EXPORT void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask);
+KMP_EXPORT void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask);
+KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task,
+                                kmp_int32 if_val, kmp_uint64 *lb,
+                                kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup,
+                                kmp_int32 sched, kmp_uint64 grainsize,
+                                void *task_dup);
 #endif
 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
 #if OMP_45_ENABLED
-KMP_EXPORT void* __kmpc_task_reduction_init(int gtid, int num_data, void *data);
-KMP_EXPORT void* __kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d);
+KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data);
+KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d);
 #endif
 
 #endif
 
-
-/*
- * Lock interface routines (fast versions with gtid passed in)
- */
-KMP_EXPORT void __kmpc_init_lock( ident_t *loc, kmp_int32 gtid,  void **user_lock );
-KMP_EXPORT void __kmpc_init_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT void __kmpc_destroy_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT void __kmpc_destroy_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT void __kmpc_set_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT void __kmpc_set_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT void __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT void __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT int __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
-KMP_EXPORT int __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+/* Lock interface routines (fast versions with gtid passed in) */
+KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid,
+                                 void **user_lock);
+KMP_EXPORT void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                      void **user_lock);
+KMP_EXPORT void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid,
+                                    void **user_lock);
+KMP_EXPORT void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                         void **user_lock);
+KMP_EXPORT void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock);
+KMP_EXPORT void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                     void **user_lock);
+KMP_EXPORT void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid,
+                                  void **user_lock);
+KMP_EXPORT void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                       void **user_lock);
+KMP_EXPORT int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock);
+KMP_EXPORT int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                     void **user_lock);
 
 #if OMP_45_ENABLED
-KMP_EXPORT void __kmpc_init_lock_with_hint( ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint );
-KMP_EXPORT void __kmpc_init_nest_lock_with_hint( ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint );
-#endif
-
-/* ------------------------------------------------------------------------ */
-
-/*
- * Interface to fast scalable reduce methods routines
- */
-
-KMP_EXPORT kmp_int32 __kmpc_reduce_nowait( ident_t *loc, kmp_int32 global_tid,
-                                           kmp_int32 num_vars, size_t reduce_size,
-                                           void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
-                                           kmp_critical_name *lck );
-KMP_EXPORT void __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck );
-KMP_EXPORT kmp_int32 __kmpc_reduce( ident_t *loc, kmp_int32 global_tid,
-                                    kmp_int32 num_vars, size_t reduce_size,
-                                    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
-                                    kmp_critical_name *lck );
-KMP_EXPORT void __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck );
-
-/*
- * internal fast reduction routines
- */
-
-extern PACKED_REDUCTION_METHOD_T
-__kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
-                                  kmp_int32 num_vars, size_t reduce_size,
-                                  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
-                                  kmp_critical_name *lck );
+KMP_EXPORT void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid,
+                                           void **user_lock, uintptr_t hint);
+KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
+                                                void **user_lock,
+                                                uintptr_t hint);
+#endif
+
+/* Interface to fast scalable reduce methods routines */
+
+KMP_EXPORT kmp_int32 __kmpc_reduce_nowait(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck);
+KMP_EXPORT void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
+                                         kmp_critical_name *lck);
+KMP_EXPORT kmp_int32 __kmpc_reduce(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck);
+KMP_EXPORT void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
+                                  kmp_critical_name *lck);
+
+/* Internal fast reduction routines */
+
+extern PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck);
 
 // this function is for testing set/get/determine reduce method
-KMP_EXPORT kmp_int32 __kmp_get_reduce_method( void );
+KMP_EXPORT kmp_int32 __kmp_get_reduce_method(void);
 
 KMP_EXPORT kmp_uint64 __kmpc_get_taskid();
 KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid();
 
-/* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-
 // C++ port
 // missing 'extern "C"' declarations
 
-KMP_EXPORT kmp_int32 __kmpc_in_parallel( ident_t *loc );
-KMP_EXPORT void __kmpc_pop_num_threads(  ident_t *loc, kmp_int32 global_tid );
-KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads );
+KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
+KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
+                                        kmp_int32 num_threads);
 
 #if OMP_40_ENABLED
-KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
-KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
-KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
+KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
+                                      int proc_bind);
+KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
+                                      kmp_int32 num_teams,
+                                      kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc,
+                                  kmpc_micro microtask, ...);
 #endif
 #if OMP_45_ENABLED
-struct kmp_dim {  // loop bounds info casted to kmp_int64
-    kmp_int64 lo; // lower
-    kmp_int64 up; // upper
-    kmp_int64 st; // stride
-};
-KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32 num_dims, struct kmp_dim * dims);
-KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
-KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+struct kmp_dim { // loop bounds info casted to kmp_int64
+  kmp_int64 lo; // lower
+  kmp_int64 up; // upper
+  kmp_int64 st; // stride
+};
+KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
+                                     kmp_int32 num_dims, struct kmp_dim *dims);
+KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid,
+                                     kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid,
+                                     kmp_int64 *vec);
 KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
 #endif
 
-KMP_EXPORT void*
-__kmpc_threadprivate_cached( ident_t * loc, kmp_int32 global_tid,
-                             void * data, size_t size, void *** cache );
+KMP_EXPORT void *__kmpc_threadprivate_cached(ident_t *loc, kmp_int32 global_tid,
+                                             void *data, size_t size,
+                                             void ***cache);
 
 // Symbols for MS mutual detection.
 extern int _You_must_link_with_exactly_one_OpenMP_library;
 extern int _You_must_link_with_Intel_OpenMP_library;
-#if KMP_OS_WINDOWS && ( KMP_VERSION_MAJOR > 4 )
-    extern int _You_must_link_with_Microsoft_OpenMP_library;
+#if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4)
+extern int _You_must_link_with_Microsoft_OpenMP_library;
 #endif
 
 
 // The routines below are not exported.
 // Consider making them 'static' in corresponding source files.
-void
-kmp_threadprivate_insert_private_data( int gtid, void *pc_addr, void *data_addr, size_t pc_size );
-struct private_common *
-kmp_threadprivate_insert( int gtid, void *pc_addr, void *data_addr, size_t pc_size );
+void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr,
+                                           void *data_addr, size_t pc_size);
+struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
+                                                void *data_addr,
+                                                size_t pc_size);
 
-//
 // ompc_, kmpc_ entries moved from omp.h.
-//
 #if KMP_OS_WINDOWS
-#   define KMPC_CONVENTION __cdecl
+#define KMPC_CONVENTION __cdecl
 #else
-#   define KMPC_CONVENTION
+#define KMPC_CONVENTION
 #endif
 
 #ifndef __OMP_H
 typedef enum omp_sched_t {
-    omp_sched_static  = 1,
-    omp_sched_dynamic = 2,
-    omp_sched_guided  = 3,
-    omp_sched_auto    = 4
+  omp_sched_static = 1,
+  omp_sched_dynamic = 2,
+  omp_sched_guided = 3,
+  omp_sched_auto = 4
 } omp_sched_t;
-typedef void * kmp_affinity_mask_t;
+typedef void *kmp_affinity_mask_t;
 #endif
 
 KMP_EXPORT void KMPC_CONVENTION ompc_set_max_active_levels(int);
 KMP_EXPORT void KMPC_CONVENTION ompc_set_schedule(omp_sched_t, int);
-KMP_EXPORT int  KMPC_CONVENTION ompc_get_ancestor_thread_num(int);
-KMP_EXPORT int  KMPC_CONVENTION ompc_get_team_size(int);
-KMP_EXPORT int  KMPC_CONVENTION kmpc_set_affinity_mask_proc(int, kmp_affinity_mask_t *);
-KMP_EXPORT int  KMPC_CONVENTION kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *);
-KMP_EXPORT int  KMPC_CONVENTION kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int KMPC_CONVENTION ompc_get_ancestor_thread_num(int);
+KMP_EXPORT int KMPC_CONVENTION ompc_get_team_size(int);
+KMP_EXPORT int KMPC_CONVENTION
+kmpc_set_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int KMPC_CONVENTION
+kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int KMPC_CONVENTION
+kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *);
 
 KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int);
 KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t);
@@ -3592,4 +3830,3 @@ KMP_EXPORT void KMPC_CONVENTION kmpc_set
 #endif
 
 #endif /* KMP_H */
-
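
The kmp.h hunks above all show the same mechanical transformation: declarations lose the hand-aligned column padding and the extra spaces inside parentheses, the pointer '*' binds to the name rather than the type, and long prototypes are re-wrapped at 80 columns with continuation arguments aligned under the opening parenthesis. For quick reference, a minimal before/after sketch on a hypothetical declaration (the function name and parameter list are made up for illustration; the types are the kmp.h ones used throughout the diff):

// Before: column-aligned name, padded parentheses, '*' attached to the type
extern void   __kmp_example_entry   ( ident_t *, kmp_int32 global_tid, void * data, size_t size );

// After: compact spacing, '*' attached to the name, wrapped at 80 columns
// with continuation arguments aligned under the opening parenthesis
extern void __kmp_example_entry(ident_t *, kmp_int32 global_tid, void *data,
                                size_t size);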

Modified: openmp/trunk/runtime/src/kmp_affinity.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_affinity.cpp?rev=302929&r1=302928&r2=302929&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_affinity.cpp (original)
+++ openmp/trunk/runtime/src/kmp_affinity.cpp Fri May 12 13:01:32 2017
@@ -14,156 +14,149 @@
 
 
 #include "kmp.h"
+#include "kmp_affinity.h"
 #include "kmp_i18n.h"
 #include "kmp_io.h"
 #include "kmp_str.h"
 #include "kmp_wrapper_getpid.h"
-#include "kmp_affinity.h"
 
 // Store the real or imagined machine hierarchy here
 static hierarchy_info machine_hierarchy;
 
-void __kmp_cleanup_hierarchy() {
-    machine_hierarchy.fini();
-}
+void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
+
 
 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
-    kmp_uint32 depth;
-    // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
-    if (TCR_1(machine_hierarchy.uninitialized))
-        machine_hierarchy.init(NULL, nproc);
-
-    // Adjust the hierarchy in case num threads exceeds original
-    if (nproc > machine_hierarchy.base_num_threads)
-        machine_hierarchy.resize(nproc);
-
-    depth = machine_hierarchy.depth;
-    KMP_DEBUG_ASSERT(depth > 0);
-
-    thr_bar->depth = depth;
-    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
-    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
+  kmp_uint32 depth;
+  // The test below is true if affinity is available, but set to "none". Need to
+  // init on first use of hierarchical barrier.
+  if (TCR_1(machine_hierarchy.uninitialized))
+    machine_hierarchy.init(NULL, nproc);
+
+  // Adjust the hierarchy in case num threads exceeds original
+  if (nproc > machine_hierarchy.base_num_threads)
+    machine_hierarchy.resize(nproc);
+
+  depth = machine_hierarchy.depth;
+  KMP_DEBUG_ASSERT(depth > 0);
+
+  thr_bar->depth = depth;
+  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
+  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
 }
 
 #if KMP_AFFINITY_SUPPORTED
 
 bool KMPAffinity::picked_api = false;
 
-void* KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
-void* KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
-void KMPAffinity::Mask::operator delete(void* p) { __kmp_free(p); }
-void KMPAffinity::Mask::operator delete[](void* p) { __kmp_free(p); }
-void* KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
-void KMPAffinity::operator delete(void* p) { __kmp_free(p); }
+void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
+void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
+void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
+void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
+void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
+void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
 
 void KMPAffinity::pick_api() {
-    KMPAffinity* affinity_dispatch;
-    if (picked_api)
-        return;
+  KMPAffinity *affinity_dispatch;
+  if (picked_api)
+    return;
 #if KMP_USE_HWLOC
-    if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
-        affinity_dispatch = new KMPHwlocAffinity();
-    } else
+  if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+    affinity_dispatch = new KMPHwlocAffinity();
+  } else
 #endif
-    {
-        affinity_dispatch = new KMPNativeAffinity();
-    }
-    __kmp_affinity_dispatch = affinity_dispatch;
-    picked_api = true;
+  {
+    affinity_dispatch = new KMPNativeAffinity();
+  }
+  __kmp_affinity_dispatch = affinity_dispatch;
+  picked_api = true;
 }
 
 void KMPAffinity::destroy_api() {
-    if (__kmp_affinity_dispatch != NULL) {
-        delete __kmp_affinity_dispatch;
-        __kmp_affinity_dispatch = NULL;
-        picked_api = false;
-    }
+  if (__kmp_affinity_dispatch != NULL) {
+    delete __kmp_affinity_dispatch;
+    __kmp_affinity_dispatch = NULL;
+    picked_api = false;
+  }
 }
 
-//
 // Print the affinity mask to the character array in a pretty format.
-//
-char *
-__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
-{
-    KMP_ASSERT(buf_len >= 40);
-    char *scan = buf;
-    char *end = buf + buf_len - 1;
-
-    //
-    // Find first element / check for empty set.
-    //
-    size_t i;
-    i = mask->begin();
-    if (i == mask->end()) {
-        KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
-        while (*scan != '\0') scan++;
-        KMP_ASSERT(scan <= end);
-        return buf;
-    }
-
-    KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
-    while (*scan != '\0') scan++;
-    i++;
-    for (; i != mask->end(); i = mask->next(i)) {
-        if (! KMP_CPU_ISSET(i, mask)) {
-            continue;
-        }
-
-        //
-        // Check for buffer overflow.  A string of the form ",<n>" will have
-        // at most 10 characters, plus we want to leave room to print ",...}"
-        // if the set is too large to print for a total of 15 characters.
-        // We already left room for '\0' in setting end.
-        //
-        if (end - scan < 15) {
-           break;
-        }
-        KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
-        while (*scan != '\0') scan++;
-    }
-    if (i != mask->end()) {
-        KMP_SNPRINTF(scan, end-scan+1,  ",...");
-        while (*scan != '\0') scan++;
-    }
-    KMP_SNPRINTF(scan, end-scan+1, "}");
-    while (*scan != '\0') scan++;
+char *__kmp_affinity_print_mask(char *buf, int buf_len,
+                                kmp_affin_mask_t *mask) {
+  KMP_ASSERT(buf_len >= 40);
+  char *scan = buf;
+  char *end = buf + buf_len - 1;
+
+  // Find first element / check for empty set.
+  size_t i;
+  i = mask->begin();
+  if (i == mask->end()) {
+    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
+    while (*scan != '\0')
+      scan++;
     KMP_ASSERT(scan <= end);
     return buf;
-}
+  }
 
+  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
+  while (*scan != '\0')
+    scan++;
+  i++;
+  for (; i != mask->end(); i = mask->next(i)) {
+    if (!KMP_CPU_ISSET(i, mask)) {
+      continue;
+    }
+
+    // Check for buffer overflow.  A string of the form ",<n>" will have at most
+    // 10 characters, plus we want to leave room to print ",...}" if the set is
+    // too large to print for a total of 15 characters. We already left room for
+    // '\0' in setting end.
+    if (end - scan < 15) {
+      break;
+    }
+    KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
+    while (*scan != '\0')
+      scan++;
+  }
+  if (i != mask->end()) {
+    KMP_SNPRINTF(scan, end - scan + 1, ",...");
+    while (*scan != '\0')
+      scan++;
+  }
+  KMP_SNPRINTF(scan, end - scan + 1, "}");
+  while (*scan != '\0')
+    scan++;
+  KMP_ASSERT(scan <= end);
+  return buf;
+}
 
-void
-__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
-{
-    KMP_CPU_ZERO(mask);
+void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
+  KMP_CPU_ZERO(mask);
 
-# if KMP_GROUP_AFFINITY
+#if KMP_GROUP_AFFINITY
 
-    if (__kmp_num_proc_groups > 1) {
-        int group;
-        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
-        for (group = 0; group < __kmp_num_proc_groups; group++) {
-            int i;
-            int num = __kmp_GetActiveProcessorCount(group);
-            for (i = 0; i < num; i++) {
-                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
-            }
-        }
+  if (__kmp_num_proc_groups > 1) {
+    int group;
+    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
+    for (group = 0; group < __kmp_num_proc_groups; group++) {
+      int i;
+      int num = __kmp_GetActiveProcessorCount(group);
+      for (i = 0; i < num; i++) {
+        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
+      }
     }
-    else
+  } else
 
-# endif /* KMP_GROUP_AFFINITY */
+#endif /* KMP_GROUP_AFFINITY */
 
-    {
-        int proc;
-        for (proc = 0; proc < __kmp_xproc; proc++) {
-            KMP_CPU_SET(proc, mask);
-        }
+  {
+    int proc;
+    for (proc = 0; proc < __kmp_xproc; proc++) {
+      KMP_CPU_SET(proc, mask);
     }
+  }
 }
 
-//
 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
 // called to renumber the labels from [0..n] and place them into the child_num
 // vector of the address object.  This is done in case the labels used for
@@ -175,59 +168,53 @@ __kmp_affinity_entire_machine_mask(kmp_a
 // because we are paying attention to the labels themselves, not the ordinal
 // child numbers.  By using the child numbers in the sort, the result is
 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
-//
-static void
-__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
-  int numAddrs)
-{
-    KMP_DEBUG_ASSERT(numAddrs > 0);
-    int depth = address2os->first.depth;
-    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
-      * sizeof(unsigned));
-    int labCt;
+static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
+                                             int numAddrs) {
+  KMP_DEBUG_ASSERT(numAddrs > 0);
+  int depth = address2os->first.depth;
+  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  int labCt;
+  for (labCt = 0; labCt < depth; labCt++) {
+    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
+    lastLabel[labCt] = address2os[0].first.labels[labCt];
+  }
+  int i;
+  for (i = 1; i < numAddrs; i++) {
     for (labCt = 0; labCt < depth; labCt++) {
-        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
-        lastLabel[labCt] = address2os[0].first.labels[labCt];
-    }
-    int i;
-    for (i = 1; i < numAddrs; i++) {
-        for (labCt = 0; labCt < depth; labCt++) {
-            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
-                int labCt2;
-                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
-                    counts[labCt2] = 0;
-                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
-                }
-                counts[labCt]++;
-                lastLabel[labCt] = address2os[i].first.labels[labCt];
-                break;
-            }
-        }
-        for (labCt = 0; labCt < depth; labCt++) {
-            address2os[i].first.childNums[labCt] = counts[labCt];
-        }
-        for (; labCt < (int)Address::maxDepth; labCt++) {
-            address2os[i].first.childNums[labCt] = 0;
+      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
+        int labCt2;
+        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
+          counts[labCt2] = 0;
+          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
         }
+        counts[labCt]++;
+        lastLabel[labCt] = address2os[i].first.labels[labCt];
+        break;
+      }
     }
-    __kmp_free(lastLabel);
-    __kmp_free(counts);
+    for (labCt = 0; labCt < depth; labCt++) {
+      address2os[i].first.childNums[labCt] = counts[labCt];
+    }
+    for (; labCt < (int)Address::maxDepth; labCt++) {
+      address2os[i].first.childNums[labCt] = 0;
+    }
+  }
+  __kmp_free(lastLabel);
+  __kmp_free(counts);
 }
 
-
-//
 // All of the __kmp_affinity_create_*_map() routines should set
 // __kmp_affinity_masks to a vector of affinity mask objects of length
-// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
-// return the number of levels in the machine topology tree (zero if
+// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
+// the number of levels in the machine topology tree (zero if
 // __kmp_affinity_type == affinity_none).
 //
-// All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask
-// to the affinity mask for the initialization thread.  They need to save and
-// restore the mask, and it could be needed later, so saving it is just an
-// optimization to avoid calling kmp_get_system_affinity() again.
-//
+// All of the __kmp_affinity_create_*_map() routines should set
+// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
+// They need to save and restore the mask, and it could be needed later, so
+// saving it is just an optimization to avoid calling kmp_get_system_affinity()
+// again.
 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
 
 static int nCoresPerPkg, nPackages;
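
The reflowed block comment above states the contract the __kmp_affinity_create_*_map() routines keep after reformatting, and the hwloc variant later in this file shows it concretely: capture the calling thread's mask up front so it does not have to be re-queried with kmp_get_system_affinity() later, and release it on every exit path. A minimal sketch of that skeleton, using only calls that appear in this diff (the routine name is hypothetical and the topology-building body is elided):

// Sketch only: mirrors the KMP_CPU_ALLOC / __kmp_get_system_affinity /
// KMP_CPU_FREE sequence visible in __kmp_affinity_create_hwloc_map() below.
static int __kmp_affinity_create_example_map(AddrUnsPair **address2os,
                                             kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread; keeping it is an
  // optimization that avoids another kmp_get_system_affinity() call later.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // ... discover the topology, fill *address2os with AddrUnsPair entries,
  // and set __kmp_ncores / nCoresPerPkg / nPackages / __kmp_nThreadsPerCore ...

  KMP_CPU_FREE(oldMask);
  return 0; // a real routine returns the depth of the topology tree
            // (zero when __kmp_affinity_type == affinity_none)
}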
@@ -237,58 +224,45 @@ static int __kmp_ncores;
 #endif
 static int *__kmp_pu_os_idx = NULL;
 
-//
 // __kmp_affinity_uniform_topology() doesn't work when called from
 // places which support arbitrarily many levels in the machine topology
 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
 // __kmp_affinity_create_x2apicid_map().
-//
-inline static bool
-__kmp_affinity_uniform_topology()
-{
-    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
+inline static bool __kmp_affinity_uniform_topology() {
+  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
 }
 
-
-//
 // Print out the detailed machine topology map, i.e. the physical locations
 // of each OS proc.
-//
-static void
-__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
-  int pkgLevel, int coreLevel, int threadLevel)
-{
-    int proc;
+static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
+                                          int depth, int pkgLevel,
+                                          int coreLevel, int threadLevel) {
+  int proc;
 
-    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
-    for (proc = 0; proc < len; proc++) {
-        int level;
-        kmp_str_buf_t buf;
-        __kmp_str_buf_init(&buf);
-        for (level = 0; level < depth; level++) {
-            if (level == threadLevel) {
-                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
-            }
-            else if (level == coreLevel) {
-                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
-            }
-            else if (level == pkgLevel) {
-                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
-            }
-            else if (level > pkgLevel) {
-                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
-                  level - pkgLevel - 1);
-            }
-            else {
-                __kmp_str_buf_print(&buf, "L%d ", level);
-            }
-            __kmp_str_buf_print(&buf, "%d ",
-              address2os[proc].first.labels[level]);
-        }
-        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
-          buf.str);
-        __kmp_str_buf_free(&buf);
+  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+  for (proc = 0; proc < len; proc++) {
+    int level;
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
+    for (level = 0; level < depth; level++) {
+      if (level == threadLevel) {
+        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
+      } else if (level == coreLevel) {
+        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
+      } else if (level == pkgLevel) {
+        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
+      } else if (level > pkgLevel) {
+        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
+                            level - pkgLevel - 1);
+      } else {
+        __kmp_str_buf_print(&buf, "L%d ", level);
+      }
+      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
     }
+    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
+               buf.str);
+    __kmp_str_buf_free(&buf);
+  }
 }
 
 #if KMP_USE_HWLOC
@@ -298,2734 +272,2423 @@ __kmp_affinity_print_topology(AddrUnsPai
 // have one thread context per core, we don't want the extra thread context
 // level if it offers no unique labels.  So they are removed.
 // return value: the new depth of address2os
-static int
-__kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
-    int level;
-    int i;
-    int radix1_detected;
-
-    for (level = depth-1; level >= 0; --level) {
-        // Always keep the package level
-        if (level == *pkgLevel)
-            continue;
-        // Detect if this level is radix 1
-        radix1_detected = 1;
-        for (i = 1; i < nActiveThreads; ++i) {
-            if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
-                // There are differing label values for this level so it stays
-                radix1_detected = 0;
-                break;
-            }
-        }
-        if (!radix1_detected)
-            continue;
-        // Radix 1 was detected
-        if (level == *threadLevel) {
-            // If only one thread per core, then just decrement
-            // the depth which removes the threadlevel from address2os
-            for (i = 0; i < nActiveThreads; ++i) {
-                address2os[i].first.depth--;
-            }
-            *threadLevel = -1;
-        } else if (level == *coreLevel) {
-            // For core level, we move the thread labels over if they are still
-            // valid (*threadLevel != -1), and also reduce the depth another level
-            for (i = 0; i < nActiveThreads; ++i) {
-                if (*threadLevel != -1) {
-                    address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
-                }
-                address2os[i].first.depth--;
-            }
-            *coreLevel = -1;
+static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
+                                                  int nActiveThreads, int depth,
+                                                  int *pkgLevel, int *coreLevel,
+                                                  int *threadLevel) {
+  int level;
+  int i;
+  int radix1_detected;
+
+  for (level = depth - 1; level >= 0; --level) {
+    // Always keep the package level
+    if (level == *pkgLevel)
+      continue;
+    // Detect if this level is radix 1
+    radix1_detected = 1;
+    for (i = 1; i < nActiveThreads; ++i) {
+      if (address2os[0].first.labels[level] !=
+          address2os[i].first.labels[level]) {
+        // There are differing label values for this level so it stays
+        radix1_detected = 0;
+        break;
+      }
+    }
+    if (!radix1_detected)
+      continue;
+    // Radix 1 was detected
+    if (level == *threadLevel) {
+      // If only one thread per core, then just decrement
+      // the depth which removes the threadlevel from address2os
+      for (i = 0; i < nActiveThreads; ++i) {
+        address2os[i].first.depth--;
+      }
+      *threadLevel = -1;
+    } else if (level == *coreLevel) {
+      // For core level, we move the thread labels over if they are still
+      // valid (*threadLevel != -1), and also reduce the depth another level
+      for (i = 0; i < nActiveThreads; ++i) {
+        if (*threadLevel != -1) {
+          address2os[i].first.labels[*coreLevel] =
+              address2os[i].first.labels[*threadLevel];
         }
+        address2os[i].first.depth--;
+      }
+      *coreLevel = -1;
     }
-    return address2os[0].first.depth;
+  }
+  return address2os[0].first.depth;
 }
 
-// Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
-// e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is HWLOC_OBJ_PU, then
-//  this will return the number of PU's under the SOCKET object.
-static int
-__kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
-    int retval = 0;
-    hwloc_obj_t first;
-    for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
-        first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
-        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
-    {
-        ++retval;
+// Returns the number of objects of type 'type' below 'obj' within the topology
+// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
+// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
+// object.
+static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
+                                           hwloc_obj_type_t type) {
+  int retval = 0;
+  hwloc_obj_t first;
+  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
+                                           obj->logical_index, type, 0);
+       first != NULL &&
+       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
+           obj;
+       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
+                                          first)) {
+    ++retval;
+  }
+  return retval;
+}
+
+static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
+                                           kmp_i18n_id_t *const msg_id) {
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Save the affinity mask for the current thread.
+  kmp_affin_mask_t *oldMask;
+  KMP_CPU_ALLOC(oldMask);
+  __kmp_get_system_affinity(oldMask, TRUE);
+
+  int depth = 3;
+  int pkgLevel = 0;
+  int coreLevel = 1;
+  int threadLevel = 2;
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // Hack to try and infer the machine topology using only the data
+    // available from cpuid on the current thread, and __kmp_xproc.
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
+        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
+        HWLOC_OBJ_CORE);
+    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
+        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
+        HWLOC_OBJ_PU);
+    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (__kmp_affinity_uniform_topology()) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
-    return retval;
-}
-
-static int
-__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
-  kmp_i18n_id_t *const msg_id)
-{
-    *address2os = NULL;
-    *msg_id = kmp_i18n_null;
-
-    //
-    // Save the affinity mask for the current thread.
-    //
-    kmp_affin_mask_t *oldMask;
-    KMP_CPU_ALLOC(oldMask);
-    __kmp_get_system_affinity(oldMask, TRUE);
-
-    int depth = 3;
-    int pkgLevel = 0;
-    int coreLevel = 1;
-    int threadLevel = 2;
+    KMP_CPU_FREE(oldMask);
+    return 0;
+  }
 
-    if (! KMP_AFFINITY_CAPABLE())
-    {
-        //
-        // Hack to try and infer the machine topology using only the data
-        // available from cpuid on the current thread, and __kmp_xproc.
-        //
-        KMP_ASSERT(__kmp_affinity_type == affinity_none);
-
-        nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
-        __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
-        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
-        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            if (__kmp_affinity_uniform_topology()) {
-                KMP_INFORM(Uniform, "KMP_AFFINITY");
-            } else {
-                KMP_INFORM(NonUniform, "KMP_AFFINITY");
-            }
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-        KMP_CPU_FREE(oldMask);
-        return 0;
+  // Allocate the data structure to be returned.
+  AddrUnsPair *retval =
+      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+  // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+  // correctly, and return if affinity is not enabled.
+
+  hwloc_obj_t pu;
+  hwloc_obj_t core;
+  hwloc_obj_t socket;
+  int nActiveThreads = 0;
+  int socket_identifier = 0;
+  // re-calculate globals to count only accessible resources
+  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
+  for (socket =
+           hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
+       socket != NULL;
+       socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology,
+                                           HWLOC_OBJ_PACKAGE, socket),
+         socket_identifier++) {
+    int core_identifier = 0;
+    int num_active_cores = 0;
+    for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
+                                            socket->logical_index,
+                                            HWLOC_OBJ_CORE, 0);
+         core != NULL &&
+         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
+                                        core) == socket;
+         core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE,
+                                           core),
+         core_identifier++) {
+      int pu_identifier = 0;
+      int num_active_threads = 0;
+      for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
+                                            core->logical_index, HWLOC_OBJ_PU,
+                                            0);
+           pu != NULL &&
+           hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
+                                          pu) == core;
+           pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
+                                           pu),
+           pu_identifier++) {
+        Address addr(3);
+        if(!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
+          continue; // skip inactive (inaccessible) unit
+        KA_TRACE(20,
+                 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
+                  socket->os_index, socket->logical_index, core->os_index,
+                  core->logical_index, pu->os_index,pu->logical_index));
+        addr.labels[0] = socket_identifier; // package
+        addr.labels[1] = core_identifier; // core
+        addr.labels[2] = pu_identifier; // pu
+        retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
+        __kmp_pu_os_idx[nActiveThreads] =
+            pu->os_index; // keep os index for each active pu
+        nActiveThreads++;
+        ++num_active_threads; // count active threads per core
+      }
+      if (num_active_threads) { // were there any active threads on the core?
+        ++__kmp_ncores; // count total active cores
+        ++num_active_cores; // count active cores per socket
+        if (num_active_threads > __kmp_nThreadsPerCore)
+          __kmp_nThreadsPerCore = num_active_threads; // calc maximum
+      }
     }
+    if (num_active_cores) { // were there any active cores on the socket?
+      ++nPackages; // count total active packages
+      if (num_active_cores > nCoresPerPkg)
+        nCoresPerPkg = num_active_cores; // calc maximum
+    }
+  }
+
+  // If there's only one thread context to bind to, return now.
+  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
+  KMP_ASSERT(nActiveThreads > 0);
+  if (nActiveThreads == 1) {
+    __kmp_ncores = nPackages = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
 
-    //
-    // Allocate the data structure to be returned.
-    //
-    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
-    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-
-    //
-    // When affinity is off, this routine will still be called to set
-    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
-    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
-    // correctly, and return if affinity is not enabled.
-    //
-
-    hwloc_obj_t pu;
-    hwloc_obj_t core;
-    hwloc_obj_t socket;
-    int nActiveThreads = 0;
-    int socket_identifier = 0;
-    // re-calculate globals to count only accessible resources
-    __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
-    for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
-        socket != NULL;
-        socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, socket),
-        socket_identifier++)
-    {
-        int core_identifier = 0;
-        int num_active_cores = 0;
-        for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
-            core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
-            core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
-            core_identifier++)
-        {
-            int pu_identifier = 0;
-            int num_active_threads = 0;
-            for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
-                pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
-                pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
-                pu_identifier++)
-            {
-                Address addr(3);
-                if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
-                    continue;         // skip inactive (inaccessible) unit
-                KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
-                    socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
-                addr.labels[0] = socket_identifier; // package
-                addr.labels[1] = core_identifier; // core
-                addr.labels[2] = pu_identifier; // pu
-                retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
-                __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
-                nActiveThreads++;
-                ++num_active_threads; // count active threads per core
-            }
-            if (num_active_threads) { // were there any active threads on the core?
-                ++__kmp_ncores;       // count total active cores
-                ++num_active_cores;   // count active cores per socket
-                if (num_active_threads > __kmp_nThreadsPerCore)
-                    __kmp_nThreadsPerCore = num_active_threads; // calc maximum
-            }
-        }
-        if (num_active_cores) {       // were there any active cores on the socket?
-            ++nPackages;              // count total active packages
-            if (num_active_cores > nCoresPerPkg)
-                nCoresPerPkg = num_active_cores; // calc maximum
-        }
+      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
 
-    //
-    // If there's only one thread context to bind to, return now.
-    //
-    KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
-    KMP_ASSERT(nActiveThreads > 0);
-    if (nActiveThreads == 1) {
-        __kmp_ncores = nPackages = 1;
-        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
-        if (__kmp_affinity_verbose) {
-            char buf[KMP_AFFIN_MASK_PRINT_LEN];
-            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
-            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
-            if (__kmp_affinity_respect_mask) {
-                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-            } else {
-                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-            }
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-
-        if (__kmp_affinity_type == affinity_none) {
-            __kmp_free(retval);
-            KMP_CPU_FREE(oldMask);
-            return 0;
-        }
-
-        //
-        // Form an Address object which only includes the package level.
-        //
-        Address addr(1);
-        addr.labels[0] = retval[0].first.labels[pkgLevel];
-        retval[0].first = addr;
-
-        if (__kmp_affinity_gran_levels < 0) {
-            __kmp_affinity_gran_levels = 0;
-        }
-
-        if (__kmp_affinity_verbose) {
-            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
-        }
-
-        *address2os = retval;
-        KMP_CPU_FREE(oldMask);
-        return 1;
+    if (__kmp_affinity_type == affinity_none) {
+      __kmp_free(retval);
+      KMP_CPU_FREE(oldMask);
+      return 0;
     }
 
-    //
-    // Sort the table by physical Id.
-    //
-    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+    // Form an Address object which only includes the package level.
+    Address addr(1);
+    addr.labels[0] = retval[0].first.labels[pkgLevel];
+    retval[0].first = addr;
 
-    //
-    // Check to see if the machine topology is uniform
-    //
-    unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
+    }
 
-    //
-    // Print the machine topology summary.
-    //
     if (__kmp_affinity_verbose) {
-        char mask[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
-        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
-        if (__kmp_affinity_respect_mask) {
-            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
-        } else {
-            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
-        }
-        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-        if (uniform) {
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-        } else {
-            KMP_INFORM(NonUniform, "KMP_AFFINITY");
-        }
-
-        kmp_str_buf_t buf;
-        __kmp_str_buf_init(&buf);
+      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+    }
 
-        __kmp_str_buf_print(&buf, "%d", nPackages);
-        //for (level = 1; level <= pkgLevel; level++) {
-        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
-       // }
-        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
-          __kmp_nThreadsPerCore, __kmp_ncores);
+    *address2os = retval;
+    KMP_CPU_FREE(oldMask);
+    return 1;
+  }
 
-        __kmp_str_buf_free(&buf);
+  // Sort the table by physical Id.
+  qsort(retval, nActiveThreads, sizeof(*retval),
+        __kmp_affinity_cmp_Address_labels);
+
+  // Check to see if the machine topology is uniform
+  unsigned uniform =
+      (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
+
+  // Print the machine topology summary.
+  if (__kmp_affinity_verbose) {
+    char mask[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
     }
-
-    if (__kmp_affinity_type == affinity_none) {
-        __kmp_free(retval);
-        KMP_CPU_FREE(oldMask);
-        return 0;
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    if (uniform) {
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+    } else {
+      KMP_INFORM(NonUniform, "KMP_AFFINITY");
     }
 
-    //
-    // Find any levels with radiix 1, and remove them from the map
-    // (except for the package level).
-    //
-    depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
 
-    if (__kmp_affinity_gran_levels < 0) {
-        //
-        // Set the granularity level based on what levels are modeled
-        // in the machine topology map.
-        //
-        __kmp_affinity_gran_levels = 0;
-        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
-            __kmp_affinity_gran_levels++;
-        }
-        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
-            __kmp_affinity_gran_levels++;
-        }
-        if (__kmp_affinity_gran > affinity_gran_package) {
-            __kmp_affinity_gran_levels++;
-        }
-    }
+    __kmp_str_buf_print(&buf, "%d", nPackages);
+    // for (level = 1; level <= pkgLevel; level++) {
+    //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+    // }
+    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
 
-    if (__kmp_affinity_verbose) {
-        __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
-          coreLevel, threadLevel);
-    }
+    __kmp_str_buf_free(&buf);
+  }
 
+  if (__kmp_affinity_type == affinity_none) {
+    __kmp_free(retval);
     KMP_CPU_FREE(oldMask);
-    *address2os = retval;
-    return depth;
+    return 0;
+  }
+
+  // Find any levels with radix 1, and remove them from the map
+  // (except for the package level).
+  depth = __kmp_affinity_remove_radix_one_levels(
+      retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled
+    // in the machine topology map.
+    __kmp_affinity_gran_levels = 0;
+    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if (__kmp_affinity_gran > affinity_gran_package) {
+      __kmp_affinity_gran_levels++;
+    }
+  }
+
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
+                                  coreLevel, threadLevel);
+  }
+
+  KMP_CPU_FREE(oldMask);
+  *address2os = retval;
+  return depth;
 }
 #endif // KMP_USE_HWLOC
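
[Editorial note] The hwloc branch above walks PACKAGE -> CORE -> PU objects and records one (labels, os_index) pair per accessible PU. A minimal standalone sketch of the same traversal, written against the public hwloc C API rather than the runtime's __kmp_hwloc_topology wrappers (and assuming hwloc >= 1.11 for HWLOC_OBJ_PACKAGE), might look roughly like this:

    #include <hwloc.h>
    #include <stdio.h>

    // Sketch only: enumerate each PU and report the package/core it belongs
    // to, the way the hwloc path above does. Error handling is omitted.
    int main() {
      hwloc_topology_t topo;
      hwloc_topology_init(&topo);
      hwloc_topology_load(topo);

      int npus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
      for (int i = 0; i < npus; i++) {
        hwloc_obj_t pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, i);
        hwloc_obj_t core =
            hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_CORE, pu);
        hwloc_obj_t pkg =
            hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_PACKAGE, pu);
        printf("pkg %u core %u pu %u (os %u)\n",
               pkg ? pkg->logical_index : 0, core ? core->logical_index : 0,
               pu->logical_index, pu->os_index);
      }
      hwloc_topology_destroy(topo);
      return 0;
    }
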
 
-//
 // If we don't know how to retrieve the machine's processor topology, or
 // encounter an error in doing so, this routine is called to form a "flat"
 // mapping of os thread id's <-> processor id's.
-//
-static int
-__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
-  kmp_i18n_id_t *const msg_id)
-{
-    *address2os = NULL;
-    *msg_id = kmp_i18n_null;
-
-    //
-    // Even if __kmp_affinity_type == affinity_none, this routine might still
-    // called to set __kmp_ncores, as well as
-    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
-    //
-    if (! KMP_AFFINITY_CAPABLE()) {
-        KMP_ASSERT(__kmp_affinity_type == affinity_none);
-        __kmp_ncores = nPackages = __kmp_xproc;
-        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-        return 0;
-    }
-
-    //
-    // When affinity is off, this routine will still be called to set
-    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
-    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
-    //  correctly, and return now if affinity is not enabled.
-    //
-    __kmp_ncores = nPackages = __kmp_avail_proc;
+static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
+                                          kmp_i18n_id_t *const msg_id) {
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Even if __kmp_affinity_type == affinity_none, this routine might still be
+  // called to set __kmp_ncores, as well as
+  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+    __kmp_ncores = nPackages = __kmp_xproc;
     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
     if (__kmp_affinity_verbose) {
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
-
-        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
-        if (__kmp_affinity_respect_mask) {
-            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-        } else {
-            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-        }
-        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-        KMP_INFORM(Uniform, "KMP_AFFINITY");
-        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-          __kmp_nThreadsPerCore, __kmp_ncores);
-    }
-    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-    if (__kmp_affinity_type == affinity_none) {
-        int avail_ct = 0;
-        int i;
-        KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
-            if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
-                continue;
-            __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
-        }
-        return 0;
+      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
+    return 0;
+  }
 
-    //
-    // Contruct the data structure to be returned.
-    //
-    *address2os = (AddrUnsPair*)
-      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return now if affinity is
+  // not enabled.
+  __kmp_ncores = nPackages = __kmp_avail_proc;
+  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+  if (__kmp_affinity_verbose) {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              __kmp_affin_fullMask);
+
+    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+    }
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    KMP_INFORM(Uniform, "KMP_AFFINITY");
+    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+  }
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  if (__kmp_affinity_type == affinity_none) {
     int avail_ct = 0;
-    unsigned int i;
+    int i;
     KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
-        //
-        // Skip this proc if it is not included in the machine model.
-        //
-        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
-            continue;
-        }
-        __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
-        Address addr(1);
-        addr.labels[0] = i;
-        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
-    }
-    if (__kmp_affinity_verbose) {
-        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
+      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
+        continue;
+      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
     }
+    return 0;
+  }
 
-    if (__kmp_affinity_gran_levels < 0) {
-        //
-        // Only the package level is modeled in the machine topology map,
-        // so the #levels of granularity is either 0 or 1.
-        //
-        if (__kmp_affinity_gran > affinity_gran_package) {
-            __kmp_affinity_gran_levels = 1;
-        }
-        else {
-            __kmp_affinity_gran_levels = 0;
-        }
+  // Construct the data structure to be returned.
+  *address2os =
+      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+  int avail_ct = 0;
+  unsigned int i;
+  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
+    // Skip this proc if it is not included in the machine model.
+    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
+      continue;
+    }
+    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
+    Address addr(1);
+    addr.labels[0] = i;
+    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
+  }
+  if (__kmp_affinity_verbose) {
+    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
+  }
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Only the package level is modeled in the machine topology map,
+    // so the #levels of granularity is either 0 or 1.
+    if (__kmp_affinity_gran > affinity_gran_package) {
+      __kmp_affinity_gran_levels = 1;
+    } else {
+      __kmp_affinity_gran_levels = 0;
     }
-    return 1;
+  }
+  return 1;
 }
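
[Editorial note] Both the hwloc and flat paths above end up with a table of (label vector, OS proc id) pairs that is later sorted lexicographically by labels (the qsort on __kmp_affinity_cmp_Address_labels). A toy sketch of that idea, using hypothetical ToyAddr names rather than the runtime's actual Address/AddrUnsPair classes:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for Address/AddrUnsPair: a label vector
    // (e.g. {package, core, thread}, or just {package} for the flat map)
    // paired with the OS proc id.
    struct ToyAddr {
      std::vector<unsigned> labels;
      unsigned os_id;
    };

    int main() {
      std::vector<ToyAddr> table = {
          {{1, 0, 1}, 5}, {{0, 1, 0}, 2}, {{0, 0, 0}, 0}, {{1, 0, 0}, 4}};
      // Lexicographic sort on the labels, mirroring the qsort comparator.
      std::sort(table.begin(), table.end(),
                [](const ToyAddr &a, const ToyAddr &b) {
                  return a.labels < b.labels;
                });
      for (const ToyAddr &t : table)
        printf("pkg %u core %u thr %u -> os %u\n", t.labels[0], t.labels[1],
               t.labels[2], t.os_id);
      return 0;
    }
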
 
+#if KMP_GROUP_AFFINITY
 
-# if KMP_GROUP_AFFINITY
-
-//
 // If multiple Windows* OS processor groups exist, we can create a 2-level
-// topology map with the groups at level 0 and the individual procs at
-// level 1.
-//
+// topology map with the groups at level 0 and the individual procs at level 1.
 // This facilitates letting the threads float among all procs in a group,
 // if granularity=group (the default when there are multiple groups).
-//
-static int
-__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
-  kmp_i18n_id_t *const msg_id)
-{
-    *address2os = NULL;
-    *msg_id = kmp_i18n_null;
-
-    //
-    // If we don't have multiple processor groups, return now.
-    // The flat mapping will be used.
-    //
-    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
-        // FIXME set *msg_id
-        return -1;
-    }
-
-    //
-    // Contruct the data structure to be returned.
-    //
-    *address2os = (AddrUnsPair*)
-      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
-    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-    int avail_ct = 0;
-    int i;
-    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
-        //
-        // Skip this proc if it is not included in the machine model.
-        //
-        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
-            continue;
-        }
-        __kmp_pu_os_idx[avail_ct] = i;  // suppose indices are flat
-        Address addr(2);
-        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
-        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
-        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
+static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
+                                                kmp_i18n_id_t *const msg_id) {
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // If we don't have multiple processor groups, return now.
+  // The flat mapping will be used.
+  if ((!KMP_AFFINITY_CAPABLE()) ||
+      (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
+    // FIXME set *msg_id
+    return -1;
+  }
+
+  // Construct the data structure to be returned.
+  *address2os =
+      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  int avail_ct = 0;
+  int i;
+  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
+    // Skip this proc if it is not included in the machine model.
+    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
+      continue;
+    }
+    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
+    Address addr(2);
+    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
+    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
+    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
 
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
-              addr.labels[1]);
-        }
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
+                 addr.labels[1]);
     }
+  }
 
-    if (__kmp_affinity_gran_levels < 0) {
-        if (__kmp_affinity_gran == affinity_gran_group) {
-            __kmp_affinity_gran_levels = 1;
-        }
-        else if ((__kmp_affinity_gran == affinity_gran_fine)
-          || (__kmp_affinity_gran == affinity_gran_thread)) {
-            __kmp_affinity_gran_levels = 0;
-        }
-        else {
-            const char *gran_str = NULL;
-            if (__kmp_affinity_gran == affinity_gran_core) {
-                gran_str = "core";
-            }
-            else if (__kmp_affinity_gran == affinity_gran_package) {
-                gran_str = "package";
-            }
-            else if (__kmp_affinity_gran == affinity_gran_node) {
-                gran_str = "node";
-            }
-            else {
-                KMP_ASSERT(0);
-            }
+  if (__kmp_affinity_gran_levels < 0) {
+    if (__kmp_affinity_gran == affinity_gran_group) {
+      __kmp_affinity_gran_levels = 1;
+    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
+               (__kmp_affinity_gran == affinity_gran_thread)) {
+      __kmp_affinity_gran_levels = 0;
+    } else {
+      const char *gran_str = NULL;
+      if (__kmp_affinity_gran == affinity_gran_core) {
+        gran_str = "core";
+      } else if (__kmp_affinity_gran == affinity_gran_package) {
+        gran_str = "package";
+      } else if (__kmp_affinity_gran == affinity_gran_node) {
+        gran_str = "node";
+      } else {
+        KMP_ASSERT(0);
+      }
 
-            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
-            __kmp_affinity_gran_levels = 0;
-        }
+      // Warning: can't use affinity granularity \"gran\" with group topology
+      // method, using "thread"
+      __kmp_affinity_gran_levels = 0;
     }
-    return 2;
+  }
+  return 2;
 }
 
-# endif /* KMP_GROUP_AFFINITY */
-
+#endif /* KMP_GROUP_AFFINITY */
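
[Editorial note] The proc-group path above encodes each OS proc as a two-level label: the group number and the bit position inside that group's 64-bit affinity mask. A small sketch of just that arithmetic; the 64 is hard-coded so the sketch builds outside Windows, where DWORD_PTR is unavailable:

    #include <cstdio>

    // Sketch of the group/proc split used by the Windows processor-group
    // map: CHAR_BIT * sizeof(DWORD_PTR) == 64 on Win64.
    int main() {
      const unsigned kBitsPerGroup = 64;
      const unsigned samples[] = {0, 1, 63, 64, 130};
      for (unsigned i : samples) {
        unsigned group = i / kBitsPerGroup;  // labels[0]
        unsigned within = i % kBitsPerGroup; // labels[1]
        printf("os proc %3u -> labels {group %u, proc %u}\n", i, group, within);
      }
      return 0;
    }
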
 
-# if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
 
-static int
-__kmp_cpuid_mask_width(int count) {
-    int r = 0;
+static int __kmp_cpuid_mask_width(int count) {
+  int r = 0;
 
-    while((1<<r) < count)
-        ++r;
-    return r;
+  while ((1 << r) < count)
+    ++r;
+  return r;
 }
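
[Editorial note] __kmp_cpuid_mask_width() returns the smallest r with 2^r >= count, i.e. the bit width of the thread#/core# fields inside a legacy APIC id. A self-contained sketch of the decomposition that __kmp_affinity_create_apicid_map performs further down, with made-up sample values:

    #include <cstdio>

    // Same computation as __kmp_cpuid_mask_width above.
    static int mask_width(int count) {
      int r = 0;
      while ((1 << r) < count)
        ++r;
      return r; // smallest r with 2^r >= count
    }

    int main() {
      // Hypothetical values as reported by cpuid on one bound thread.
      unsigned apicId = 0x1b;   // cpuid(1), ebx[31:24]
      int maxThreadsPerPkg = 8; // cpuid(1), ebx[23:16]
      int maxCoresPerPkg = 4;   // cpuid(4), eax[31:26] + 1

      int widthCT = mask_width(maxThreadsPerPkg); // core# + thread# bits
      int widthC = mask_width(maxCoresPerPkg);    // core# bits
      int widthT = widthCT - widthC;              // thread# bits

      unsigned pkgId = apicId >> widthCT;
      unsigned coreId = (apicId >> widthT) & ((1 << widthC) - 1);
      unsigned threadId = apicId & ((1 << widthT) - 1);
      printf("apic 0x%x -> pkg %u core %u thread %u\n", apicId, pkgId, coreId,
             threadId);
      return 0;
    }
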
 
-
 class apicThreadInfo {
 public:
-    unsigned osId;              // param to __kmp_affinity_bind_thread
-    unsigned apicId;            // from cpuid after binding
-    unsigned maxCoresPerPkg;    //      ""
-    unsigned maxThreadsPerPkg;  //      ""
-    unsigned pkgId;             // inferred from above values
-    unsigned coreId;            //      ""
-    unsigned threadId;          //      ""
+  unsigned osId; // param to __kmp_affinity_bind_thread
+  unsigned apicId; // from cpuid after binding
+  unsigned maxCoresPerPkg; //      ""
+  unsigned maxThreadsPerPkg; //      ""
+  unsigned pkgId; // inferred from above values
+  unsigned coreId; //      ""
+  unsigned threadId; //      ""
 };
 
-
-static int
-__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
-{
-    const apicThreadInfo *aa = (const apicThreadInfo *)a;
-    const apicThreadInfo *bb = (const apicThreadInfo *)b;
-    if (aa->osId < bb->osId) return -1;
-    if (aa->osId > bb->osId) return 1;
-    return 0;
+static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
+                                                   const void *b) {
+  const apicThreadInfo *aa = (const apicThreadInfo *)a;
+  const apicThreadInfo *bb = (const apicThreadInfo *)b;
+  if (aa->osId < bb->osId)
+    return -1;
+  if (aa->osId > bb->osId)
+    return 1;
+  return 0;
 }
 
-
-static int
-__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
-{
-    const apicThreadInfo *aa = (const apicThreadInfo *)a;
-    const apicThreadInfo *bb = (const apicThreadInfo *)b;
-    if (aa->pkgId < bb->pkgId) return -1;
-    if (aa->pkgId > bb->pkgId) return 1;
-    if (aa->coreId < bb->coreId) return -1;
-    if (aa->coreId > bb->coreId) return 1;
-    if (aa->threadId < bb->threadId) return -1;
-    if (aa->threadId > bb->threadId) return 1;
-    return 0;
+static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
+                                                     const void *b) {
+  const apicThreadInfo *aa = (const apicThreadInfo *)a;
+  const apicThreadInfo *bb = (const apicThreadInfo *)b;
+  if (aa->pkgId < bb->pkgId)
+    return -1;
+  if (aa->pkgId > bb->pkgId)
+    return 1;
+  if (aa->coreId < bb->coreId)
+    return -1;
+  if (aa->coreId > bb->coreId)
+    return 1;
+  if (aa->threadId < bb->threadId)
+    return -1;
+  if (aa->threadId > bb->threadId)
+    return 1;
+  return 0;
 }
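
[Editorial note] The routine below issues cpuid leaf 1 (APIC id, max threads per package, APIC feature bit) and leaf 4 (max cores per package) through the runtime's __kmp_x86_cpuid wrapper. For reference, a standalone sketch of the same reads using GCC/Clang's <cpuid.h> intrinsics, which is an assumption; MSVC would use __cpuidex instead, and __get_cpuid_count needs a reasonably recent GCC/Clang:

    #include <cpuid.h>
    #include <cstdio>

    int main() {
      unsigned eax, ebx, ecx, edx;

      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;
      unsigned apicId = (ebx >> 24) & 0xff;           // initial APIC id
      unsigned maxThreadsPerPkg = (ebx >> 16) & 0xff; // logical procs per pkg
      int apicPresent = (edx >> 9) & 1;               // on-chip APIC flag

      unsigned maxCoresPerPkg = 1;
      if (__get_cpuid_count(4, 0, &eax, &ebx, &ecx, &edx))
        maxCoresPerPkg = ((eax >> 26) & 0x3f) + 1;

      printf("apic %u, maxThreadsPerPkg %u, maxCoresPerPkg %u, apic %d\n",
             apicId, maxThreadsPerPkg, maxCoresPerPkg, apicPresent);
      return 0;
    }
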
 
-
-//
 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
 // an algorithm which cycles through the available os threads, setting
 // the current thread's affinity mask to that thread, and then retrieves
 // the Apic Id for each thread context using the cpuid instruction.
-//
-static int
-__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
-  kmp_i18n_id_t *const msg_id)
-{
-    kmp_cpuid buf;
-    int rc;
-    *address2os = NULL;
-    *msg_id = kmp_i18n_null;
+static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
+                                            kmp_i18n_id_t *const msg_id) {
+  kmp_cpuid buf;
+  int rc;
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Check if cpuid leaf 4 is supported.
+  __kmp_x86_cpuid(0, 0, &buf);
+  if (buf.eax < 4) {
+    *msg_id = kmp_i18n_str_NoLeaf4Support;
+    return -1;
+  }
+
+  // The algorithm used starts by setting the affinity to each available thread
+  // and retrieving info from the cpuid instruction, so if we are not capable of
+  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
+  // need to do something else - use the defaults that we calculated from
+  // issuing cpuid without binding to each proc.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // Hack to try and infer the machine topology using only the data
+    // available from cpuid on the current thread, and __kmp_xproc.
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+    // Get an upper bound on the number of threads per package using cpuid(1).
+    // On some OS/chip combinations where HT is supported by the chip but is
+    // disabled, this value will be 2 on a single core chip. Usually, it will be
+    // 2 if HT is enabled and 1 if HT is disabled.
+    __kmp_x86_cpuid(1, 0, &buf);
+    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
+    if (maxThreadsPerPkg == 0) {
+      maxThreadsPerPkg = 1;
+    }
+
+    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
+    // value.
+    //
+    // The author of cpu_count.cpp treated this as only an upper bound on the
+    // number of cores, but I haven't seen any cases where it was greater than
+    // the actual number of cores, so we will treat it as exact in this block of
+    // code.
+    //
+    // First, we need to check if cpuid(4) is supported on this chip. To see if
+    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
+    // greater.
+    __kmp_x86_cpuid(0, 0, &buf);
+    if (buf.eax >= 4) {
+      __kmp_x86_cpuid(4, 0, &buf);
+      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
+    } else {
+      nCoresPerPkg = 1;
+    }
 
-    //
-    // Check if cpuid leaf 4 is supported.
-    //
-        __kmp_x86_cpuid(0, 0, &buf);
-        if (buf.eax < 4) {
-            *msg_id = kmp_i18n_str_NoLeaf4Support;
-            return -1;
-        }
-
-    //
-    // The algorithm used starts by setting the affinity to each available
-    // thread and retrieving info from the cpuid instruction, so if we are
-    // not capable of calling __kmp_get_system_affinity() and
-    // _kmp_get_system_affinity(), then we need to do something else - use
-    // the defaults that we calculated from issuing cpuid without binding
-    // to each proc.
-    //
-    if (! KMP_AFFINITY_CAPABLE()) {
-        //
-        // Hack to try and infer the machine topology using only the data
-        // available from cpuid on the current thread, and __kmp_xproc.
-        //
-        KMP_ASSERT(__kmp_affinity_type == affinity_none);
-
-        //
-        // Get an upper bound on the number of threads per package using
-        // cpuid(1).
-        //
-        // On some OS/chps combinations where HT is supported by the chip
-        // but is disabled, this value will be 2 on a single core chip.
-        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
-        //
-        __kmp_x86_cpuid(1, 0, &buf);
-        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
-        if (maxThreadsPerPkg == 0) {
-            maxThreadsPerPkg = 1;
-        }
-
-        //
-        // The num cores per pkg comes from cpuid(4).
-        // 1 must be added to the encoded value.
-        //
-        // The author of cpu_count.cpp treated this only an upper bound
-        // on the number of cores, but I haven't seen any cases where it
-        // was greater than the actual number of cores, so we will treat
-        // it as exact in this block of code.
-        //
-        // First, we need to check if cpuid(4) is supported on this chip.
-        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
-        // has the value n or greater.
-        //
-        __kmp_x86_cpuid(0, 0, &buf);
-        if (buf.eax >= 4) {
-            __kmp_x86_cpuid(4, 0, &buf);
-            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
-        }
-        else {
-            nCoresPerPkg = 1;
-        }
-
-        //
-        // There is no way to reliably tell if HT is enabled without issuing
-        // the cpuid instruction from every thread, can correlating the cpuid
-        // info, so if the machine is not affinity capable, we assume that HT
-        // is off.  We have seen quite a few machines where maxThreadsPerPkg
-        // is 2, yet the machine does not support HT.
-        //
-        // - Older OSes are usually found on machines with older chips, which
-        //   do not support HT.
-        //
-        // - The performance penalty for mistakenly identifying a machine as
-        //   HT when it isn't (which results in blocktime being incorrecly set
-        //   to 0) is greater than the penalty when for mistakenly identifying
-        //   a machine as being 1 thread/core when it is really HT enabled
-        //   (which results in blocktime being incorrectly set to a positive
-        //   value).
-        //
-        __kmp_ncores = __kmp_xproc;
-        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
-        __kmp_nThreadsPerCore = 1;
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            if (__kmp_affinity_uniform_topology()) {
-                KMP_INFORM(Uniform, "KMP_AFFINITY");
-            } else {
-                KMP_INFORM(NonUniform, "KMP_AFFINITY");
-            }
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-        return 0;
+    // There is no way to reliably tell if HT is enabled without issuing the
+    // cpuid instruction from every thread and correlating the cpuid info, so
+    // if the machine is not affinity capable, we assume that HT is off. We have
+    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
+    // does not support HT.
+    //
+    // - Older OSes are usually found on machines with older chips, which do not
+    //   support HT.
+    // - The performance penalty for mistakenly identifying a machine as HT when
+    //   it isn't (which results in blocktime being incorrectly set to 0) is
+    //   greater than the penalty for mistakenly identifying a machine as
+    //   being 1 thread/core when it is really HT enabled (which results in
+    //   blocktime being incorrectly set to a positive value).
+    __kmp_ncores = __kmp_xproc;
+    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    __kmp_nThreadsPerCore = 1;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (__kmp_affinity_uniform_topology()) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
+    return 0;
+  }
 
-    //
-    //
-    // From here on, we can assume that it is safe to call
-    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
-    // even if __kmp_affinity_type = affinity_none.
-    //
-
-    //
-    // Save the affinity mask for the current thread.
-    //
-    kmp_affin_mask_t *oldMask;
-    KMP_CPU_ALLOC(oldMask);
-    KMP_ASSERT(oldMask != NULL);
-    __kmp_get_system_affinity(oldMask, TRUE);
-
-    //
-    // Run through each of the available contexts, binding the current thread
-    // to it, and obtaining the pertinent information using the cpuid instr.
-    //
-    // The relevant information is:
-    //
-    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
-    //    has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
-    //
-    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
-    //    value of this field determines the width of the core# + thread#
-    //    fields in the Apic Id.  It is also an upper bound on the number
-    //    of threads per package, but it has been verified that situations
-    //    happen were it is not exact.  In particular, on certain OS/chip
-    //    combinations where Intel(R) Hyper-Threading Technology is supported
-    //    by the chip but has
-    //    been disabled, the value of this field will be 2 (for a single core
-    //    chip).  On other OS/chip combinations supporting
-    //    Intel(R) Hyper-Threading Technology, the value of
-    //    this field will be 1 when Intel(R) Hyper-Threading Technology is
-    //    disabled and 2 when it is enabled.
-    //
-    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
-    //    value of this field (+1) determines the width of the core# field in
-    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
-    //    an upper bound, but the IA-32 architecture manual says that it is
-    //    exactly the number of cores per package, and I haven't seen any
-    //    case where it wasn't.
-    //
-    // From this information, deduce the package Id, core Id, and thread Id,
-    // and set the corresponding fields in the apicThreadInfo struct.
-    //
-    unsigned i;
-    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
+  // From here on, we can assume that it is safe to call
+  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
+  // __kmp_affinity_type = affinity_none.
+
+  // Save the affinity mask for the current thread.
+  kmp_affin_mask_t *oldMask;
+  KMP_CPU_ALLOC(oldMask);
+  KMP_ASSERT(oldMask != NULL);
+  __kmp_get_system_affinity(oldMask, TRUE);
+
+  // Run through each of the available contexts, binding the current thread
+  // to it, and obtaining the pertinent information using the cpuid instr.
+  //
+  // The relevant information is:
+  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
+  //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
+  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
+  //     of this field determines the width of the core# + thread# fields in the
+  //     Apic Id. It is also an upper bound on the number of threads per
+  //     package, but it has been verified that situations happen where it is not
+  //     exact. In particular, on certain OS/chip combinations where Intel(R)
+  //     Hyper-Threading Technology is supported by the chip but has been
+  //     disabled, the value of this field will be 2 (for a single core chip).
+  //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
+  //     Technology, the value of this field will be 1 when Intel(R)
+  //     Hyper-Threading Technology is disabled and 2 when it is enabled.
+  // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
+  //     of this field (+1) determines the width of the core# field in the Apic
+  //     Id. The comments in "cpucount.cpp" say that this value is an upper
+  //     bound, but the IA-32 architecture manual says that it is exactly the
+  //     number of cores per package, and I haven't seen any case where it
+  //     wasn't.
+  //
+  // From this information, deduce the package Id, core Id, and thread Id,
+  // and set the corresponding fields in the apicThreadInfo struct.
+  unsigned i;
+  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
       __kmp_avail_proc * sizeof(apicThreadInfo));
-    unsigned nApics = 0;
-    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
-        //
-        // Skip this proc if it is not included in the machine model.
-        //
-        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
-            continue;
-        }
-        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
-
-        __kmp_affinity_dispatch->bind_thread(i);
-        threadInfo[nApics].osId = i;
-
-        //
-        // The apic id and max threads per pkg come from cpuid(1).
-        //
-        __kmp_x86_cpuid(1, 0, &buf);
-        if (((buf.edx >> 9) & 1) == 0) {
-            __kmp_set_system_affinity(oldMask, TRUE);
-            __kmp_free(threadInfo);
-            KMP_CPU_FREE(oldMask);
-            *msg_id = kmp_i18n_str_ApicNotPresent;
-            return -1;
-        }
-        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
-        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
-        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
-            threadInfo[nApics].maxThreadsPerPkg = 1;
-        }
-
-        //
-        // Max cores per pkg comes from cpuid(4).
-        // 1 must be added to the encoded value.
-        //
-        // First, we need to check if cpuid(4) is supported on this chip.
-        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
-        // has the value n or greater.
-        //
-        __kmp_x86_cpuid(0, 0, &buf);
-        if (buf.eax >= 4) {
-            __kmp_x86_cpuid(4, 0, &buf);
-            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
-        }
-        else {
-            threadInfo[nApics].maxCoresPerPkg = 1;
-        }
-
-        //
-        // Infer the pkgId / coreId / threadId using only the info
-        // obtained locally.
-        //
-        int widthCT = __kmp_cpuid_mask_width(
-          threadInfo[nApics].maxThreadsPerPkg);
-        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
-
-        int widthC = __kmp_cpuid_mask_width(
-          threadInfo[nApics].maxCoresPerPkg);
-        int widthT = widthCT - widthC;
-        if (widthT < 0) {
-            //
-            // I've never seen this one happen, but I suppose it could, if
-            // the cpuid instruction on a chip was really screwed up.
-            // Make sure to restore the affinity mask before the tail call.
-            //
-            __kmp_set_system_affinity(oldMask, TRUE);
-            __kmp_free(threadInfo);
-            KMP_CPU_FREE(oldMask);
-            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-            return -1;
-        }
-
-        int maskC = (1 << widthC) - 1;
-        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
-          &maskC;
-
-        int maskT = (1 << widthT) - 1;
-        threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
-
-        nApics++;
-    }
-
-    //
-    // We've collected all the info we need.
-    // Restore the old affinity mask for this thread.
-    //
-    __kmp_set_system_affinity(oldMask, TRUE);
-
-    //
-    // If there's only one thread context to bind to, form an Address object
-    // with depth 1 and return immediately (or, if affinity is off, set
-    // address2os to NULL and return).
-    //
-    // If it is configured to omit the package level when there is only a
-    // single package, the logic at the end of this routine won't work if
-    // there is only a single thread - it would try to form an Address
-    // object with depth 0.
-    //
-    KMP_ASSERT(nApics > 0);
-    if (nApics == 1) {
-        __kmp_ncores = nPackages = 1;
-        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
-        if (__kmp_affinity_verbose) {
-            char buf[KMP_AFFIN_MASK_PRINT_LEN];
-            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
-            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
-            if (__kmp_affinity_respect_mask) {
-                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-            } else {
-                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-            }
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-
-        if (__kmp_affinity_type == affinity_none) {
-            __kmp_free(threadInfo);
-            KMP_CPU_FREE(oldMask);
-            return 0;
-        }
-
-        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
-        Address addr(1);
-        addr.labels[0] = threadInfo[0].pkgId;
-        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
-
-        if (__kmp_affinity_gran_levels < 0) {
-            __kmp_affinity_gran_levels = 0;
-        }
-
-        if (__kmp_affinity_verbose) {
-            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
-        }
-
-        __kmp_free(threadInfo);
-        KMP_CPU_FREE(oldMask);
-        return 1;
+  unsigned nApics = 0;
+  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
+    // Skip this proc if it is not included in the machine model.
+    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
+      continue;
+    }
+    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
+
+    __kmp_affinity_dispatch->bind_thread(i);
+    threadInfo[nApics].osId = i;
+
+    // The apic id and max threads per pkg come from cpuid(1).
+    __kmp_x86_cpuid(1, 0, &buf);
+    if (((buf.edx >> 9) & 1) == 0) {
+      __kmp_set_system_affinity(oldMask, TRUE);
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_ApicNotPresent;
+      return -1;
+    }
+    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
+    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
+    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
+      threadInfo[nApics].maxThreadsPerPkg = 1;
+    }
+
+    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
+    // value.
+    //
+    // First, we need to check if cpuid(4) is supported on this chip. To see if
+    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
+    // or greater.
+    __kmp_x86_cpuid(0, 0, &buf);
+    if (buf.eax >= 4) {
+      __kmp_x86_cpuid(4, 0, &buf);
+      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
+    } else {
+      threadInfo[nApics].maxCoresPerPkg = 1;
     }
 
-    //
-    // Sort the threadInfo table by physical Id.
-    //
-    qsort(threadInfo, nApics, sizeof(*threadInfo),
-      __kmp_affinity_cmp_apicThreadInfo_phys_id);
-
-    //
-    // The table is now sorted by pkgId / coreId / threadId, but we really
-    // don't know the radix of any of the fields.  pkgId's may be sparsely
-    // assigned among the chips on a system.  Although coreId's are usually
-    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
-    // [0..threadsPerCore-1], we don't want to make any such assumptions.
-    //
-    // For that matter, we don't know what coresPerPkg and threadsPerCore
-    // (or the total # packages) are at this point - we want to determine
-    // that now.  We only have an upper bound on the first two figures.
-    //
-    // We also perform a consistency check at this point: the values returned
-    // by the cpuid instruction for any thread bound to a given package had
-    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
-    //
-    nPackages = 1;
-    nCoresPerPkg = 1;
-    __kmp_nThreadsPerCore = 1;
-    unsigned nCores = 1;
-
-    unsigned pkgCt = 1;                         // to determine radii
-    unsigned lastPkgId = threadInfo[0].pkgId;
-    unsigned coreCt = 1;
-    unsigned lastCoreId = threadInfo[0].coreId;
-    unsigned threadCt = 1;
-    unsigned lastThreadId = threadInfo[0].threadId;
-
-                                                // intra-pkg consist checks
-    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
-    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
-
-    for (i = 1; i < nApics; i++) {
-        if (threadInfo[i].pkgId != lastPkgId) {
-            nCores++;
-            pkgCt++;
-            lastPkgId = threadInfo[i].pkgId;
-            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
-            coreCt = 1;
-            lastCoreId = threadInfo[i].coreId;
-            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
-            threadCt = 1;
-            lastThreadId = threadInfo[i].threadId;
-
-            //
-            // This is a different package, so go on to the next iteration
-            // without doing any consistency checks.  Reset the consistency
-            // check vars, though.
-            //
-            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
-            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
-            continue;
-        }
-
-        if (threadInfo[i].coreId != lastCoreId) {
-            nCores++;
-            coreCt++;
-            lastCoreId = threadInfo[i].coreId;
-            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
-            threadCt = 1;
-            lastThreadId = threadInfo[i].threadId;
-        }
-        else if (threadInfo[i].threadId != lastThreadId) {
-            threadCt++;
-            lastThreadId = threadInfo[i].threadId;
-        }
-        else {
-            __kmp_free(threadInfo);
-            KMP_CPU_FREE(oldMask);
-            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
-            return -1;
-        }
-
-        //
-        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
-        // fields agree between all the threads bounds to a given package.
-        //
-        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
-          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
-            __kmp_free(threadInfo);
-            KMP_CPU_FREE(oldMask);
-            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
-            return -1;
-        }
-    }
-    nPackages = pkgCt;
-    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
-    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
-
-    //
-    // When affinity is off, this routine will still be called to set
-    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
-    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
-    // correctly, and return now if affinity is not enabled.
-    //
-    __kmp_ncores = nCores;
+    // Infer the pkgId / coreId / threadId using only the info obtained locally.
+    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
+    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
+
+    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
+    int widthT = widthCT - widthC;
+    if (widthT < 0) {
+      // I've never seen this one happen, but I suppose it could, if the cpuid
+      // instruction on a chip was really screwed up. Make sure to restore the
+      // affinity mask before the tail call.
+      __kmp_set_system_affinity(oldMask, TRUE);
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+      return -1;
+    }
+
+    int maskC = (1 << widthC) - 1;
+    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
+
+    int maskT = (1 << widthT) - 1;
+    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
+
+    nApics++;
+  }
+
+  // We've collected all the info we need.
+  // Restore the old affinity mask for this thread.
+  __kmp_set_system_affinity(oldMask, TRUE);
+
+  // If there's only one thread context to bind to, form an Address object
+  // with depth 1 and return immediately (or, if affinity is off, set
+  // address2os to NULL and return).
+  //
+  // If it is configured to omit the package level when there is only a single
+  // package, the logic at the end of this routine won't work if there is only
+  // a single thread - it would try to form an Address object with depth 0.
+  KMP_ASSERT(nApics > 0);
+  if (nApics == 1) {
+    __kmp_ncores = nPackages = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
     if (__kmp_affinity_verbose) {
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
-        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
-        if (__kmp_affinity_respect_mask) {
-            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-        } else {
-            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-        }
-        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-        if (__kmp_affinity_uniform_topology()) {
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-        } else {
-            KMP_INFORM(NonUniform, "KMP_AFFINITY");
-        }
-        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-          __kmp_nThreadsPerCore, __kmp_ncores);
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
 
+      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
-    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-    KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
-    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-    for (i = 0; i < nApics; ++i) {
-        __kmp_pu_os_idx[i] = threadInfo[i].osId;
-    }
+
     if (__kmp_affinity_type == affinity_none) {
-        __kmp_free(threadInfo);
-        KMP_CPU_FREE(oldMask);
-        return 0;
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      return 0;
     }
 
-    //
-    // Now that we've determined the number of packages, the number of cores
-    // per package, and the number of threads per core, we can construct the
-    // data structure that is to be returned.
-    //
-    int pkgLevel = 0;
-    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
-    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
-    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
-
-    KMP_ASSERT(depth > 0);
-    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
-
-    for (i = 0; i < nApics; ++i) {
-        Address addr(depth);
-        unsigned os = threadInfo[i].osId;
-        int d = 0;
-
-        if (pkgLevel >= 0) {
-            addr.labels[d++] = threadInfo[i].pkgId;
-        }
-        if (coreLevel >= 0) {
-            addr.labels[d++] = threadInfo[i].coreId;
-        }
-        if (threadLevel >= 0) {
-            addr.labels[d++] = threadInfo[i].threadId;
-        }
-        (*address2os)[i] = AddrUnsPair(addr, os);
-    }
+    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
+    Address addr(1);
+    addr.labels[0] = threadInfo[0].pkgId;
+    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
 
     if (__kmp_affinity_gran_levels < 0) {
-        //
-        // Set the granularity level based on what levels are modeled
-        // in the machine topology map.
-        //
-        __kmp_affinity_gran_levels = 0;
-        if ((threadLevel >= 0)
-          && (__kmp_affinity_gran > affinity_gran_thread)) {
-            __kmp_affinity_gran_levels++;
-        }
-        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
-            __kmp_affinity_gran_levels++;
-        }
-        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
-            __kmp_affinity_gran_levels++;
-        }
+      __kmp_affinity_gran_levels = 0;
     }
 
     if (__kmp_affinity_verbose) {
-        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
-          coreLevel, threadLevel);
+      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
     }
 
     __kmp_free(threadInfo);
     KMP_CPU_FREE(oldMask);
-    return depth;
-}
-
-
-//
-// Intel(R) microarchitecture code name Nehalem, Dunnington and later
-// architectures support a newer interface for specifying the x2APIC Ids,
-// based on cpuid leaf 11.
-//
-static int
-__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
-  kmp_i18n_id_t *const msg_id)
-{
-    kmp_cpuid buf;
-
-    *address2os = NULL;
-    *msg_id = kmp_i18n_null;
+    return 1;
+  }
 
-    //
-    // Check to see if cpuid leaf 11 is supported.
-    //
-    __kmp_x86_cpuid(0, 0, &buf);
-    if (buf.eax < 11) {
-        *msg_id = kmp_i18n_str_NoLeaf11Support;
-        return -1;
+  // Sort the threadInfo table by physical Id.
+  qsort(threadInfo, nApics, sizeof(*threadInfo),
+        __kmp_affinity_cmp_apicThreadInfo_phys_id);
+
+  // The table is now sorted by pkgId / coreId / threadId, but we really don't
+  // know the radix of any of the fields. pkgId's may be sparsely assigned among
+  // the chips on a system. Although coreId's are usually assigned
+  // [0 .. coresPerPkg-1] and threadId's are usually assigned
+  // [0..threadsPerCore-1], we don't want to make any such assumptions.
+  //
+  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
+  // total # packages) are at this point - we want to determine that now. We
+  // only have an upper bound on the first two figures.
+  //
+  // We also perform a consistency check at this point: the values returned by
+  // the cpuid instruction for any thread bound to a given package had better
+  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
+  nPackages = 1;
+  nCoresPerPkg = 1;
+  __kmp_nThreadsPerCore = 1;
+  unsigned nCores = 1;
+
+  unsigned pkgCt = 1; // to determine radii
+  unsigned lastPkgId = threadInfo[0].pkgId;
+  unsigned coreCt = 1;
+  unsigned lastCoreId = threadInfo[0].coreId;
+  unsigned threadCt = 1;
+  unsigned lastThreadId = threadInfo[0].threadId;
+
+  // intra-pkg consistency checks
+  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
+  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
+
+  for (i = 1; i < nApics; i++) {
+    if (threadInfo[i].pkgId != lastPkgId) {
+      nCores++;
+      pkgCt++;
+      lastPkgId = threadInfo[i].pkgId;
+      if ((int)coreCt > nCoresPerPkg)
+        nCoresPerPkg = coreCt;
+      coreCt = 1;
+      lastCoreId = threadInfo[i].coreId;
+      if ((int)threadCt > __kmp_nThreadsPerCore)
+        __kmp_nThreadsPerCore = threadCt;
+      threadCt = 1;
+      lastThreadId = threadInfo[i].threadId;
+
+      // This is a different package, so go on to the next iteration without
+      // doing any consistency checks. Reset the consistency check vars, though.
+      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
+      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
+      continue;
+    }
+
+    if (threadInfo[i].coreId != lastCoreId) {
+      nCores++;
+      coreCt++;
+      lastCoreId = threadInfo[i].coreId;
+      if ((int)threadCt > __kmp_nThreadsPerCore)
+        __kmp_nThreadsPerCore = threadCt;
+      threadCt = 1;
+      lastThreadId = threadInfo[i].threadId;
+    } else if (threadInfo[i].threadId != lastThreadId) {
+      threadCt++;
+      lastThreadId = threadInfo[i].threadId;
+    } else {
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
+      return -1;
+    }
+
+    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
+    // fields agree between all the threads bound to a given package.
+    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
+        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+      return -1;
+    }
+  }
+  nPackages = pkgCt;
+  if ((int)coreCt > nCoresPerPkg)
+    nCoresPerPkg = coreCt;
+  if ((int)threadCt > __kmp_nThreadsPerCore)
+    __kmp_nThreadsPerCore = threadCt;
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return now if affinity is
+  // not enabled.
+  __kmp_ncores = nCores;
+  if (__kmp_affinity_verbose) {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
     }
-    __kmp_x86_cpuid(11, 0, &buf);
-    if (buf.ebx == 0) {
-        *msg_id = kmp_i18n_str_NoLeaf11Support;
-        return -1;
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    if (__kmp_affinity_uniform_topology()) {
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+    } else {
+      KMP_INFORM(NonUniform, "KMP_AFFINITY");
     }
+    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+  }
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  for (i = 0; i < nApics; ++i) {
+    __kmp_pu_os_idx[i] = threadInfo[i].osId;
+  }
+  if (__kmp_affinity_type == affinity_none) {
+    __kmp_free(threadInfo);
+    KMP_CPU_FREE(oldMask);
+    return 0;
+  }
 
-    //
-    // Find the number of levels in the machine topology.  While we're at it,
-    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
-    // try to get more accurate values later by explicitly counting them,
-    // but get reasonable defaults now, in case we return early.
-    //
-    int level;
-    int threadLevel = -1;
-    int coreLevel = -1;
-    int pkgLevel = -1;
-    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+  // Now that we've determined the number of packages, the number of cores per
+  // package, and the number of threads per core, we can construct the data
+  // structure that is to be returned.
+  int pkgLevel = 0;
+  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
+  int threadLevel =
+      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
+  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
+
+  KMP_ASSERT(depth > 0);
+  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
+
+  for (i = 0; i < nApics; ++i) {
+    Address addr(depth);
+    unsigned os = threadInfo[i].osId;
+    int d = 0;
 
-    for (level = 0;; level++) {
-        if (level > 31) {
-            //
-            // FIXME: Hack for DPD200163180
-            //
-            // If level is big then something went wrong -> exiting
-            //
-            // There could actually be 32 valid levels in the machine topology,
-            // but so far, the only machine we have seen which does not exit
-            // this loop before iteration 32 has fubar x2APIC settings.
-            //
-            // For now, just reject this case based upon loop trip count.
-            //
-            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-            return -1;
-        }
-        __kmp_x86_cpuid(11, level, &buf);
-        if (buf.ebx == 0) {
-            if (pkgLevel < 0) {
-                //
-                // Will infer nPackages from __kmp_xproc
-                //
-                pkgLevel = level;
-                level++;
-            }
-            break;
-        }
-        int kind = (buf.ecx >> 8) & 0xff;
-        if (kind == 1) {
-            //
-            // SMT level
-            //
-            threadLevel = level;
-            coreLevel = -1;
-            pkgLevel = -1;
-            __kmp_nThreadsPerCore = buf.ebx & 0xffff;
-            if (__kmp_nThreadsPerCore == 0) {
-                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-                return -1;
-            }
-        }
-        else if (kind == 2) {
-            //
-            // core level
-            //
-            coreLevel = level;
-            pkgLevel = -1;
-            nCoresPerPkg = buf.ebx & 0xffff;
-            if (nCoresPerPkg == 0) {
-                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-                return -1;
-            }
-        }
-        else {
-            if (level <= 0) {
-                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-                return -1;
-            }
-            if (pkgLevel >= 0) {
-                continue;
-            }
-            pkgLevel = level;
-            nPackages = buf.ebx & 0xffff;
-            if (nPackages == 0) {
-                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-                return -1;
-            }
-        }
-    }
-    int depth = level;
-
-    //
-    // In the above loop, "level" was counted from the finest level (usually
-    // thread) to the coarsest.  The caller expects that we will place the
-    // labels in (*address2os)[].first.labels[] in the inverse order, so
-    // we need to invert the vars saying which level means what.
-    //
-    if (threadLevel >= 0) {
-        threadLevel = depth - threadLevel - 1;
+    if (pkgLevel >= 0) {
+      addr.labels[d++] = threadInfo[i].pkgId;
     }
     if (coreLevel >= 0) {
-        coreLevel = depth - coreLevel - 1;
+      addr.labels[d++] = threadInfo[i].coreId;
     }
-    KMP_DEBUG_ASSERT(pkgLevel >= 0);
-    pkgLevel = depth - pkgLevel - 1;
-
-    //
-    // The algorithm used starts by setting the affinity to each available
-    // thread and retrieving info from the cpuid instruction, so if we are
-    // not capable of calling __kmp_get_system_affinity() and
-    // _kmp_get_system_affinity(), then we need to do something else - use
-    // the defaults that we calculated from issuing cpuid without binding
-    // to each proc.
-    //
-    if (! KMP_AFFINITY_CAPABLE())
-    {
-        //
-        // Hack to try and infer the machine topology using only the data
-        // available from cpuid on the current thread, and __kmp_xproc.
-        //
-        KMP_ASSERT(__kmp_affinity_type == affinity_none);
-
-        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
-        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            if (__kmp_affinity_uniform_topology()) {
-                KMP_INFORM(Uniform, "KMP_AFFINITY");
-            } else {
-                KMP_INFORM(NonUniform, "KMP_AFFINITY");
-            }
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-        return 0;
+    if (threadLevel >= 0) {
+      addr.labels[d++] = threadInfo[i].threadId;
     }
+    (*address2os)[i] = AddrUnsPair(addr, os);
+  }
 
-    //
-    //
-    // From here on, we can assume that it is safe to call
-    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
-    // even if __kmp_affinity_type = affinity_none.
-    //
-
-    //
-    // Save the affinity mask for the current thread.
-    //
-    kmp_affin_mask_t *oldMask;
-    KMP_CPU_ALLOC(oldMask);
-    __kmp_get_system_affinity(oldMask, TRUE);
-
-    //
-    // Allocate the data structure to be returned.
-    //
-    AddrUnsPair *retval = (AddrUnsPair *)
-      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
-
-    //
-    // Run through each of the available contexts, binding the current thread
-    // to it, and obtaining the pertinent information using the cpuid instr.
-    //
-    unsigned int proc;
-    int nApics = 0;
-    KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
-        //
-        // Skip this proc if it is not included in the machine model.
-        //
-        if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
-            continue;
-        }
-        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
-
-        __kmp_affinity_dispatch->bind_thread(proc);
-
-        //
-        // Extrach the labels for each level in the machine topology map
-        // from the Apic ID.
-        //
-        Address addr(depth);
-        int prev_shift = 0;
-
-        for (level = 0; level < depth; level++) {
-            __kmp_x86_cpuid(11, level, &buf);
-            unsigned apicId = buf.edx;
-            if (buf.ebx == 0) {
-                if (level != depth - 1) {
-                    KMP_CPU_FREE(oldMask);
-                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
-                    return -1;
-                }
-                addr.labels[depth - level - 1] = apicId >> prev_shift;
-                level++;
-                break;
-            }
-            int shift = buf.eax & 0x1f;
-            int mask = (1 << shift) - 1;
-            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
-            prev_shift = shift;
-        }
-        if (level != depth) {
-            KMP_CPU_FREE(oldMask);
-            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
-            return -1;
-        }
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled in the machine
+    // topology map.
+    __kmp_affinity_gran_levels = 0;
+    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
+      __kmp_affinity_gran_levels++;
+    }
+  }
+
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
+                                  coreLevel, threadLevel);
+  }
+
+  __kmp_free(threadInfo);
+  KMP_CPU_FREE(oldMask);
+  return depth;
+}
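
As an aside for readers following this hunk: once the threadInfo table is sorted
by (pkgId, coreId, threadId), the radix counting above reduces to a single linear
scan that bumps a counter whenever the package or core id changes. A minimal
standalone sketch of that scan follows; Rec and count_radices() are illustrative
names only, and the sketch skips the total-core count and the intra-package
consistency checks that the real routine performs.

    // Sketch only: derive package / cores-per-package / threads-per-core radii
    // from a hypothetical table already sorted by (pkgId, coreId, threadId).
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Rec { unsigned pkgId, coreId, threadId; };

    static void count_radices(const std::vector<Rec> &t) {
      unsigned nPkgs = 1, maxCoresPerPkg = 1, maxThreadsPerCore = 1;
      unsigned coreCt = 1, threadCt = 1;
      for (size_t i = 1; i < t.size(); ++i) {
        if (t[i].pkgId != t[i - 1].pkgId) {          // new package starts
          nPkgs++;
          maxCoresPerPkg = std::max(maxCoresPerPkg, coreCt);
          maxThreadsPerCore = std::max(maxThreadsPerCore, threadCt);
          coreCt = threadCt = 1;
        } else if (t[i].coreId != t[i - 1].coreId) { // new core, same package
          coreCt++;
          maxThreadsPerCore = std::max(maxThreadsPerCore, threadCt);
          threadCt = 1;
        } else {                                     // same core: count the thread
          threadCt++;
        }
      }
      maxCoresPerPkg = std::max(maxCoresPerPkg, coreCt);
      maxThreadsPerCore = std::max(maxThreadsPerCore, threadCt);
      std::printf("%u pkgs x %u cores/pkg x %u threads/core\n", nPkgs,
                  maxCoresPerPkg, maxThreadsPerCore);
    }

    int main() {
      // pkg0: 2 cores x 2 threads; pkg1: 1 core x 2 threads -> prints 2 x 2 x 2
      count_radices({{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
                     {1, 0, 0}, {1, 0, 1}});
      return 0;
    }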
 
-        retval[nApics] = AddrUnsPair(addr, proc);
-        nApics++;
+// Intel(R) microarchitecture code name Nehalem, Dunnington and later
+// architectures support a newer interface for specifying the x2APIC Ids,
+// based on cpuid leaf 11.
+static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
+                                              kmp_i18n_id_t *const msg_id) {
+  kmp_cpuid buf;
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Check to see if cpuid leaf 11 is supported.
+  __kmp_x86_cpuid(0, 0, &buf);
+  if (buf.eax < 11) {
+    *msg_id = kmp_i18n_str_NoLeaf11Support;
+    return -1;
+  }
+  __kmp_x86_cpuid(11, 0, &buf);
+  if (buf.ebx == 0) {
+    *msg_id = kmp_i18n_str_NoLeaf11Support;
+    return -1;
+  }
+
+  // Find the number of levels in the machine topology. While we're at it, get
+  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
+  // get more accurate values later by explicitly counting them, but get
+  // reasonable defaults now, in case we return early.
+  int level;
+  int threadLevel = -1;
+  int coreLevel = -1;
+  int pkgLevel = -1;
+  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+
+  for (level = 0;; level++) {
+    if (level > 31) {
+      // FIXME: Hack for DPD200163180
+      //
+      // If level is big then something went wrong -> exiting
+      //
+      // There could actually be 32 valid levels in the machine topology, but so
+      // far, the only machine we have seen which does not exit this loop before
+      // iteration 32 has fubar x2APIC settings.
+      //
+      // For now, just reject this case based upon loop trip count.
+      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+      return -1;
     }
+    __kmp_x86_cpuid(11, level, &buf);
+    if (buf.ebx == 0) {
+      if (pkgLevel < 0) {
+        // Will infer nPackages from __kmp_xproc
+        pkgLevel = level;
+        level++;
+      }
+      break;
+    }
+    int kind = (buf.ecx >> 8) & 0xff;
+    if (kind == 1) {
+      // SMT level
+      threadLevel = level;
+      coreLevel = -1;
+      pkgLevel = -1;
+      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
+      if (__kmp_nThreadsPerCore == 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+    } else if (kind == 2) {
+      // core level
+      coreLevel = level;
+      pkgLevel = -1;
+      nCoresPerPkg = buf.ebx & 0xffff;
+      if (nCoresPerPkg == 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+    } else {
+      if (level <= 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+      if (pkgLevel >= 0) {
+        continue;
+      }
+      pkgLevel = level;
+      nPackages = buf.ebx & 0xffff;
+      if (nPackages == 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+    }
+  }
+  int depth = level;
 
-    //
-    // We've collected all the info we need.
-    // Restore the old affinity mask for this thread.
-    //
-    __kmp_set_system_affinity(oldMask, TRUE);
-
-    //
-    // If there's only one thread context to bind to, return now.
-    //
-    KMP_ASSERT(nApics > 0);
-    if (nApics == 1) {
-        __kmp_ncores = nPackages = 1;
-        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
-        if (__kmp_affinity_verbose) {
-            char buf[KMP_AFFIN_MASK_PRINT_LEN];
-            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
-            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
-            if (__kmp_affinity_respect_mask) {
-                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-            } else {
-                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-            }
-            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
-              __kmp_nThreadsPerCore, __kmp_ncores);
-        }
-
-        if (__kmp_affinity_type == affinity_none) {
-            __kmp_free(retval);
-            KMP_CPU_FREE(oldMask);
-            return 0;
-        }
-
-        //
-        // Form an Address object which only includes the package level.
-        //
-        Address addr(1);
-        addr.labels[0] = retval[0].first.labels[pkgLevel];
-        retval[0].first = addr;
-
-        if (__kmp_affinity_gran_levels < 0) {
-            __kmp_affinity_gran_levels = 0;
-        }
-
-        if (__kmp_affinity_verbose) {
-            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
-        }
+  // In the above loop, "level" was counted from the finest level (usually
+  // thread) to the coarsest.  The caller expects that we will place the labels
+  // in (*address2os)[].first.labels[] in the inverse order, so we need to
+  // invert the vars saying which level means what.
+  if (threadLevel >= 0) {
+    threadLevel = depth - threadLevel - 1;
+  }
+  if (coreLevel >= 0) {
+    coreLevel = depth - coreLevel - 1;
+  }
+  KMP_DEBUG_ASSERT(pkgLevel >= 0);
+  pkgLevel = depth - pkgLevel - 1;
+
+  // The algorithm used starts by setting the affinity to each available thread
+  // and retrieving info from the cpuid instruction, so if we are not capable of
+  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
+  // we need to do something else - use the defaults that we calculated from
+  // issuing cpuid without binding to each proc.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // Hack to try and infer the machine topology using only the data
+    // available from cpuid on the current thread, and __kmp_xproc.
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
 
-        *address2os = retval;
-        KMP_CPU_FREE(oldMask);
-        return 1;
+    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (__kmp_affinity_uniform_topology()) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
+    return 0;
+  }
 
-    //
-    // Sort the table by physical Id.
-    //
-    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+  // From here on, we can assume that it is safe to call
+  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
+  // __kmp_affinity_type = affinity_none.
+
+  // Save the affinity mask for the current thread.
+  kmp_affin_mask_t *oldMask;
+  KMP_CPU_ALLOC(oldMask);
+  __kmp_get_system_affinity(oldMask, TRUE);
+
+  // Allocate the data structure to be returned.
+  AddrUnsPair *retval =
+      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+
+  // Run through each of the available contexts, binding the current thread
+  // to it, and obtaining the pertinent information using the cpuid instr.
+  unsigned int proc;
+  int nApics = 0;
+  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
+    // Skip this proc if it is not included in the machine model.
+    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+      continue;
+    }
+    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
+
+    __kmp_affinity_dispatch->bind_thread(proc);
+
+    // Extract labels for each level in the machine topology map from Apic ID.
+    Address addr(depth);
+    int prev_shift = 0;
 
-    //
-    // Find the radix at each of the levels.
-    //
-    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
     for (level = 0; level < depth; level++) {
-        totals[level] = 1;
-        maxCt[level] = 1;
-        counts[level] = 1;
-        last[level] = retval[0].first.labels[level];
-    }
-
-    //
-    // From here on, the iteration variable "level" runs from the finest
-    // level to the coarsest, i.e. we iterate forward through
-    // (*address2os)[].first.labels[] - in the previous loops, we iterated
-    // backwards.
-    //
-    for (proc = 1; (int)proc < nApics; proc++) {
-        int level;
-        for (level = 0; level < depth; level++) {
-            if (retval[proc].first.labels[level] != last[level]) {
-                int j;
-                for (j = level + 1; j < depth; j++) {
-                    totals[j]++;
-                    counts[j] = 1;
-                    // The line below causes printing incorrect topology information
-                    // in case the max value for some level (maxCt[level]) is encountered earlier than
-                    // some less value while going through the array.
-                    // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
-                    // whereas it must be 4.
-                    // TODO!!! Check if it can be commented safely
-                    //maxCt[j] = 1;
-                    last[j] = retval[proc].first.labels[j];
-                }
-                totals[level]++;
-                counts[level]++;
-                if (counts[level] > maxCt[level]) {
-                    maxCt[level] = counts[level];
-                }
-                last[level] = retval[proc].first.labels[level];
-                break;
-            }
-            else if (level == depth - 1) {
-                __kmp_free(last);
-                __kmp_free(maxCt);
-                __kmp_free(counts);
-                __kmp_free(totals);
-                __kmp_free(retval);
-                KMP_CPU_FREE(oldMask);
-                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
-                return -1;
-            }
+      __kmp_x86_cpuid(11, level, &buf);
+      unsigned apicId = buf.edx;
+      if (buf.ebx == 0) {
+        if (level != depth - 1) {
+          KMP_CPU_FREE(oldMask);
+          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+          return -1;
         }
-    }
+        addr.labels[depth - level - 1] = apicId >> prev_shift;
+        level++;
+        break;
+      }
+      int shift = buf.eax & 0x1f;
+      int mask = (1 << shift) - 1;
+      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
+      prev_shift = shift;
+    }
+    if (level != depth) {
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+      return -1;
+    }
+
+    retval[nApics] = AddrUnsPair(addr, proc);
+    nApics++;
+  }
+
+  // We've collected all the info we need.
+  // Restore the old affinity mask for this thread.
+  __kmp_set_system_affinity(oldMask, TRUE);
+
+  // If there's only one thread context to bind to, return now.
+  KMP_ASSERT(nApics > 0);
+  if (nApics == 1) {
+    __kmp_ncores = nPackages = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
 
-    //
-    // When affinity is off, this routine will still be called to set
-    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
-    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
-    // correctly, and return if affinity is not enabled.
-    //
-    if (threadLevel >= 0) {
-        __kmp_nThreadsPerCore = maxCt[threadLevel];
-    }
-    else {
-        __kmp_nThreadsPerCore = 1;
+      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
     }
-    nPackages = totals[pkgLevel];
 
-    if (coreLevel >= 0) {
-        __kmp_ncores = totals[coreLevel];
-        nCoresPerPkg = maxCt[coreLevel];
-    }
-    else {
-        __kmp_ncores = nPackages;
-        nCoresPerPkg = 1;
+    if (__kmp_affinity_type == affinity_none) {
+      __kmp_free(retval);
+      KMP_CPU_FREE(oldMask);
+      return 0;
     }
 
-    //
-    // Check to see if the machine topology is uniform
-    //
-    unsigned prod = maxCt[0];
-    for (level = 1; level < depth; level++) {
-       prod *= maxCt[level];
+    // Form an Address object which only includes the package level.
+    Address addr(1);
+    addr.labels[0] = retval[0].first.labels[pkgLevel];
+    retval[0].first = addr;
+
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
     }
-    bool uniform = (prod == totals[level - 1]);
 
-    //
-    // Print the machine topology summary.
-    //
     if (__kmp_affinity_verbose) {
-        char mask[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+    }
 
-        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
-        if (__kmp_affinity_respect_mask) {
-            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
-        } else {
-            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
-        }
-        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-        if (uniform) {
-            KMP_INFORM(Uniform, "KMP_AFFINITY");
-        } else {
-            KMP_INFORM(NonUniform, "KMP_AFFINITY");
-        }
+    *address2os = retval;
+    KMP_CPU_FREE(oldMask);
+    return 1;
+  }
 
-        kmp_str_buf_t buf;
-        __kmp_str_buf_init(&buf);
+  // Sort the table by physical Id.
+  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
 
-        __kmp_str_buf_print(&buf, "%d", totals[0]);
-        for (level = 1; level <= pkgLevel; level++) {
-            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+  // Find the radix at each of the levels.
+  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  for (level = 0; level < depth; level++) {
+    totals[level] = 1;
+    maxCt[level] = 1;
+    counts[level] = 1;
+    last[level] = retval[0].first.labels[level];
+  }
+
+  // From here on, the iteration variable "level" runs from the finest level to
+  // the coarsest, i.e. we iterate forward through
+  // (*address2os)[].first.labels[] - in the previous loops, we iterated
+  // backwards.
+  for (proc = 1; (int)proc < nApics; proc++) {
+    int level;
+    for (level = 0; level < depth; level++) {
+      if (retval[proc].first.labels[level] != last[level]) {
+        int j;
+        for (j = level + 1; j < depth; j++) {
+          totals[j]++;
+          counts[j] = 1;
+          // The line below causes incorrect topology information to be
+          // printed when the max value for some level (maxCt[level]) is
+          // encountered earlier than a smaller value while going through the
+          // array. For example, suppose pkg0 has 4 cores and pkg1 has 2
+          // cores; then maxCt[1] == 2, whereas it must be 4.
+          // TODO!!! Check if it can be commented out safely
+          // maxCt[j] = 1;
+          last[j] = retval[proc].first.labels[j];
+        }
+        totals[level]++;
+        counts[level]++;
+        if (counts[level] > maxCt[level]) {
+          maxCt[level] = counts[level];
         }
-        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
-          __kmp_nThreadsPerCore, __kmp_ncores);
-
-        __kmp_str_buf_free(&buf);
-    }
-    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-    KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
-    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-    for (proc = 0; (int)proc < nApics; ++proc) {
-        __kmp_pu_os_idx[proc] = retval[proc].second;
-    }
-    if (__kmp_affinity_type == affinity_none) {
+        last[level] = retval[proc].first.labels[level];
+        break;
+      } else if (level == depth - 1) {
         __kmp_free(last);
         __kmp_free(maxCt);
         __kmp_free(counts);
         __kmp_free(totals);
         __kmp_free(retval);
         KMP_CPU_FREE(oldMask);
-        return 0;
+        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+        return -1;
+      }
     }
+  }
 
-    //
-    // Find any levels with radiix 1, and remove them from the map
-    // (except for the package level).
-    //
-    int new_depth = 0;
-    for (level = 0; level < depth; level++) {
-        if ((maxCt[level] == 1) && (level != pkgLevel)) {
-           continue;
-        }
-        new_depth++;
-    }
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return if affinity is not
+  // enabled.
+  if (threadLevel >= 0) {
+    __kmp_nThreadsPerCore = maxCt[threadLevel];
+  } else {
+    __kmp_nThreadsPerCore = 1;
+  }
+  nPackages = totals[pkgLevel];
 
-    //
-    // If we are removing any levels, allocate a new vector to return,
-    // and copy the relevant information to it.
-    //
-    if (new_depth != depth) {
-        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
-          sizeof(AddrUnsPair) * nApics);
-        for (proc = 0; (int)proc < nApics; proc++) {
-            Address addr(new_depth);
-            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
-        }
-        int new_level = 0;
-        int newPkgLevel = -1;
-        int newCoreLevel = -1;
-        int newThreadLevel = -1;
-        int i;
-        for (level = 0; level < depth; level++) {
-            if ((maxCt[level] == 1)
-              && (level != pkgLevel)) {
-                //
-                // Remove this level. Never remove the package level
-                //
-                continue;
-            }
-            if (level == pkgLevel) {
-                newPkgLevel = level;
-            }
-            if (level == coreLevel) {
-                newCoreLevel = level;
-            }
-            if (level == threadLevel) {
-                newThreadLevel = level;
-            }
-            for (proc = 0; (int)proc < nApics; proc++) {
-                new_retval[proc].first.labels[new_level]
-                  = retval[proc].first.labels[level];
-            }
-            new_level++;
-        }
+  if (coreLevel >= 0) {
+    __kmp_ncores = totals[coreLevel];
+    nCoresPerPkg = maxCt[coreLevel];
+  } else {
+    __kmp_ncores = nPackages;
+    nCoresPerPkg = 1;
+  }
 
-        __kmp_free(retval);
-        retval = new_retval;
-        depth = new_depth;
-        pkgLevel = newPkgLevel;
-        coreLevel = newCoreLevel;
-        threadLevel = newThreadLevel;
+  // Check to see if the machine topology is uniform
+  unsigned prod = maxCt[0];
+  for (level = 1; level < depth; level++) {
+    prod *= maxCt[level];
+  }
+  bool uniform = (prod == totals[level - 1]);
+
+  // Print the machine topology summary.
+  if (__kmp_affinity_verbose) {
+    char mask[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
     }
-
-    if (__kmp_affinity_gran_levels < 0) {
-        //
-        // Set the granularity level based on what levels are modeled
-        // in the machine topology map.
-        //
-        __kmp_affinity_gran_levels = 0;
-        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
-            __kmp_affinity_gran_levels++;
-        }
-        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
-            __kmp_affinity_gran_levels++;
-        }
-        if (__kmp_affinity_gran > affinity_gran_package) {
-            __kmp_affinity_gran_levels++;
-        }
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    if (uniform) {
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+    } else {
+      KMP_INFORM(NonUniform, "KMP_AFFINITY");
     }
 
-    if (__kmp_affinity_verbose) {
-        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
-          coreLevel, threadLevel);
-    }
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
 
+    __kmp_str_buf_print(&buf, "%d", totals[0]);
+    for (level = 1; level <= pkgLevel; level++) {
+      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+    }
+    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+
+    __kmp_str_buf_free(&buf);
+  }
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  for (proc = 0; (int)proc < nApics; ++proc) {
+    __kmp_pu_os_idx[proc] = retval[proc].second;
+  }
+  if (__kmp_affinity_type == affinity_none) {
     __kmp_free(last);
     __kmp_free(maxCt);
     __kmp_free(counts);
     __kmp_free(totals);
+    __kmp_free(retval);
     KMP_CPU_FREE(oldMask);
-    *address2os = retval;
-    return depth;
-}
-
-
-# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+    return 0;
+  }
 
+  // Find any levels with radix 1, and remove them from the map
+  // (except for the package level).
+  int new_depth = 0;
+  for (level = 0; level < depth; level++) {
+    if ((maxCt[level] == 1) && (level != pkgLevel)) {
+      continue;
+    }
+    new_depth++;
+  }
+
+  // If we are removing any levels, allocate a new vector to return,
+  // and copy the relevant information to it.
+  if (new_depth != depth) {
+    AddrUnsPair *new_retval =
+        (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
+    for (proc = 0; (int)proc < nApics; proc++) {
+      Address addr(new_depth);
+      new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
+    }
+    int new_level = 0;
+    int newPkgLevel = -1;
+    int newCoreLevel = -1;
+    int newThreadLevel = -1;
+    int i;
+    for (level = 0; level < depth; level++) {
+      if ((maxCt[level] == 1) && (level != pkgLevel)) {
+        // Remove this level. Never remove the package level
+        continue;
+      }
+      if (level == pkgLevel) {
+        newPkgLevel = level;
+      }
+      if (level == coreLevel) {
+        newCoreLevel = level;
+      }
+      if (level == threadLevel) {
+        newThreadLevel = level;
+      }
+      for (proc = 0; (int)proc < nApics; proc++) {
+        new_retval[proc].first.labels[new_level] =
+            retval[proc].first.labels[level];
+      }
+      new_level++;
+    }
 
-#define osIdIndex       0
-#define threadIdIndex   1
-#define coreIdIndex     2
-#define pkgIdIndex      3
-#define nodeIdIndex     4
+    __kmp_free(retval);
+    retval = new_retval;
+    depth = new_depth;
+    pkgLevel = newPkgLevel;
+    coreLevel = newCoreLevel;
+    threadLevel = newThreadLevel;
+  }
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled
+    // in the machine topology map.
+    __kmp_affinity_gran_levels = 0;
+    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if (__kmp_affinity_gran > affinity_gran_package) {
+      __kmp_affinity_gran_levels++;
+    }
+  }
+
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
+                                  threadLevel);
+  }
+
+  __kmp_free(last);
+  __kmp_free(maxCt);
+  __kmp_free(counts);
+  __kmp_free(totals);
+  KMP_CPU_FREE(oldMask);
+  *address2os = retval;
+  return depth;
+}
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
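For orientation on the leaf 11 path in __kmp_affinity_create_x2apicid_map above:
each per-level label is peeled off the x2APIC id using the cumulative shift
widths that cpuid leaf 11 reports in EAX, with whatever bits remain above the
last shift taken as the package id. A minimal decoding sketch, assuming the
shift widths were already gathered into a small array (decode_x2apic and its
inputs are illustrative, not runtime names; the runtime additionally stores the
labels coarsest-first):

    #include <cstdio>

    // shifts[level] is the cumulative bit width reported by cpuid leaf 11 for
    // that level (e.g. shifts[0] = SMT bits, shifts[1] = SMT + core bits).
    static void decode_x2apic(unsigned apicId, const int *shifts, int depth) {
      int prev_shift = 0;
      for (int level = 0; level < depth; ++level) {
        unsigned label;
        if (level == depth - 1) {
          label = apicId >> prev_shift;             // leftover bits: package id
        } else {
          unsigned mask = (1u << shifts[level]) - 1;
          label = (apicId & mask) >> prev_shift;    // bits for this level only
          prev_shift = shifts[level];
        }
        std::printf("level %d -> label %u\n", level, label);
      }
    }

    int main() {
      const int shifts[] = {1, 4};    // 1 SMT bit, 4 bits through the core level
      decode_x2apic(0x15, shifts, 3); // thread 1, core 2, package 1
      return 0;
    }
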
+#define osIdIndex 0
+#define threadIdIndex 1
+#define coreIdIndex 2
+#define pkgIdIndex 3
+#define nodeIdIndex 4
 
 typedef unsigned *ProcCpuInfo;
 static unsigned maxIndex = pkgIdIndex;
 
-
-static int
-__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
-{
-    const unsigned *aa = (const unsigned *)a;
-    const unsigned *bb = (const unsigned *)b;
-    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
-    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
-    return 0;
+static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) {
+  const unsigned *aa = (const unsigned *)a;
+  const unsigned *bb = (const unsigned *)b;
+  if (aa[osIdIndex] < bb[osIdIndex])
+    return -1;
+  if (aa[osIdIndex] > bb[osIdIndex])
+    return 1;
+  return 0;
 };
 
-
-static int
-__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
-{
-    unsigned i;
-    const unsigned *aa = *((const unsigned **)a);
-    const unsigned *bb = *((const unsigned **)b);
-    for (i = maxIndex; ; i--) {
-        if (aa[i] < bb[i]) return -1;
-        if (aa[i] > bb[i]) return 1;
-        if (i == osIdIndex) break;
-    }
-    return 0;
+static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
+                                                  const void *b) {
+  unsigned i;
+  const unsigned *aa = *((const unsigned **)a);
+  const unsigned *bb = *((const unsigned **)b);
+  for (i = maxIndex;; i--) {
+    if (aa[i] < bb[i])
+      return -1;
+    if (aa[i] > bb[i])
+      return 1;
+    if (i == osIdIndex)
+      break;
+  }
+  return 0;
 }
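
The cpuinfo records that these comparators operate on are flat arrays of
unsigned values indexed by the osIdIndex..nodeIdIndex constants above, with
maxIndex defaulting to pkgIdIndex unless node_<n> fields push it higher. A small
usage sketch of the physical-id comparator with qsort, using made-up records
laid out with that default (the sample data is hypothetical; the parser below
fills these arrays from /proc/cpuinfo instead):

    #include <stdio.h>
    #include <stdlib.h>

    // Assumes the osIdIndex..pkgIdIndex #defines, maxIndex == pkgIdIndex, and
    // __kmp_affinity_cmp_ProcCpuInfo_phys_id as defined above.
    int main() {
      unsigned r0[] = {2, 1, 0, 1}; // os 2: pkg 1, core 0, thread 1
      unsigned r1[] = {0, 0, 1, 0}; // os 0: pkg 0, core 1, thread 0
      unsigned r2[] = {1, 0, 0, 0}; // os 1: pkg 0, core 0, thread 0
      unsigned *recs[] = {r0, r1, r2};
      qsort(recs, 3, sizeof(*recs), __kmp_affinity_cmp_ProcCpuInfo_phys_id);
      // Sorted by pkg, then core, then thread, then os id: os 1, os 0, os 2.
      for (int i = 0; i < 3; ++i)
        printf("os %u -> pkg %u core %u thread %u\n", recs[i][osIdIndex],
               recs[i][pkgIdIndex], recs[i][coreIdIndex],
               recs[i][threadIdIndex]);
      return 0;
    }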
 
-
-//
 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
 // affinity map.
-//
-static int
-__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
-  kmp_i18n_id_t *const msg_id, FILE *f)
-{
-    *address2os = NULL;
-    *msg_id = kmp_i18n_null;
-
-    //
-    // Scan of the file, and count the number of "processor" (osId) fields,
-    // and find the highest value of <n> for a node_<n> field.
-    //
-    char buf[256];
-    unsigned num_records = 0;
-    while (! feof(f)) {
-        buf[sizeof(buf) - 1] = 1;
-        if (! fgets(buf, sizeof(buf), f)) {
-            //
-            // Read errors presumably because of EOF
-            //
-            break;
-        }
-
-        char s1[] = "processor";
-        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
-            num_records++;
-            continue;
-        }
-
-        //
-        // FIXME - this will match "node_<n> <garbage>"
-        //
-        unsigned level;
-        if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
-            if (nodeIdIndex + level >= maxIndex) {
-                maxIndex = nodeIdIndex + level;
-            }
-            continue;
-        }
-    }
-
-    //
-    // Check for empty file / no valid processor records, or too many.
-    // The number of records can't exceed the number of valid bits in the
-    // affinity mask.
-    //
-    if (num_records == 0) {
-        *line = 0;
-        *msg_id = kmp_i18n_str_NoProcRecords;
-        return -1;
-    }
-    if (num_records > (unsigned)__kmp_xproc) {
-        *line = 0;
-        *msg_id = kmp_i18n_str_TooManyProcRecords;
-        return -1;
-    }
-
-    //
-    // Set the file pointer back to the begginning, so that we can scan the
-    // file again, this time performing a full parse of the data.
-    // Allocate a vector of ProcCpuInfo object, where we will place the data.
-    // Adding an extra element at the end allows us to remove a lot of extra
-    // checks for termination conditions.
-    //
-    if (fseek(f, 0, SEEK_SET) != 0) {
-        *line = 0;
-        *msg_id = kmp_i18n_str_CantRewindCpuinfo;
-        return -1;
-    }
-
-    //
-    // Allocate the array of records to store the proc info in.  The dummy
-    // element at the end makes the logic in filling them out easier to code.
-    //
-    unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
-      * sizeof(unsigned *));
-    unsigned i;
-    for (i = 0; i <= num_records; i++) {
-        threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
-          * sizeof(unsigned));
-    }
-
-#define CLEANUP_THREAD_INFO \
-    for (i = 0; i <= num_records; i++) {                                \
-        __kmp_free(threadInfo[i]);                                      \
-    }                                                                   \
-    __kmp_free(threadInfo);
-
-    //
-    // A value of UINT_MAX means that we didn't find the field
-    //
-    unsigned __index;
-
-#define INIT_PROC_INFO(p) \
-    for (__index = 0; __index <= maxIndex; __index++) {                 \
-        (p)[__index] = UINT_MAX;                                        \
-    }
-
-    for (i = 0; i <= num_records; i++) {
-        INIT_PROC_INFO(threadInfo[i]);
+static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
+                                             int *line,
+                                             kmp_i18n_id_t *const msg_id,
+                                             FILE *f) {
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Scan the file, counting the number of "processor" (osId) fields and
+  // finding the highest value of <n> for a node_<n> field.
+  char buf[256];
+  unsigned num_records = 0;
+  while (!feof(f)) {
+    buf[sizeof(buf) - 1] = 1;
+    if (!fgets(buf, sizeof(buf), f)) {
+      // Read errors presumably because of EOF
+      break;
+    }
+
+    char s1[] = "processor";
+    if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
+      num_records++;
+      continue;
+    }
+
+    // FIXME - this will match "node_<n> <garbage>"
+    unsigned level;
+    if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
+      if (nodeIdIndex + level >= maxIndex) {
+        maxIndex = nodeIdIndex + level;
+      }
+      continue;
     }
+  }
 
-    unsigned num_avail = 0;
+  // Check for empty file / no valid processor records, or too many. The number
+  // of records can't exceed the number of valid bits in the affinity mask.
+  if (num_records == 0) {
     *line = 0;
-    while (! feof(f)) {
-        //
-        // Create an inner scoping level, so that all the goto targets at the
-        // end of the loop appear in an outer scoping level.  This avoids
-        // warnings about jumping past an initialization to a target in the
-        // same block.
-        //
-        {
-            buf[sizeof(buf) - 1] = 1;
-            bool long_line = false;
-            if (! fgets(buf, sizeof(buf), f)) {
-                //
-                // Read errors presumably because of EOF
-                //
-                // If there is valid data in threadInfo[num_avail], then fake
-                // a blank line in ensure that the last address gets parsed.
-                //
-                bool valid = false;
-                for (i = 0; i <= maxIndex; i++) {
-                    if (threadInfo[num_avail][i] != UINT_MAX) {
-                        valid = true;
-                    }
-                }
-                if (! valid) {
-                    break;
-                }
-                buf[0] = 0;
-            } else if (!buf[sizeof(buf) - 1]) {
-                //
-                // The line is longer than the buffer.  Set a flag and don't
-                // emit an error if we were going to ignore the line, anyway.
-                //
-                long_line = true;
-
-#define CHECK_LINE \
-    if (long_line) {                                                    \
-        CLEANUP_THREAD_INFO;                                            \
-        *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
-        return -1;                                                      \
-    }
-            }
-            (*line)++;
+    *msg_id = kmp_i18n_str_NoProcRecords;
+    return -1;
+  }
+  if (num_records > (unsigned)__kmp_xproc) {
+    *line = 0;
+    *msg_id = kmp_i18n_str_TooManyProcRecords;
+    return -1;
+  }
+
+  // Set the file pointer back to the beginning, so that we can scan the file
+  // again, this time performing a full parse of the data. Allocate a vector of
+  // ProcCpuInfo object, where we will place the data. Adding an extra element
+  // at the end allows us to remove a lot of extra checks for termination
+  // conditions.
+  if (fseek(f, 0, SEEK_SET) != 0) {
+    *line = 0;
+    *msg_id = kmp_i18n_str_CantRewindCpuinfo;
+    return -1;
+  }
+
+  // Allocate the array of records to store the proc info in.  The dummy
+  // element at the end makes the logic in filling them out easier to code.
+  unsigned **threadInfo =
+      (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
+  unsigned i;
+  for (i = 0; i <= num_records; i++) {
+    threadInfo[i] =
+        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  }
+
+#define CLEANUP_THREAD_INFO                                                    \
+  for (i = 0; i <= num_records; i++) {                                         \
+    __kmp_free(threadInfo[i]);                                                 \
+  }                                                                            \
+  __kmp_free(threadInfo);
+
+  // A value of UINT_MAX means that we didn't find the field
+  unsigned __index;
+
+#define INIT_PROC_INFO(p)                                                      \
+  for (__index = 0; __index <= maxIndex; __index++) {                          \
+    (p)[__index] = UINT_MAX;                                                   \
+  }
+
+  for (i = 0; i <= num_records; i++) {
+    INIT_PROC_INFO(threadInfo[i]);
+  }
+
+  unsigned num_avail = 0;
+  *line = 0;
+  while (!feof(f)) {
+    // Create an inner scoping level, so that all the goto targets at the end of
+    // the loop appear in an outer scoping level. This avoids warnings about
+    // jumping past an initialization to a target in the same block.
+    {
+      buf[sizeof(buf) - 1] = 1;
+      bool long_line = false;
+      if (!fgets(buf, sizeof(buf), f)) {
+        // Read errors presumably because of EOF
+        // If there is valid data in threadInfo[num_avail], then fake a blank
+        // line to ensure that the last address gets parsed.
+        bool valid = false;
+        for (i = 0; i <= maxIndex; i++) {
+          if (threadInfo[num_avail][i] != UINT_MAX) {
+            valid = true;
+          }
+        }
+        if (!valid) {
+          break;
+        }
+        buf[0] = 0;
+      } else if (!buf[sizeof(buf) - 1]) {
+        // The line is longer than the buffer.  Set a flag and don't
+        // emit an error if we were going to ignore the line, anyway.
+        long_line = true;
+
+#define CHECK_LINE                                                             \
+  if (long_line) {                                                             \
+    CLEANUP_THREAD_INFO;                                                       \
+    *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
+    return -1;                                                                 \
+  }
+      }
+      (*line)++;
 
-            char s1[] = "processor";
-            if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
-                CHECK_LINE;
-                char *p = strchr(buf + sizeof(s1) - 1, ':');
-                unsigned val;
-                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
-                if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
-                threadInfo[num_avail][osIdIndex] = val;
+      char s1[] = "processor";
+      if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s1) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][osIdIndex] = val;
 #if KMP_OS_LINUX && USE_SYSFS_INFO
-                char path[256];
-                KMP_SNPRINTF(path, sizeof(path),
-                    "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
-                    threadInfo[num_avail][osIdIndex]);
-                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
-
-                KMP_SNPRINTF(path, sizeof(path),
-                    "/sys/devices/system/cpu/cpu%u/topology/core_id",
-                    threadInfo[num_avail][osIdIndex]);
-                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
-                continue;
+        char path[256];
+        KMP_SNPRINTF(
+            path, sizeof(path),
+            "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
+            threadInfo[num_avail][osIdIndex]);
+        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
+
+        KMP_SNPRINTF(path, sizeof(path),
+                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
+                     threadInfo[num_avail][osIdIndex]);
+        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
+        continue;
 #else
-            }
-            char s2[] = "physical id";
-            if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
-                CHECK_LINE;
-                char *p = strchr(buf + sizeof(s2) - 1, ':');
-                unsigned val;
-                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
-                if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
-                threadInfo[num_avail][pkgIdIndex] = val;
-                continue;
-            }
-            char s3[] = "core id";
-            if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
-                CHECK_LINE;
-                char *p = strchr(buf + sizeof(s3) - 1, ':');
-                unsigned val;
-                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
-                if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
-                threadInfo[num_avail][coreIdIndex] = val;
-                continue;
+      }
+      char s2[] = "physical id";
+      if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s2) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][pkgIdIndex] = val;
+        continue;
+      }
+      char s3[] = "core id";
+      if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s3) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][coreIdIndex] = val;
+        continue;
 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
-            }
-            char s4[] = "thread id";
-            if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
-                CHECK_LINE;
-                char *p = strchr(buf + sizeof(s4) - 1, ':');
-                unsigned val;
-                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
-                if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
-                threadInfo[num_avail][threadIdIndex] = val;
-                continue;
-            }
-            unsigned level;
-            if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
-                CHECK_LINE;
-                char *p = strchr(buf + sizeof(s4) - 1, ':');
-                unsigned val;
-                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
-                KMP_ASSERT(nodeIdIndex + level <= maxIndex);
-                if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
-                threadInfo[num_avail][nodeIdIndex + level] = val;
-                continue;
-            }
-
-            //
-            // We didn't recognize the leading token on the line.
-            // There are lots of leading tokens that we don't recognize -
-            // if the line isn't empty, go on to the next line.
-            //
-            if ((*buf != 0) && (*buf != '\n')) {
-                //
-                // If the line is longer than the buffer, read characters
-                // until we find a newline.
-                //
-                if (long_line) {
-                    int ch;
-                    while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
-                }
-                continue;
-            }
-
-            //
-            // A newline has signalled the end of the processor record.
-            // Check that there aren't too many procs specified.
-            //
-            if ((int)num_avail == __kmp_xproc) {
-                CLEANUP_THREAD_INFO;
-                *msg_id = kmp_i18n_str_TooManyEntries;
-                return -1;
-            }
-
-            //
-            // Check for missing fields.  The osId field must be there, and we
-            // currently require that the physical id field is specified, also.
-            //
-            if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
-                CLEANUP_THREAD_INFO;
-                *msg_id = kmp_i18n_str_MissingProcField;
-                return -1;
-            }
-            if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
-                CLEANUP_THREAD_INFO;
-                *msg_id = kmp_i18n_str_MissingPhysicalIDField;
-                return -1;
-            }
-
-            //
-            // Skip this proc if it is not included in the machine model.
-            //
-            if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) {
-                INIT_PROC_INFO(threadInfo[num_avail]);
-                continue;
-            }
+      }
+      char s4[] = "thread id";
+      if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s4) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][threadIdIndex] = val;
+        continue;
+      }
+      unsigned level;
+      if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s4) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        KMP_ASSERT(nodeIdIndex + level <= maxIndex);
+        if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][nodeIdIndex + level] = val;
+        continue;
+      }
 
-            //
-            // We have a successful parse of this proc's info.
-            // Increment the counter, and prepare for the next proc.
-            //
-            num_avail++;
-            KMP_ASSERT(num_avail <= num_records);
-            INIT_PROC_INFO(threadInfo[num_avail]);
+      // We didn't recognize the leading token on the line. There are lots of
+      // leading tokens that we don't recognize - if the line isn't empty, go on
+      // to the next line.
+      if ((*buf != 0) && (*buf != '\n')) {
+        // If the line is longer than the buffer, read characters
+        // until we find a newline.
+        if (long_line) {
+          int ch;
+          while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
+            ;
         }
         continue;
+      }
 
-        no_val:
+      // A newline has signalled the end of the processor record.
+      // Check that there aren't too many procs specified.
+      if ((int)num_avail == __kmp_xproc) {
         CLEANUP_THREAD_INFO;
-        *msg_id = kmp_i18n_str_MissingValCpuinfo;
+        *msg_id = kmp_i18n_str_TooManyEntries;
         return -1;
+      }
 
-        dup_field:
+      // Check for missing fields.  The osId field must be there, and we
+      // currently require that the physical id field is specified, also.
+      if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
         CLEANUP_THREAD_INFO;
-        *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
+        *msg_id = kmp_i18n_str_MissingProcField;
         return -1;
-    }
-    *line = 0;
-
-# if KMP_MIC && REDUCE_TEAM_SIZE
-    unsigned teamSize = 0;
-# endif // KMP_MIC && REDUCE_TEAM_SIZE
-
-    // check for num_records == __kmp_xproc ???
-
-    //
-    // If there's only one thread context to bind to, form an Address object
-    // with depth 1 and return immediately (or, if affinity is off, set
-    // address2os to NULL and return).
-    //
-    // If it is configured to omit the package level when there is only a
-    // single package, the logic at the end of this routine won't work if
-    // there is only a single thread - it would try to form an Address
-    // object with depth 0.
-    //
-    KMP_ASSERT(num_avail > 0);
-    KMP_ASSERT(num_avail <= num_records);
-    if (num_avail == 1) {
-        __kmp_ncores = 1;
-        __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
-        if (__kmp_affinity_verbose) {
-            if (! KMP_AFFINITY_CAPABLE()) {
-                KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
-                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-                KMP_INFORM(Uniform, "KMP_AFFINITY");
-            }
-            else {
-                char buf[KMP_AFFIN_MASK_PRINT_LEN];
-                __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-                  __kmp_affin_fullMask);
-                KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
-                if (__kmp_affinity_respect_mask) {
-                    KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-                } else {
-                    KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-                }
-                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-                KMP_INFORM(Uniform, "KMP_AFFINITY");
-            }
-            int index;
-            kmp_str_buf_t buf;
-            __kmp_str_buf_init(&buf);
-            __kmp_str_buf_print(&buf, "1");
-            for (index = maxIndex - 1; index > pkgIdIndex; index--) {
-                __kmp_str_buf_print(&buf, " x 1");
-            }
-            KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
-            __kmp_str_buf_free(&buf);
-        }
-
-        if (__kmp_affinity_type == affinity_none) {
-            CLEANUP_THREAD_INFO;
-            return 0;
-        }
-
-        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
-        Address addr(1);
-        addr.labels[0] = threadInfo[0][pkgIdIndex];
-        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
-
-        if (__kmp_affinity_gran_levels < 0) {
-            __kmp_affinity_gran_levels = 0;
-        }
-
-        if (__kmp_affinity_verbose) {
-            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
-        }
-
+      }
+      if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
         CLEANUP_THREAD_INFO;
-        return 1;
-    }
-
-    //
-    // Sort the threadInfo table by physical Id.
-    //
-    qsort(threadInfo, num_avail, sizeof(*threadInfo),
-      __kmp_affinity_cmp_ProcCpuInfo_phys_id);
-
-    //
-    // The table is now sorted by pkgId / coreId / threadId, but we really
-    // don't know the radix of any of the fields.  pkgId's may be sparsely
-    // assigned among the chips on a system.  Although coreId's are usually
-    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
-    // [0..threadsPerCore-1], we don't want to make any such assumptions.
-    //
-    // For that matter, we don't know what coresPerPkg and threadsPerCore
-    // (or the total # packages) are at this point - we want to determine
-    // that now.  We only have an upper bound on the first two figures.
-    //
-    unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
-      * sizeof(unsigned));
-    unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
-      * sizeof(unsigned));
-    unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
-      * sizeof(unsigned));
-    unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
-      * sizeof(unsigned));
-
-    bool assign_thread_ids = false;
-    unsigned threadIdCt;
-    unsigned index;
+        *msg_id = kmp_i18n_str_MissingPhysicalIDField;
+        return -1;
+      }
 
-    restart_radix_check:
-    threadIdCt = 0;
+      // Skip this proc if it is not included in the machine model.
+      if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
+                         __kmp_affin_fullMask)) {
+        INIT_PROC_INFO(threadInfo[num_avail]);
+        continue;
+      }
 
-    //
-    // Initialize the counter arrays with data from threadInfo[0].
-    //
-    if (assign_thread_ids) {
-        if (threadInfo[0][threadIdIndex] == UINT_MAX) {
-            threadInfo[0][threadIdIndex] = threadIdCt++;
-        }
-        else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
-            threadIdCt = threadInfo[0][threadIdIndex] + 1;
-        }
-    }
-    for (index = 0; index <= maxIndex; index++) {
-        counts[index] = 1;
-        maxCt[index] = 1;
-        totals[index] = 1;
-        lastId[index] = threadInfo[0][index];;
+      // We have a successful parse of this proc's info.
+      // Increment the counter, and prepare for the next proc.
+      num_avail++;
+      KMP_ASSERT(num_avail <= num_records);
+      INIT_PROC_INFO(threadInfo[num_avail]);
     }
+    continue;
 
-    //
-    // Run through the rest of the OS procs.
-    //
-    for (i = 1; i < num_avail; i++) {
-        //
-        // Find the most significant index whose id differs
-        // from the id for the previous OS proc.
-        //
-        for (index = maxIndex; index >= threadIdIndex; index--) {
-            if (assign_thread_ids && (index == threadIdIndex)) {
-                //
-                // Auto-assign the thread id field if it wasn't specified.
-                //
-                if (threadInfo[i][threadIdIndex] == UINT_MAX) {
-                    threadInfo[i][threadIdIndex] = threadIdCt++;
-                }
+  no_val:
+    CLEANUP_THREAD_INFO;
+    *msg_id = kmp_i18n_str_MissingValCpuinfo;
+    return -1;
 
-                //
-                // Aparrently the thread id field was specified for some
-                // entries and not others.  Start the thread id counter
-                // off at the next higher thread id.
-                //
-                else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
-                    threadIdCt = threadInfo[i][threadIdIndex] + 1;
-                }
-            }
-            if (threadInfo[i][index] != lastId[index]) {
-                //
-                // Run through all indices which are less significant,
-                // and reset the counts to 1.
-                //
-                // At all levels up to and including index, we need to
-                // increment the totals and record the last id.
-                //
-                unsigned index2;
-                for (index2 = threadIdIndex; index2 < index; index2++) {
-                    totals[index2]++;
-                    if (counts[index2] > maxCt[index2]) {
-                        maxCt[index2] = counts[index2];
-                    }
-                    counts[index2] = 1;
-                    lastId[index2] = threadInfo[i][index2];
-                }
-                counts[index]++;
-                totals[index]++;
-                lastId[index] = threadInfo[i][index];
-
-                if (assign_thread_ids && (index > threadIdIndex)) {
-
-# if KMP_MIC && REDUCE_TEAM_SIZE
-                    //
-                    // The default team size is the total #threads in the machine
-                    // minus 1 thread for every core that has 3 or more threads.
-                    //
-                    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
-# endif // KMP_MIC && REDUCE_TEAM_SIZE
-
-                    //
-                    // Restart the thread counter, as we are on a new core.
-                    //
-                    threadIdCt = 0;
-
-                    //
-                    // Auto-assign the thread id field if it wasn't specified.
-                    //
-                    if (threadInfo[i][threadIdIndex] == UINT_MAX) {
-                        threadInfo[i][threadIdIndex] = threadIdCt++;
-                    }
-
-                    //
-                    // Aparrently the thread id field was specified for some
-                    // entries and not others.  Start the thread id counter
-                    // off at the next higher thread id.
-                    //
-                    else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
-                        threadIdCt = threadInfo[i][threadIdIndex] + 1;
-                    }
-                }
-                break;
-            }
+  dup_field:
+    CLEANUP_THREAD_INFO;
+    *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
+    return -1;
+  }
+  *line = 0;
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+  unsigned teamSize = 0;
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+  // check for num_records == __kmp_xproc ???
+
+  // If there's only one thread context to bind to, form an Address object with
+  // depth 1 and return immediately (or, if affinity is off, set address2os to
+  // NULL and return).
+  //
+  // If it is configured to omit the package level when there is only a single
+  // package, the logic at the end of this routine won't work if there is only a
+  // single thread - it would try to form an Address object with depth 0.
+  KMP_ASSERT(num_avail > 0);
+  KMP_ASSERT(num_avail <= num_records);
+  if (num_avail == 1) {
+    __kmp_ncores = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+    if (__kmp_affinity_verbose) {
+      if (!KMP_AFFINITY_CAPABLE()) {
+        KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                                  __kmp_affin_fullMask);
+        KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+          KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+        } else {
+          KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
         }
-        if (index < threadIdIndex) {
-            //
-            // If thread ids were specified, it is an error if they are not
-            // unique.  Also, check that we waven't already restarted the
-            // loop (to be safe - shouldn't need to).
-            //
-            if ((threadInfo[i][threadIdIndex] != UINT_MAX)
-              || assign_thread_ids) {
-                __kmp_free(lastId);
-                __kmp_free(totals);
-                __kmp_free(maxCt);
-                __kmp_free(counts);
-                CLEANUP_THREAD_INFO;
-                *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
-                return -1;
-            }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      }
+      int index;
+      kmp_str_buf_t buf;
+      __kmp_str_buf_init(&buf);
+      __kmp_str_buf_print(&buf, "1");
+      for (index = maxIndex - 1; index > pkgIdIndex; index--) {
+        __kmp_str_buf_print(&buf, " x 1");
+      }
+      KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
+      __kmp_str_buf_free(&buf);
+    }
 
-            //
-            // If the thread ids were not specified and we see entries
-            // entries that are duplicates, start the loop over and
-            // assign the thread ids manually.
-            //
-            assign_thread_ids = true;
-            goto restart_radix_check;
-        }
+    if (__kmp_affinity_type == affinity_none) {
+      CLEANUP_THREAD_INFO;
+      return 0;
     }
 
-# if KMP_MIC && REDUCE_TEAM_SIZE
-    //
-    // The default team size is the total #threads in the machine
-    // minus 1 thread for every core that has 3 or more threads.
-    //
-    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
-# endif // KMP_MIC && REDUCE_TEAM_SIZE
+    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
+    Address addr(1);
+    addr.labels[0] = threadInfo[0][pkgIdIndex];
+    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
 
-    for (index = threadIdIndex; index <= maxIndex; index++) {
-        if (counts[index] > maxCt[index]) {
-            maxCt[index] = counts[index];
-        }
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
     }
 
-    __kmp_nThreadsPerCore = maxCt[threadIdIndex];
-    nCoresPerPkg = maxCt[coreIdIndex];
-    nPackages = totals[pkgIdIndex];
-
-    //
-    // Check to see if the machine topology is uniform
-    //
-    unsigned prod = totals[maxIndex];
-    for (index = threadIdIndex; index < maxIndex; index++) {
-       prod *= maxCt[index];
+    if (__kmp_affinity_verbose) {
+      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
     }
-    bool uniform = (prod == totals[threadIdIndex]);
 
-    //
-    // When affinity is off, this routine will still be called to set
-    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
-    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
-    // correctly, and return now if affinity is not enabled.
-    //
-    __kmp_ncores = totals[coreIdIndex];
+    CLEANUP_THREAD_INFO;
+    return 1;
+  }
 
-    if (__kmp_affinity_verbose) {
-        if (! KMP_AFFINITY_CAPABLE()) {
-                KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
-                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-                if (uniform) {
-                    KMP_INFORM(Uniform, "KMP_AFFINITY");
-                } else {
-                    KMP_INFORM(NonUniform, "KMP_AFFINITY");
-                }
+  // Sort the threadInfo table by physical Id.
+  qsort(threadInfo, num_avail, sizeof(*threadInfo),
+        __kmp_affinity_cmp_ProcCpuInfo_phys_id);
+
+  // The table is now sorted by pkgId / coreId / threadId, but we really don't
+  // know the radix of any of the fields. pkgId's may be sparsely assigned among
+  // the chips on a system. Although coreId's are usually assigned
+  // [0 .. coresPerPkg-1] and threadId's are usually assigned
+  // [0..threadsPerCore-1], we don't want to make any such assumptions.
+  //
+  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
+  // total # packages) are at this point - we want to determine that now. We
+  // only have an upper bound on the first two figures.
+  unsigned *counts =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  unsigned *maxCt =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  unsigned *totals =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  unsigned *lastId =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+
+  bool assign_thread_ids = false;
+  unsigned threadIdCt;
+  unsigned index;
+
+restart_radix_check:
+  threadIdCt = 0;
+
+  // Initialize the counter arrays with data from threadInfo[0].
+  if (assign_thread_ids) {
+    if (threadInfo[0][threadIdIndex] == UINT_MAX) {
+      threadInfo[0][threadIdIndex] = threadIdCt++;
+    } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
+      threadIdCt = threadInfo[0][threadIdIndex] + 1;
+    }
+  }
+  for (index = 0; index <= maxIndex; index++) {
+    counts[index] = 1;
+    maxCt[index] = 1;
+    totals[index] = 1;
+    lastId[index] = threadInfo[0][index];
+  }
+
+  // Run through the rest of the OS procs.
+  for (i = 1; i < num_avail; i++) {
+    // Find the most significant index whose id differs from the id for the
+    // previous OS proc.
+    for (index = maxIndex; index >= threadIdIndex; index--) {
+      if (assign_thread_ids && (index == threadIdIndex)) {
+        // Auto-assign the thread id field if it wasn't specified.
+        if (threadInfo[i][threadIdIndex] == UINT_MAX) {
+          threadInfo[i][threadIdIndex] = threadIdCt++;
+        }
+        // Apparently the thread id field was specified for some entries and not
+        // others. Start the thread id counter off at the next higher thread id.
+        else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
+          threadIdCt = threadInfo[i][threadIdIndex] + 1;
         }
-        else {
-            char buf[KMP_AFFIN_MASK_PRINT_LEN];
-            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);
-                KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
-                if (__kmp_affinity_respect_mask) {
-                    KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
-                } else {
-                    KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
-                }
-                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
-                if (uniform) {
-                    KMP_INFORM(Uniform, "KMP_AFFINITY");
-                } else {
-                    KMP_INFORM(NonUniform, "KMP_AFFINITY");
-                }
+      }
+      if (threadInfo[i][index] != lastId[index]) {
+        // Run through all indices which are less significant, and reset the
+        // counts to 1. At all levels up to and including index, we need to
+        // increment the totals and record the last id.
+        unsigned index2;
+        for (index2 = threadIdIndex; index2 < index; index2++) {
+          totals[index2]++;
+          if (counts[index2] > maxCt[index2]) {
+            maxCt[index2] = counts[index2];
+          }
+          counts[index2] = 1;
+          lastId[index2] = threadInfo[i][index2];
         }
-        kmp_str_buf_t buf;
-        __kmp_str_buf_init(&buf);
+        counts[index]++;
+        totals[index]++;
+        lastId[index] = threadInfo[i][index];
+
+        if (assign_thread_ids && (index > threadIdIndex)) {
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+          // The default team size is the total #threads in the machine
+          // minus 1 thread for every core that has 3 or more threads.
+          teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+          // Restart the thread counter, as we are on a new core.
+          threadIdCt = 0;
+
+          // Auto-assign the thread id field if it wasn't specified.
+          if (threadInfo[i][threadIdIndex] == UINT_MAX) {
+            threadInfo[i][threadIdIndex] = threadIdCt++;
+          }
 
-        __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
-        for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
-            __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
+          // Apparently the thread id field was specified for some entries and
+          // not others. Start the thread id counter off at the next higher
+          // thread id.
+          else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
+            threadIdCt = threadInfo[i][threadIdIndex] + 1;
+          }
         }
-        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
-          maxCt[threadIdIndex], __kmp_ncores);
-
-        __kmp_str_buf_free(&buf);
-    }
-
-# if KMP_MIC && REDUCE_TEAM_SIZE
-    //
-    // Set the default team size.
-    //
-    if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
-        __kmp_dflt_team_nth = teamSize;
-        KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
-          __kmp_dflt_team_nth));
-    }
-# endif // KMP_MIC && REDUCE_TEAM_SIZE
-
-    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
-    KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
-    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
-    for (i = 0; i < num_avail; ++i) { // fill the os indices
-        __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
+        break;
+      }
     }
-
-    if (__kmp_affinity_type == affinity_none) {
+    if (index < threadIdIndex) {
+      // If thread ids were specified, it is an error if they are not unique.
+      // Also, check that we haven't already restarted the loop (to be safe -
+      // shouldn't need to).
+      if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
         __kmp_free(lastId);
         __kmp_free(totals);
         __kmp_free(maxCt);
         __kmp_free(counts);
         CLEANUP_THREAD_INFO;
-        return 0;
-    }
-
-    //
-    // Count the number of levels which have more nodes at that level than
-    // at the parent's level (with there being an implicit root node of
-    // the top level).  This is equivalent to saying that there is at least
-    // one node at this level which has a sibling.  These levels are in the
-    // map, and the package level is always in the map.
-    //
-    bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
-    int level = 0;
-    for (index = threadIdIndex; index < maxIndex; index++) {
-        KMP_ASSERT(totals[index] >= totals[index + 1]);
-        inMap[index] = (totals[index] > totals[index + 1]);
-    }
-    inMap[maxIndex] = (totals[maxIndex] > 1);
-    inMap[pkgIdIndex] = true;
-
-    int depth = 0;
-    for (index = threadIdIndex; index <= maxIndex; index++) {
-        if (inMap[index]) {
-            depth++;
-        }
-    }
-    KMP_ASSERT(depth > 0);
-
-    //
-    // Construct the data structure that is to be returned.
-    //
-    *address2os = (AddrUnsPair*)
-      __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
-    int pkgLevel = -1;
-    int coreLevel = -1;
-    int threadLevel = -1;
-
-    for (i = 0; i < num_avail; ++i) {
-        Address addr(depth);
-        unsigned os = threadInfo[i][osIdIndex];
-        int src_index;
-        int dst_index = 0;
-
-        for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
-            if (! inMap[src_index]) {
-                continue;
-            }
-            addr.labels[dst_index] = threadInfo[i][src_index];
-            if (src_index == pkgIdIndex) {
-                pkgLevel = dst_index;
-            }
-            else if (src_index == coreIdIndex) {
-                coreLevel = dst_index;
-            }
-            else if (src_index == threadIdIndex) {
-                threadLevel = dst_index;
-            }
-            dst_index++;
-        }
-        (*address2os)[i] = AddrUnsPair(addr, os);
-    }
-
-    if (__kmp_affinity_gran_levels < 0) {
-        //
-        // Set the granularity level based on what levels are modeled
-        // in the machine topology map.
-        //
-        unsigned src_index;
-        __kmp_affinity_gran_levels = 0;
-        for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
-            if (! inMap[src_index]) {
-                continue;
-            }
-            switch (src_index) {
-                case threadIdIndex:
-                if (__kmp_affinity_gran > affinity_gran_thread) {
-                    __kmp_affinity_gran_levels++;
-                }
-
-                break;
-                case coreIdIndex:
-                if (__kmp_affinity_gran > affinity_gran_core) {
-                    __kmp_affinity_gran_levels++;
-                }
-                break;
+        *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
+        return -1;
+      }
 
-                case pkgIdIndex:
-                if (__kmp_affinity_gran > affinity_gran_package) {
-                    __kmp_affinity_gran_levels++;
-                }
-                break;
-            }
-        }
+      // If the thread ids were not specified and we see entries that are
+      // duplicates, start the loop over and assign the thread ids manually.
+      assign_thread_ids = true;
+      goto restart_radix_check;
+    }
+  }
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+  // The default team size is the total #threads in the machine
+  // minus 1 thread for every core that has 3 or more threads.
+  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+  for (index = threadIdIndex; index <= maxIndex; index++) {
+    if (counts[index] > maxCt[index]) {
+      maxCt[index] = counts[index];
+    }
+  }
+
+  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
+  nCoresPerPkg = maxCt[coreIdIndex];
+  nPackages = totals[pkgIdIndex];
+
+  // Check to see if the machine topology is uniform
+  unsigned prod = totals[maxIndex];
+  for (index = threadIdIndex; index < maxIndex; index++) {
+    prod *= maxCt[index];
+  }
+  bool uniform = (prod == totals[threadIdIndex]);
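
A quick worked example of the uniformity test just computed (hypothetical
counts, and assuming no node levels above the package level, so maxIndex is
the package index):

    prod = totals[maxIndex] * maxCt[coreIdIndex] * maxCt[threadIdIndex]
         = 2 packages * 4 cores/package * 2 threads/core = 16
    If every core really exposes 2 threads, totals[threadIdIndex] == 16,
    so prod == totals[threadIdIndex] and the topology is uniform.
    If one core exposes only 1 thread, totals[threadIdIndex] == 15 != 16,
    and the topology is reported as non-uniform.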
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return now if affinity is
+  // not enabled.
+  __kmp_ncores = totals[coreIdIndex];
+
+  if (__kmp_affinity_verbose) {
+    if (!KMP_AFFINITY_CAPABLE()) {
+      KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (uniform) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+    } else {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                                __kmp_affin_fullMask);
+      KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (uniform) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
     }
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
 
-    if (__kmp_affinity_verbose) {
-        __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
-          coreLevel, threadLevel);
-    }
+    __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
+    for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
+      __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
+    }
+    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
+               maxCt[threadIdIndex], __kmp_ncores);
+
+    __kmp_str_buf_free(&buf);
+  }
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+  // Set the default team size.
+  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
+    __kmp_dflt_team_nth = teamSize;
+    KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
+                  "__kmp_dflt_team_nth = %d\n",
+                  __kmp_dflt_team_nth));
+  }
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  for (i = 0; i < num_avail; ++i) { // fill the os indices
+    __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
+  }
 
-    __kmp_free(inMap);
+  if (__kmp_affinity_type == affinity_none) {
     __kmp_free(lastId);
     __kmp_free(totals);
     __kmp_free(maxCt);
     __kmp_free(counts);
     CLEANUP_THREAD_INFO;
-    return depth;
-}
-
+    return 0;
+  }
 
-//
-// Create and return a table of affinity masks, indexed by OS thread ID.
-// This routine handles OR'ing together all the affinity masks of threads
-// that are sufficiently close, if granularity > fine.
-//
-static kmp_affin_mask_t *
-__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
-  AddrUnsPair *address2os, unsigned numAddrs)
-{
-    //
-    // First form a table of affinity masks in order of OS thread id.
-    //
-    unsigned depth;
-    unsigned maxOsId;
-    unsigned i;
+  // Count the number of levels which have more nodes at that level than at the
+  // parent's level (with there being an implicit root node of the top level).
+  // This is equivalent to saying that there is at least one node at this level
+  // which has a sibling. These levels are in the map, and the package level is
+  // always in the map.
+  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
+  int level = 0;
+  for (index = threadIdIndex; index < maxIndex; index++) {
+    KMP_ASSERT(totals[index] >= totals[index + 1]);
+    inMap[index] = (totals[index] > totals[index + 1]);
+  }
+  inMap[maxIndex] = (totals[maxIndex] > 1);
+  inMap[pkgIdIndex] = true;
+
+  int depth = 0;
+  for (index = threadIdIndex; index <= maxIndex; index++) {
+    if (inMap[index]) {
+      depth++;
+    }
+  }
+  KMP_ASSERT(depth > 0);
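
To make the level pruning concrete (hypothetical totals, not taken from the
source):

    totals: thread = 16, core = 8, package = 2, node (maxIndex) = 1
    inMap:  thread (16 > 8) = true, core (8 > 2) = true,
            package = true (always kept), node (1 > 1) = false
    depth = 3, so the node level is dropped from the Address labels built below.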
+
+  // Construct the data structure that is to be returned.
+  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
+  int pkgLevel = -1;
+  int coreLevel = -1;
+  int threadLevel = -1;
+
+  for (i = 0; i < num_avail; ++i) {
+    Address addr(depth);
+    unsigned os = threadInfo[i][osIdIndex];
+    int src_index;
+    int dst_index = 0;
 
-    KMP_ASSERT(numAddrs > 0);
-    depth = address2os[0].first.depth;
+    for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
+      if (!inMap[src_index]) {
+        continue;
+      }
+      addr.labels[dst_index] = threadInfo[i][src_index];
+      if (src_index == pkgIdIndex) {
+        pkgLevel = dst_index;
+      } else if (src_index == coreIdIndex) {
+        coreLevel = dst_index;
+      } else if (src_index == threadIdIndex) {
+        threadLevel = dst_index;
+      }
+      dst_index++;
+    }
+    (*address2os)[i] = AddrUnsPair(addr, os);
+  }
 
-    maxOsId = 0;
-    for (i = 0; i < numAddrs; i++) {
-        unsigned osId = address2os[i].second;
-        if (osId > maxOsId) {
-            maxOsId = osId;
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled
+    // in the machine topology map.
+    unsigned src_index;
+    __kmp_affinity_gran_levels = 0;
+    for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
+      if (!inMap[src_index]) {
+        continue;
+      }
+      switch (src_index) {
+      case threadIdIndex:
+        if (__kmp_affinity_gran > affinity_gran_thread) {
+          __kmp_affinity_gran_levels++;
         }
-    }
-    kmp_affin_mask_t *osId2Mask;
-    KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
 
-    //
-    // Sort the address2os table according to physical order.  Doing so
-    // will put all threads on the same core/package/node in consecutive
-    // locations.
-    //
-    qsort(address2os, numAddrs, sizeof(*address2os),
-      __kmp_affinity_cmp_Address_labels);
+        break;
+      case coreIdIndex:
+        if (__kmp_affinity_gran > affinity_gran_core) {
+          __kmp_affinity_gran_levels++;
+        }
+        break;
 
-    KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
-    if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
-        KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
-    }
-    if (__kmp_affinity_gran_levels >= (int)depth) {
-        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-          && (__kmp_affinity_type != affinity_none))) {
-            KMP_WARNING(AffThreadsMayMigrate);
+      case pkgIdIndex:
+        if (__kmp_affinity_gran > affinity_gran_package) {
+          __kmp_affinity_gran_levels++;
         }
+        break;
+      }
     }
+  }
 
-    //
-    // Run through the table, forming the masks for all threads on each
-    // core.  Threads on the same core will have identical "Address"
-    // objects, not considering the last level, which must be the thread
-    // id.  All threads on a core will appear consecutively.
-    //
-    unsigned unique = 0;
-    unsigned j = 0;                             // index of 1st thread on core
-    unsigned leader = 0;
-    Address *leaderAddr = &(address2os[0].first);
-    kmp_affin_mask_t *sum;
-    KMP_CPU_ALLOC_ON_STACK(sum);
-    KMP_CPU_ZERO(sum);
-    KMP_CPU_SET(address2os[0].second, sum);
-    for (i = 1; i < numAddrs; i++) {
-        //
-        // If this thread is sufficiently close to the leader (within the
-        // granularity setting), then set the bit for this os thread in the
-        // affinity mask for this group, and go on to the next thread.
-        //
-        if (leaderAddr->isClose(address2os[i].first,
-          __kmp_affinity_gran_levels)) {
-            KMP_CPU_SET(address2os[i].second, sum);
-            continue;
-        }
-
-        //
-        // For every thread in this group, copy the mask to the thread's
-        // entry in the osId2Mask table.  Mark the first address as a
-        // leader.
-        //
-        for (; j < i; j++) {
-            unsigned osId = address2os[j].second;
-            KMP_DEBUG_ASSERT(osId <= maxOsId);
-            kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
-            KMP_CPU_COPY(mask, sum);
-            address2os[j].first.leader = (j == leader);
-        }
-        unique++;
-
-        //
-        // Start a new mask.
-        //
-        leader = i;
-        leaderAddr = &(address2os[i].first);
-        KMP_CPU_ZERO(sum);
-        KMP_CPU_SET(address2os[i].second, sum);
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
+                                  coreLevel, threadLevel);
+  }
+
+  __kmp_free(inMap);
+  __kmp_free(lastId);
+  __kmp_free(totals);
+  __kmp_free(maxCt);
+  __kmp_free(counts);
+  CLEANUP_THREAD_INFO;
+  return depth;
+}
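
For reference, a minimal sketch of one per-proc record in the cpuinfo-style
input this routine parses. The field names are inferred from the parse code
and its error messages (MissingProcField, MissingPhysicalIDField, the
"thread id" and "node_%d id" matches); the values are made up.

    // Illustrative only: a blank line ends the record; the processor and
    // physical id fields are required, the others are optional.
    static const char example_cpuinfo_record[] =
        "processor       : 12\n" // -> threadInfo[n][osIdIndex]
        "physical id     : 1\n"  // -> threadInfo[n][pkgIdIndex]
        "core id         : 2\n"  // -> threadInfo[n][coreIdIndex]
        "thread id       : 0\n"  // -> threadInfo[n][threadIdIndex]
        "node_0 id       : 1\n"  // -> threadInfo[n][nodeIdIndex + 0]
        "\n";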
+
+// Create and return a table of affinity masks, indexed by OS thread ID.
+// This routine handles OR'ing together all the affinity masks of threads
+// that are sufficiently close, if granularity > fine.
+static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
+                                            unsigned *numUnique,
+                                            AddrUnsPair *address2os,
+                                            unsigned numAddrs) {
+  // First form a table of affinity masks in order of OS thread id.
+  unsigned depth;
+  unsigned maxOsId;
+  unsigned i;
+
+  KMP_ASSERT(numAddrs > 0);
+  depth = address2os[0].first.depth;
+
+  maxOsId = 0;
+  for (i = 0; i < numAddrs; i++) {
+    unsigned osId = address2os[i].second;
+    if (osId > maxOsId) {
+      maxOsId = osId;
+    }
+  }
+  kmp_affin_mask_t *osId2Mask;
+  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
+
+  // Sort the address2os table according to physical order. Doing so will put
+  // all threads on the same core/package/node in consecutive locations.
+  qsort(address2os, numAddrs, sizeof(*address2os),
+        __kmp_affinity_cmp_Address_labels);
+
+  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
+  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
+    KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
+  }
+  if (__kmp_affinity_gran_levels >= (int)depth) {
+    if (__kmp_affinity_verbose ||
+        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
+      KMP_WARNING(AffThreadsMayMigrate);
+    }
+  }
+
+  // Run through the table, forming the masks for all threads on each core.
+  // Threads on the same core will have identical "Address" objects, not
+  // considering the last level, which must be the thread id. All threads on a
+  // core will appear consecutively.
+  unsigned unique = 0;
+  unsigned j = 0; // index of 1st thread on core
+  unsigned leader = 0;
+  Address *leaderAddr = &(address2os[0].first);
+  kmp_affin_mask_t *sum;
+  KMP_CPU_ALLOC_ON_STACK(sum);
+  KMP_CPU_ZERO(sum);
+  KMP_CPU_SET(address2os[0].second, sum);
+  for (i = 1; i < numAddrs; i++) {
+    // If this thread is sufficiently close to the leader (within the
+    // granularity setting), then set the bit for this os thread in the
+    // affinity mask for this group, and go on to the next thread.
+    if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
+      KMP_CPU_SET(address2os[i].second, sum);
+      continue;
     }
 
-    //
-    // For every thread in last group, copy the mask to the thread's
-    // entry in the osId2Mask table.
-    //
+    // For every thread in this group, copy the mask to the thread's entry in
+    // the osId2Mask table.  Mark the first address as a leader.
     for (; j < i; j++) {
-        unsigned osId = address2os[j].second;
-        KMP_DEBUG_ASSERT(osId <= maxOsId);
-        kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
-        KMP_CPU_COPY(mask, sum);
-        address2os[j].first.leader = (j == leader);
+      unsigned osId = address2os[j].second;
+      KMP_DEBUG_ASSERT(osId <= maxOsId);
+      kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+      KMP_CPU_COPY(mask, sum);
+      address2os[j].first.leader = (j == leader);
     }
     unique++;
-    KMP_CPU_FREE_FROM_STACK(sum);
 
-    *maxIndex = maxOsId;
-    *numUnique = unique;
-    return osId2Mask;
-}
+    // Start a new mask.
+    leader = i;
+    leaderAddr = &(address2os[i].first);
+    KMP_CPU_ZERO(sum);
+    KMP_CPU_SET(address2os[i].second, sum);
+  }
 
+  // For every thread in last group, copy the mask to the thread's
+  // entry in the osId2Mask table.
+  for (; j < i; j++) {
+    unsigned osId = address2os[j].second;
+    KMP_DEBUG_ASSERT(osId <= maxOsId);
+    kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+    KMP_CPU_COPY(mask, sum);
+    address2os[j].first.leader = (j == leader);
+  }
+  unique++;
+  KMP_CPU_FREE_FROM_STACK(sum);
+
+  *maxIndex = maxOsId;
+  *numUnique = unique;
+  return osId2Mask;
+}
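
A minimal standalone sketch of the grouping rule used above, assuming (as the
code suggests but does not show here) that Address::isClose treats two
addresses as close when their labels agree in all but the last gran_levels
positions. None of the names below are libomp API; it only illustrates why
two SMT threads on one core end up sharing a mask at granularity=core.

    #include <array>
    #include <cstdio>
    #include <vector>

    struct Addr3 { std::array<unsigned, 3> labels; unsigned osId; };

    // "Close" = equal in all but the last gran_levels label positions.
    static bool is_close(const Addr3 &a, const Addr3 &b, int gran_levels) {
      for (int i = 0; i < 3 - gran_levels; i++)
        if (a.labels[i] != b.labels[i])
          return false;
      return true;
    }

    int main() {
      // Hypothetical sorted table: {package, core, thread} -> OS proc id.
      std::vector<Addr3> t = {{{0, 0, 0}, 0}, {{0, 0, 1}, 1},
                              {{0, 1, 0}, 2}, {{0, 1, 1}, 3}};
      int gran_levels = 1; // granularity=core: ignore the thread level
      unsigned long long mask = 1ull << t[0].osId;
      for (size_t i = 1; i <= t.size(); i++) {
        if (i < t.size() && is_close(t[i - 1], t[i], gran_levels)) {
          mask |= 1ull << t[i].osId; // same core: OR this proc into the mask
          continue;
        }
        std::printf("group mask: 0x%llx\n", mask); // prints 0x3, then 0xc
        if (i < t.size())
          mask = 1ull << t[i].osId; // start a new group at this proc
      }
      return 0;
    }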
 
-//
 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
 // as file-static than to try and pass them through the calling sequence of
 // the recursive-descent OMP_PLACES parser.
-//
 static kmp_affin_mask_t *newMasks;
 static int numNewMasks;
 static int nextNewMask;
 
-#define ADD_MASK(_mask) \
-    {                                                                   \
-        if (nextNewMask >= numNewMasks) {                               \
-            int i;                                                      \
-            numNewMasks *= 2;                                           \
-            kmp_affin_mask_t* temp;                                     \
-            KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);            \
-            for(i=0;i<numNewMasks/2;i++) {                              \
-                kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);    \
-                kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i);        \
-                KMP_CPU_COPY(dest, src);                                \
-            }                                                           \
-            KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2);       \
-            newMasks = temp;                                            \
-        }                                                               \
-        KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
-        nextNewMask++;                                                  \
-    }
-
-#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
-    {                                                                   \
-        if (((_osId) > _maxOsId) ||                                     \
-          (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
-            if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
-              && (__kmp_affinity_type != affinity_none))) {             \
-                KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
-            }                                                           \
-        }                                                               \
-        else {                                                          \
-            ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
-        }                                                               \
-    }
-
+#define ADD_MASK(_mask)                                                        \
+  {                                                                            \
+    if (nextNewMask >= numNewMasks) {                                          \
+      int i;                                                                   \
+      numNewMasks *= 2;                                                        \
+      kmp_affin_mask_t *temp;                                                  \
+      KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
+      for (i = 0; i < numNewMasks / 2; i++) {                                  \
+        kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
+        kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
+        KMP_CPU_COPY(dest, src);                                               \
+      }                                                                        \
+      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
+      newMasks = temp;                                                         \
+    }                                                                          \
+    KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
+    nextNewMask++;                                                             \
+  }
+
+#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
+  {                                                                            \
+    if (((_osId) > _maxOsId) ||                                                \
+        (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
+      if (__kmp_affinity_verbose ||                                            \
+          (__kmp_affinity_warnings &&                                          \
+           (__kmp_affinity_type != affinity_none))) {                          \
+        KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
+      }                                                                        \
+    } else {                                                                   \
+      ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
+    }                                                                          \
+  }
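
The ADD_MASK macro above uses the standard grow-by-doubling append idiom for
the file-static newMasks vector. A generic sketch of the same pattern with
plain malloc/memcpy (deliberately not the KMP_CPU_* mask API):

    #include <cstdlib>
    #include <cstring>

    // Append 'value', doubling the backing array when it is full.
    // Typical setup: int cap = 2, next = 0;
    //                unsigned *arr = (unsigned *)std::malloc(cap * sizeof(unsigned));
    static unsigned *grow_and_append(unsigned *arr, int *cap, int *next,
                                     unsigned value) {
      if (*next >= *cap) {
        int newCap = *cap * 2;
        unsigned *tmp = (unsigned *)std::malloc(newCap * sizeof(unsigned));
        std::memcpy(tmp, arr, *cap * sizeof(unsigned)); // copy old entries
        std::free(arr);                                 // release old storage
        arr = tmp;
        *cap = newCap;
      }
      arr[(*next)++] = value;
      return arr;
    }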
 
-//
 // Re-parse the proclist (for the explicit affinity type), and form the list
 // of affinity newMasks indexed by gtid.
-//
-static void
-__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
-  unsigned int *out_numMasks, const char *proclist,
-  kmp_affin_mask_t *osId2Mask, int maxOsId)
-{
-    int i;
-    const char *scan = proclist;
-    const char *next = proclist;
-
-    //
-    // We use malloc() for the temporary mask vector,
-    // so that we can use realloc() to extend it.
-    //
-    numNewMasks = 2;
-    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
-    nextNewMask = 0;
-    kmp_affin_mask_t *sumMask;
-    KMP_CPU_ALLOC(sumMask);
-    int setSize = 0;
-
-    for (;;) {
-        int start, end, stride;
-
-        SKIP_WS(scan);
-        next = scan;
-        if (*next == '\0') {
-            break;
-        }
-
-        if (*next == '{') {
-            int num;
-            setSize = 0;
-            next++;     // skip '{'
-            SKIP_WS(next);
-            scan = next;
-
-            //
-            // Read the first integer in the set.
-            //
-            KMP_ASSERT2((*next >= '0') && (*next <= '9'),
-              "bad proclist");
-            SKIP_DIGITS(next);
-            num = __kmp_str_to_int(scan, *next);
-            KMP_ASSERT2(num >= 0, "bad explicit proc list");
-
-            //
-            // Copy the mask for that osId to the sum (union) mask.
-            //
-            if ((num > maxOsId) ||
-              (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
-                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-                  && (__kmp_affinity_type != affinity_none))) {
-                    KMP_WARNING(AffIgnoreInvalidProcID, num);
-                }
-                KMP_CPU_ZERO(sumMask);
-            }
-            else {
-                KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
-                setSize = 1;
-            }
-
-            for (;;) {
-                //
-                // Check for end of set.
-                //
-                SKIP_WS(next);
-                if (*next == '}') {
-                    next++;     // skip '}'
-                    break;
-                }
-
-                //
-                // Skip optional comma.
-                //
-                if (*next == ',') {
-                    next++;
-                }
-                SKIP_WS(next);
-
-                //
-                // Read the next integer in the set.
-                //
-                scan = next;
-                KMP_ASSERT2((*next >= '0') && (*next <= '9'),
-                  "bad explicit proc list");
-
-                SKIP_DIGITS(next);
-                num = __kmp_str_to_int(scan, *next);
-                KMP_ASSERT2(num >= 0, "bad explicit proc list");
-
-                //
-                // Add the mask for that osId to the sum mask.
-                //
-                if ((num > maxOsId) ||
-                  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
-                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-                      && (__kmp_affinity_type != affinity_none))) {
-                        KMP_WARNING(AffIgnoreInvalidProcID, num);
-                    }
-                }
-                else {
-                    KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
-                    setSize++;
-                }
-            }
-            if (setSize > 0) {
-                ADD_MASK(sumMask);
-            }
-
-            SKIP_WS(next);
-            if (*next == ',') {
-                next++;
-            }
-            scan = next;
-            continue;
+static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
+                                            unsigned int *out_numMasks,
+                                            const char *proclist,
+                                            kmp_affin_mask_t *osId2Mask,
+                                            int maxOsId) {
+  int i;
+  const char *scan = proclist;
+  const char *next = proclist;
+
+  // We use malloc() for the temporary mask vector, so that we can use
+  // realloc() to extend it.
+  numNewMasks = 2;
+  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
+  nextNewMask = 0;
+  kmp_affin_mask_t *sumMask;
+  KMP_CPU_ALLOC(sumMask);
+  int setSize = 0;
+
+  for (;;) {
+    int start, end, stride;
+
+    SKIP_WS(scan);
+    next = scan;
+    if (*next == '\0') {
+      break;
+    }
+
+    if (*next == '{') {
+      int num;
+      setSize = 0;
+      next++; // skip '{'
+      SKIP_WS(next);
+      scan = next;
+
+      // Read the first integer in the set.
+      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
+      SKIP_DIGITS(next);
+      num = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT2(num >= 0, "bad explicit proc list");
+
+      // Copy the mask for that osId to the sum (union) mask.
+      if ((num > maxOsId) ||
+          (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none))) {
+          KMP_WARNING(AffIgnoreInvalidProcID, num);
         }
+        KMP_CPU_ZERO(sumMask);
+      } else {
+        KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
+        setSize = 1;
+      }
 
-        //
-        // Read the first integer.
-        //
-        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
-        SKIP_DIGITS(next);
-        start = __kmp_str_to_int(scan, *next);
-        KMP_ASSERT2(start >= 0, "bad explicit proc list");
+      for (;;) {
+        // Check for end of set.
         SKIP_WS(next);
-
-        //
-        // If this isn't a range, then add a mask to the list and go on.
-        //
-        if (*next != '-') {
-            ADD_MASK_OSID(start, osId2Mask, maxOsId);
-
-            //
-            // Skip optional comma.
-            //
-            if (*next == ',') {
-                next++;
-            }
-            scan = next;
-            continue;
+        if (*next == '}') {
+          next++; // skip '}'
+          break;
         }
 
-        //
-        // This is a range.  Skip over the '-' and read in the 2nd int.
-        //
-        next++;         // skip '-'
+        // Skip optional comma.
+        if (*next == ',') {
+          next++;
+        }
         SKIP_WS(next);
+
+        // Read the next integer in the set.
         scan = next;
         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+
         SKIP_DIGITS(next);
-        end = __kmp_str_to_int(scan, *next);
-        KMP_ASSERT2(end >= 0, "bad explicit proc list");
+        num = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT2(num >= 0, "bad explicit proc list");
 
-        //
-        // Check for a stride parameter
-        //
-        stride = 1;
-        SKIP_WS(next);
-        if (*next == ':') {
-            //
-            // A stride is specified.  Skip over the ':" and read the 3rd int.
-            //
-            int sign = +1;
-            next++;         // skip ':'
-            SKIP_WS(next);
-            scan = next;
-            if (*next == '-') {
-                sign = -1;
-                next++;
-                SKIP_WS(next);
-                scan = next;
-            }
-            KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
-              "bad explicit proc list");
-            SKIP_DIGITS(next);
-            stride = __kmp_str_to_int(scan, *next);
-            KMP_ASSERT2(stride >= 0, "bad explicit proc list");
-            stride *= sign;
-        }
-
-        //
-        // Do some range checks.
-        //
-        KMP_ASSERT2(stride != 0, "bad explicit proc list");
-        if (stride > 0) {
-            KMP_ASSERT2(start <= end, "bad explicit proc list");
-        }
-        else {
-            KMP_ASSERT2(start >= end, "bad explicit proc list");
-        }
-        KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
-
-        //
-        // Add the mask for each OS proc # to the list.
-        //
-        if (stride > 0) {
-            do {
-                ADD_MASK_OSID(start, osId2Mask, maxOsId);
-                start += stride;
-            } while (start <= end);
-        }
-        else {
-            do {
-                ADD_MASK_OSID(start, osId2Mask, maxOsId);
-                start += stride;
-            } while (start >= end);
+        // Add the mask for that osId to the sum mask.
+        if ((num > maxOsId) ||
+            (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+          if (__kmp_affinity_verbose ||
+              (__kmp_affinity_warnings &&
+               (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffIgnoreInvalidProcID, num);
+          }
+        } else {
+          KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
+          setSize++;
         }
+      }
+      if (setSize > 0) {
+        ADD_MASK(sumMask);
+      }
 
-        //
-        // Skip optional comma.
-        //
+      SKIP_WS(next);
+      if (*next == ',') {
+        next++;
+      }
+      scan = next;
+      continue;
+    }
+
+    // Read the first integer.
+    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+    SKIP_DIGITS(next);
+    start = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT2(start >= 0, "bad explicit proc list");
+    SKIP_WS(next);
+
+    // If this isn't a range, then add a mask to the list and go on.
+    if (*next != '-') {
+      ADD_MASK_OSID(start, osId2Mask, maxOsId);
+
+      // Skip optional comma.
+      if (*next == ',') {
+        next++;
+      }
+      scan = next;
+      continue;
+    }
+
+    // This is a range.  Skip over the '-' and read in the 2nd int.
+    next++; // skip '-'
+    SKIP_WS(next);
+    scan = next;
+    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+    SKIP_DIGITS(next);
+    end = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT2(end >= 0, "bad explicit proc list");
+
+    // Check for a stride parameter
+    stride = 1;
+    SKIP_WS(next);
+    if (*next == ':') {
+      // A stride is specified.  Skip over the ':' and read the 3rd int.
+      int sign = +1;
+      next++; // skip ':'
+      SKIP_WS(next);
+      scan = next;
+      if (*next == '-') {
+        sign = -1;
+        next++;
         SKIP_WS(next);
-        if (*next == ',') {
-            next++;
-        }
         scan = next;
+      }
+      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+      SKIP_DIGITS(next);
+      stride = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT2(stride >= 0, "bad explicit proc list");
+      stride *= sign;
     }
 
-    *out_numMasks = nextNewMask;
-    if (nextNewMask == 0) {
-        *out_masks = NULL;
-        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
-        return;
-    }
-    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
-    for(i = 0; i < nextNewMask; i++) {
-        kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
-        kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
-        KMP_CPU_COPY(dest, src);
+    // Do some range checks.
+    KMP_ASSERT2(stride != 0, "bad explicit proc list");
+    if (stride > 0) {
+      KMP_ASSERT2(start <= end, "bad explicit proc list");
+    } else {
+      KMP_ASSERT2(start >= end, "bad explicit proc list");
     }
+    KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
+
+    // Add the mask for each OS proc # to the list.
+    if (stride > 0) {
+      do {
+        ADD_MASK_OSID(start, osId2Mask, maxOsId);
+        start += stride;
+      } while (start <= end);
+    } else {
+      do {
+        ADD_MASK_OSID(start, osId2Mask, maxOsId);
+        start += stride;
+      } while (start >= end);
+    }
+
+    // Skip optional comma.
+    SKIP_WS(next);
+    if (*next == ',') {
+      next++;
+    }
+    scan = next;
+  }
+
+  *out_numMasks = nextNewMask;
+  if (nextNewMask == 0) {
+    *out_masks = NULL;
     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
-    KMP_CPU_FREE(sumMask);
+    return;
+  }
+  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+  for (i = 0; i < nextNewMask; i++) {
+    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
+    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
+    KMP_CPU_COPY(dest, src);
+  }
+  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+  KMP_CPU_FREE(sumMask);
 }
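
For reference, a few proclist strings the parser above accepts, based on the
branches visible in it (single ids, '-' ranges, ':' strides, and '{...}'
union sets). The proc ids are hypothetical and for illustration only.

    // Illustrative only: KMP_AFFINITY proclist syntax handled above.
    static const char *const example_proclists[] = {
        "3",           // single OS proc id -> one singleton mask
        "0,2,4",       // comma-separated ids -> three singleton masks
        "0-7",         // range -> eight masks, one per proc 0..7
        "0-15:2",      // strided range -> masks for procs 0,2,4,...,14
        "{0,1},{2,3}", // braced sets -> one mask for {0,1}, one for {2,3}
    };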
 
-
-# if OMP_40_ENABLED
+#if OMP_40_ENABLED
 
 /*-----------------------------------------------------------------------------
-
 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
 places.  Again, Here is the grammar:
 
@@ -3044,756 +2707,574 @@ subplace := num : num : signed
 signed := num
 signed := + signed
 signed := - signed
-
 -----------------------------------------------------------------------------*/
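
To ground the grammar, a few place-list fragments and how the subplace parser
below expands them (start : count : stride, with stride defaulting to 1). The
proc ids are hypothetical and this is illustrative only.

    // Illustrative only: OMP_PLACES fragments matching the grammar above.
    static const char *const example_places[] = {
        "{0},{1},{2},{3}", // four places, one OS proc each
        "{0:4}",           // one place: start 0, count 4 -> procs {0,1,2,3}
        "{0:4:2}",         // one place: start 0, count 4, stride 2 -> {0,2,4,6}
    };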
 
-static void
-__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
-  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
-{
-    const char *next;
-
-    for (;;) {
-        int start, count, stride, i;
+static void __kmp_process_subplace_list(const char **scan,
+                                        kmp_affin_mask_t *osId2Mask,
+                                        int maxOsId, kmp_affin_mask_t *tempMask,
+                                        int *setSize) {
+  const char *next;
 
-        //
-        // Read in the starting proc id
-        //
-        SKIP_WS(*scan);
-        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
-          "bad explicit places list");
-        next = *scan;
-        SKIP_DIGITS(next);
-        start = __kmp_str_to_int(*scan, *next);
-        KMP_ASSERT(start >= 0);
-        *scan = next;
-
-        //
-        // valid follow sets are ',' ':' and '}'
-        //
-        SKIP_WS(*scan);
-        if (**scan == '}' || **scan == ',') {
-            if ((start > maxOsId) ||
-              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
-                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-                  && (__kmp_affinity_type != affinity_none))) {
-                    KMP_WARNING(AffIgnoreInvalidProcID, start);
-                }
-            }
-            else {
-                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
-                (*setSize)++;
-            }
-            if (**scan == '}') {
-                break;
-            }
-            (*scan)++;  // skip ','
-            continue;
-        }
-        KMP_ASSERT2(**scan == ':', "bad explicit places list");
-        (*scan)++;      // skip ':'
+  for (;;) {
+    int start, count, stride, i;
 
-        //
-        // Read count parameter
-        //
-        SKIP_WS(*scan);
-        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
-          "bad explicit places list");
-        next = *scan;
-        SKIP_DIGITS(next);
-        count = __kmp_str_to_int(*scan, *next);
-        KMP_ASSERT(count >= 0);
-        *scan = next;
-
-        //
-        // valid follow sets are ',' ':' and '}'
-        //
-        SKIP_WS(*scan);
-        if (**scan == '}' || **scan == ',') {
-            for (i = 0; i < count; i++) {
-                if ((start > maxOsId) ||
-                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
-                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-                      && (__kmp_affinity_type != affinity_none))) {
-                        KMP_WARNING(AffIgnoreInvalidProcID, start);
-                    }
-                    break;  // don't proliferate warnings for large count
-                }
-                else {
-                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
-                    start++;
-                    (*setSize)++;
-                }
-            }
-            if (**scan == '}') {
-                break;
-            }
-            (*scan)++;  // skip ','
-            continue;
-        }
-        KMP_ASSERT2(**scan == ':', "bad explicit places list");
-        (*scan)++;      // skip ':'
+    // Read in the starting proc id
+    SKIP_WS(*scan);
+    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
+    next = *scan;
+    SKIP_DIGITS(next);
+    start = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(start >= 0);
+    *scan = next;
 
-        //
-        // Read stride parameter
-        //
-        int sign = +1;
-        for (;;) {
-            SKIP_WS(*scan);
-            if (**scan == '+') {
-                (*scan)++; // skip '+'
-                continue;
-            }
-            if (**scan == '-') {
-                sign *= -1;
-                (*scan)++; // skip '-'
-                continue;
-            }
-            break;
-        }
-        SKIP_WS(*scan);
-        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
-          "bad explicit places list");
-        next = *scan;
-        SKIP_DIGITS(next);
-        stride = __kmp_str_to_int(*scan, *next);
-        KMP_ASSERT(stride >= 0);
-        *scan = next;
-        stride *= sign;
-
-        //
-        // valid follow sets are ',' and '}'
-        //
-        SKIP_WS(*scan);
-        if (**scan == '}' || **scan == ',') {
-            for (i = 0; i < count; i++) {
-                if ((start > maxOsId) ||
-                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
-                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-                      && (__kmp_affinity_type != affinity_none))) {
-                        KMP_WARNING(AffIgnoreInvalidProcID, start);
-                    }
-                    break;  // don't proliferate warnings for large count
-                }
-                else {
-                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
-                    start += stride;
-                    (*setSize)++;
-                }
-            }
-            if (**scan == '}') {
-                break;
-            }
-            (*scan)++;  // skip ','
-            continue;
+    // valid follow sets are ',' ':' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}' || **scan == ',') {
+      if ((start > maxOsId) ||
+          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none))) {
+          KMP_WARNING(AffIgnoreInvalidProcID, start);
         }
-
-        KMP_ASSERT2(0, "bad explicit places list");
+      } else {
+        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+        (*setSize)++;
+      }
+      if (**scan == '}') {
+        break;
+      }
+      (*scan)++; // skip ','
+      continue;
     }
-}
+    KMP_ASSERT2(**scan == ':', "bad explicit places list");
+    (*scan)++; // skip ':'
 
+    // Read count parameter
+    SKIP_WS(*scan);
+    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
+    next = *scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(count >= 0);
+    *scan = next;
 
-static void
-__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
-  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
-{
-    const char *next;
-
-    //
-    // valid follow sets are '{' '!' and num
-    //
+    // valid follow sets are ',' ':' and '}'
     SKIP_WS(*scan);
-    if (**scan == '{') {
-        (*scan)++;      // skip '{'
-        __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
-          setSize);
-        KMP_ASSERT2(**scan == '}', "bad explicit places list");
-        (*scan)++;      // skip '}'
-    }
-    else if (**scan == '!') {
-        (*scan)++;      // skip '!'
-        __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
-        KMP_CPU_COMPLEMENT(maxOsId, tempMask);
-    }
-    else if ((**scan >= '0') && (**scan <= '9')) {
-        next = *scan;
-        SKIP_DIGITS(next);
-        int num = __kmp_str_to_int(*scan, *next);
-        KMP_ASSERT(num >= 0);
-        if ((num > maxOsId) ||
-          (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
-            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-              && (__kmp_affinity_type != affinity_none))) {
-                KMP_WARNING(AffIgnoreInvalidProcID, num);
-            }
-        }
-        else {
-            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
-            (*setSize)++;
+    if (**scan == '}' || **scan == ',') {
+      for (i = 0; i < count; i++) {
+        if ((start > maxOsId) ||
+            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+          if (__kmp_affinity_verbose ||
+              (__kmp_affinity_warnings &&
+               (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffIgnoreInvalidProcID, start);
+          }
+          break; // don't proliferate warnings for large count
+        } else {
+          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+          start++;
+          (*setSize)++;
         }
-        *scan = next;  // skip num
-    }
-    else {
-        KMP_ASSERT2(0, "bad explicit places list");
+      }
+      if (**scan == '}') {
+        break;
+      }
+      (*scan)++; // skip ','
+      continue;
     }
-}
-
-
-//static void
-void
-__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
-  unsigned int *out_numMasks, const char *placelist,
-  kmp_affin_mask_t *osId2Mask, int maxOsId)
-{
-    int i,j,count,stride,sign;
-    const char *scan = placelist;
-    const char *next = placelist;
-
-    numNewMasks = 2;
-    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
-    nextNewMask = 0;
-
-    // tempMask is modified based on the previous or initial
-    //   place to form the current place
-    // previousMask contains the previous place
-    kmp_affin_mask_t *tempMask;
-    kmp_affin_mask_t *previousMask;
-    KMP_CPU_ALLOC(tempMask);
-    KMP_CPU_ZERO(tempMask);
-    KMP_CPU_ALLOC(previousMask);
-    KMP_CPU_ZERO(previousMask);
-    int setSize = 0;
+    KMP_ASSERT2(**scan == ':', "bad explicit places list");
+    (*scan)++; // skip ':'
 
+    // Read stride parameter
+    int sign = +1;
     for (;;) {
-        __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
+      SKIP_WS(*scan);
+      if (**scan == '+') {
+        (*scan)++; // skip '+'
+        continue;
+      }
+      if (**scan == '-') {
+        sign *= -1;
+        (*scan)++; // skip '-'
+        continue;
+      }
+      break;
+    }
+    SKIP_WS(*scan);
+    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
+    next = *scan;
+    SKIP_DIGITS(next);
+    stride = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(stride >= 0);
+    *scan = next;
+    stride *= sign;
 
-        //
-        // valid follow sets are ',' ':' and EOL
-        //
-        SKIP_WS(scan);
-        if (*scan == '\0' || *scan == ',') {
-            if (setSize > 0) {
-                ADD_MASK(tempMask);
-            }
-            KMP_CPU_ZERO(tempMask);
-            setSize = 0;
-            if (*scan == '\0') {
-                break;
-            }
-            scan++;     // skip ','
-            continue;
+    // valid follow sets are ',' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}' || **scan == ',') {
+      for (i = 0; i < count; i++) {
+        if ((start > maxOsId) ||
+            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+          if (__kmp_affinity_verbose ||
+              (__kmp_affinity_warnings &&
+               (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffIgnoreInvalidProcID, start);
+          }
+          break; // don't proliferate warnings for large count
+        } else {
+          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+          start += stride;
+          (*setSize)++;
         }
+      }
+      if (**scan == '}') {
+        break;
+      }
+      (*scan)++; // skip ','
+      continue;
+    }
 
-        KMP_ASSERT2(*scan == ':', "bad explicit places list");
-        scan++;         // skip ':'
+    KMP_ASSERT2(0, "bad explicit places list");
+  }
+}
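A short worked example for the subplace forms handled above (values are illustrative only):

    // Illustrative subplace expansions (not from the patch):
    //   "0:3:2"  ->  start 0, count 3, stride 2  ->  OS procs {0, 2, 4}
    //   "0:3"    ->  start 0, count 3 (stride 1) ->  OS procs {0, 1, 2}
    // In both cases *setSize is incremented once per proc unioned into tempMask.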
 
-        //
-        // Read count parameter
-        //
-        SKIP_WS(scan);
-        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
-          "bad explicit places list");
-        next = scan;
-        SKIP_DIGITS(next);
-        count = __kmp_str_to_int(scan, *next);
-        KMP_ASSERT(count >= 0);
-        scan = next;
+static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
+                                int maxOsId, kmp_affin_mask_t *tempMask,
+                                int *setSize) {
+  const char *next;
+
+  // valid follow sets are '{' '!' and num
+  SKIP_WS(*scan);
+  if (**scan == '{') {
+    (*scan)++; // skip '{'
+    __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
+    KMP_ASSERT2(**scan == '}', "bad explicit places list");
+    (*scan)++; // skip '}'
+  } else if (**scan == '!') {
+    (*scan)++; // skip '!'
+    __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
+    KMP_CPU_COMPLEMENT(maxOsId, tempMask);
+  } else if ((**scan >= '0') && (**scan <= '9')) {
+    next = *scan;
+    SKIP_DIGITS(next);
+    int num = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(num >= 0);
+    if ((num > maxOsId) ||
+        (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+      if (__kmp_affinity_verbose ||
+          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
+        KMP_WARNING(AffIgnoreInvalidProcID, num);
+      }
+    } else {
+      KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
+      (*setSize)++;
+    }
+    *scan = next; // skip num
+  } else {
+    KMP_ASSERT2(0, "bad explicit places list");
+  }
+}
+
+// static void
+void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
+                                      unsigned int *out_numMasks,
+                                      const char *placelist,
+                                      kmp_affin_mask_t *osId2Mask,
+                                      int maxOsId) {
+  int i, j, count, stride, sign;
+  const char *scan = placelist;
+  const char *next = placelist;
+
+  numNewMasks = 2;
+  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
+  nextNewMask = 0;
+
+  // tempMask is modified based on the previous or initial
+  //   place to form the current place
+  // previousMask contains the previous place
+  kmp_affin_mask_t *tempMask;
+  kmp_affin_mask_t *previousMask;
+  KMP_CPU_ALLOC(tempMask);
+  KMP_CPU_ZERO(tempMask);
+  KMP_CPU_ALLOC(previousMask);
+  KMP_CPU_ZERO(previousMask);
+  int setSize = 0;
+
+  for (;;) {
+    __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
+
+    // valid follow sets are ',' ':' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0' || *scan == ',') {
+      if (setSize > 0) {
+        ADD_MASK(tempMask);
+      }
+      KMP_CPU_ZERO(tempMask);
+      setSize = 0;
+      if (*scan == '\0') {
+        break;
+      }
+      scan++; // skip ','
+      continue;
+    }
 
-        //
-        // valid follow sets are ',' ':' and EOL
-        //
-        SKIP_WS(scan);
-        if (*scan == '\0' || *scan == ',') {
-            stride = +1;
-        }
-        else {
-            KMP_ASSERT2(*scan == ':', "bad explicit places list");
-            scan++;         // skip ':'
-
-            //
-            // Read stride parameter
-            //
-            sign = +1;
-            for (;;) {
-                SKIP_WS(scan);
-                if (*scan == '+') {
-                    scan++; // skip '+'
-                    continue;
-                }
-                if (*scan == '-') {
-                    sign *= -1;
-                    scan++; // skip '-'
-                    continue;
-                }
-                break;
-            }
-            SKIP_WS(scan);
-            KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
-              "bad explicit places list");
-            next = scan;
-            SKIP_DIGITS(next);
-            stride = __kmp_str_to_int(scan, *next);
-            KMP_DEBUG_ASSERT(stride >= 0);
-            scan = next;
-            stride *= sign;
-        }
+    KMP_ASSERT2(*scan == ':', "bad explicit places list");
+    scan++; // skip ':'
 
-        // Add places determined by initial_place : count : stride
-        for (i = 0; i < count; i++) {
-            if (setSize == 0) {
-                break;
-            }
-            // Add the current place, then build the next place (tempMask) from that
-            KMP_CPU_COPY(previousMask, tempMask);
-            ADD_MASK(previousMask);
-            KMP_CPU_ZERO(tempMask);
-            setSize = 0;
-            KMP_CPU_SET_ITERATE(j, previousMask) {
-                if (! KMP_CPU_ISSET(j, previousMask)) {
-                    continue;
-                }
-                if ((j+stride > maxOsId) || (j+stride < 0) ||
-                  (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
-                  (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
-                    if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
-                      && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
-                        KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
-                    }
-                    continue;
-                }
-                KMP_CPU_SET(j+stride, tempMask);
-                setSize++;
-            }
-        }
-        KMP_CPU_ZERO(tempMask);
-        setSize = 0;
+    // Read count parameter
+    SKIP_WS(scan);
+    KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
+    next = scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(count >= 0);
+    scan = next;
+
+    // valid follow sets are ',' ':' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0' || *scan == ',') {
+      stride = +1;
+    } else {
+      KMP_ASSERT2(*scan == ':', "bad explicit places list");
+      scan++; // skip ':'
 
-        //
-        // valid follow sets are ',' and EOL
-        //
+      // Read stride parameter
+      sign = +1;
+      for (;;) {
         SKIP_WS(scan);
-        if (*scan == '\0') {
-            break;
-        }
-        if (*scan == ',') {
-            scan++;     // skip ','
-            continue;
+        if (*scan == '+') {
+          scan++; // skip '+'
+          continue;
+        }
+        if (*scan == '-') {
+          sign *= -1;
+          scan++; // skip '-'
+          continue;
         }
+        break;
+      }
+      SKIP_WS(scan);
+      KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
+      next = scan;
+      SKIP_DIGITS(next);
+      stride = __kmp_str_to_int(scan, *next);
+      KMP_DEBUG_ASSERT(stride >= 0);
+      scan = next;
+      stride *= sign;
+    }
 
-        KMP_ASSERT2(0, "bad explicit places list");
+    // Add places determined by initial_place : count : stride
+    for (i = 0; i < count; i++) {
+      if (setSize == 0) {
+        break;
+      }
+      // Add the current place, then build the next place (tempMask) from that
+      KMP_CPU_COPY(previousMask, tempMask);
+      ADD_MASK(previousMask);
+      KMP_CPU_ZERO(tempMask);
+      setSize = 0;
+      KMP_CPU_SET_ITERATE(j, previousMask) {
+        if (!KMP_CPU_ISSET(j, previousMask)) {
+          continue;
+        }
+        if ((j + stride > maxOsId) || (j + stride < 0) ||
+            (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
+            (!KMP_CPU_ISSET(j + stride,
+                            KMP_CPU_INDEX(osId2Mask, j + stride)))) {
+          if ((__kmp_affinity_verbose ||
+               (__kmp_affinity_warnings &&
+                (__kmp_affinity_type != affinity_none))) &&
+              i < count - 1) {
+            KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
+          }
+          continue;
+        }
+        KMP_CPU_SET(j + stride, tempMask);
+        setSize++;
+      }
     }
+    KMP_CPU_ZERO(tempMask);
+    setSize = 0;
 
-    *out_numMasks = nextNewMask;
-    if (nextNewMask == 0) {
-        *out_masks = NULL;
-        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
-        return;
+    // valid follow sets are ',' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0') {
+      break;
     }
-    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
-    KMP_CPU_FREE(tempMask);
-    KMP_CPU_FREE(previousMask);
-    for(i = 0; i < nextNewMask; i++) {
-        kmp_affin_mask_t* src  = KMP_CPU_INDEX(newMasks, i);
-        kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
-        KMP_CPU_COPY(dest, src);
+    if (*scan == ',') {
+      scan++; // skip ','
+      continue;
     }
+
+    KMP_ASSERT2(0, "bad explicit places list");
+  }
+
+  *out_numMasks = nextNewMask;
+  if (nextNewMask == 0) {
+    *out_masks = NULL;
     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+    return;
+  }
+  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+  KMP_CPU_FREE(tempMask);
+  KMP_CPU_FREE(previousMask);
+  for (i = 0; i < nextNewMask; i++) {
+    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
+    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
+    KMP_CPU_COPY(dest, src);
+  }
+  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
 }
 
-# endif /* OMP_40_ENABLED */
+#endif /* OMP_40_ENABLED */
 
 #undef ADD_MASK
 #undef ADD_MASK_OSID
 
 #if KMP_USE_HWLOC
-static int
-__kmp_hwloc_count_children_by_type(
-    hwloc_topology_t t, hwloc_obj_t o, hwloc_obj_type_t type, hwloc_obj_t* f)
-{
-    if (!hwloc_compare_types(o->type, type)) {
-      if (*f == NULL)
-        *f = o; // output first descendant found
-      return 1;
-    }
-    int sum = 0;
-    for (unsigned i = 0; i < o->arity; i++)
-      sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
-    return sum; // will be 0 if no one found (as PU arity is 0)
+static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
+                                              hwloc_obj_type_t type,
+                                              hwloc_obj_t* f) {
+  if (!hwloc_compare_types(o->type, type)) {
+    if (*f == NULL)
+      *f = o; // output first descendant found
+    return 1;
+  }
+  int sum = 0;
+  for (unsigned i = 0; i < o->arity; i++)
+    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
+  return sum; // will be 0 if none found (as PU arity is 0)
 }
 
-static int
-__kmp_hwloc_count_children_by_depth(
-    hwloc_topology_t t, hwloc_obj_t o, unsigned depth, hwloc_obj_t* f)
-{
-    if (o->depth == depth) {
-      if (*f == NULL)
-        *f = o; // output first descendant found
-      return 1;
-    }
-    int sum = 0;
-    for (unsigned i = 0; i < o->arity; i++)
-      sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
-    return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
-static int
-__kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o)
-{ // skip PUs descendants of the object o
-    int skipped = 0;
-    hwloc_obj_t hT = NULL;
-    int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
-    for (int i = 0; i < N; ++i) {
-      KMP_DEBUG_ASSERT(hT);
-      unsigned idx = hT->os_index;
-      if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
-        KMP_CPU_CLR(idx, __kmp_affin_fullMask);
-        KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
-        ++skipped;
-      }
-      hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
-    }
-    return skipped; // count number of skipped units
-}
-
-static int
-__kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o)
-{ // check if obj has PUs present in fullMask
-    hwloc_obj_t hT = NULL;
-    int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
-    for (int i = 0; i < N; ++i) {
-      KMP_DEBUG_ASSERT(hT);
-      unsigned idx = hT->os_index;
-      if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
-        return 1; // found PU
-      hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
-    }
-    return 0; // no PUs found
+static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
+                                               hwloc_obj_t o, unsigned depth,
+                                               hwloc_obj_t* f) {
+  if (o->depth == depth) {
+    if (*f == NULL)
+      *f = o; // output first descendant found
+    return 1;
+  }
+  int sum = 0;
+  for (unsigned i = 0; i < o->arity; i++)
+    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
+  return sum; // will be 0 if none found (as PU arity is 0)
+}
+
+static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
+  // skip PUs descendants of the object o
+  int skipped = 0;
+  hwloc_obj_t hT = NULL;
+  int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
+  for (int i = 0; i < N; ++i) {
+    KMP_DEBUG_ASSERT(hT);
+    unsigned idx = hT->os_index;
+    if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+      KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+      KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+      ++skipped;
+    }
+    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
+  }
+  return skipped; // count number of skipped units
+}
+
+static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
+  // check if obj has PUs present in fullMask
+  hwloc_obj_t hT = NULL;
+  int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
+  for (int i = 0; i < N; ++i) {
+    KMP_DEBUG_ASSERT(hT);
+    unsigned idx = hT->os_index;
+    if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
+      return 1; // found PU
+    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
+  }
+  return 0; // no PUs found
 }
 #endif // KMP_USE_HWLOC
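A minimal usage sketch for these helpers, mirroring the calls made in the KMP_HW_SUBSET processing below (here tp and hC stand for an hwloc topology and a core object already obtained from it; this is illustrative, not part of the patch):

    hwloc_obj_t hT = NULL;
    // count the PUs under core hC; hT receives the first PU found
    int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
    // if the core is not part of the requested subset, drop its PUs from
    // __kmp_affin_fullMask and account for them as skipped
    int skipped = __kmp_hwloc_skip_PUs_obj(tp, hC);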
 
-static void
-__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
-{
-    AddrUnsPair *newAddr;
-    if (__kmp_hws_requested == 0)
-      goto _exit;   // no topology limiting actions requested, exit
+static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
+  AddrUnsPair *newAddr;
+  if (__kmp_hws_requested == 0)
+    goto _exit;   // no topology limiting actions requested, exit
 #if KMP_USE_HWLOC
-    if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
-      // Number of subobjects calculated dynamically, this works fine for
-      // any non-uniform topology.
-      // L2 cache objects are determined by depth, other objects - by type.
-      hwloc_topology_t tp = __kmp_hwloc_topology;
-      int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped
-      int nCr=0, nTr=0; // number of requested units
-      int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters
-      hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
-      int L2depth, idx;
-
-      // check support of extensions ----------------------------------
-      int numa_support = 0, tile_support = 0;
-      if (__kmp_pu_os_idx)
-        hT = hwloc_get_pu_obj_by_os_index(
-          tp, __kmp_pu_os_idx[__kmp_avail_proc - 1]);
-      else
-        hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
-      if (hT == NULL) { // something's gone wrong
-        KMP_WARNING(AffHWSubsetUnsupported);
+  if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
+    // The number of subobjects is calculated dynamically; this works fine
+    // for any non-uniform topology.
+    // L2 cache objects are determined by depth, other objects by type.
+    hwloc_topology_t tp = __kmp_hwloc_topology;
+    int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped
+    int nCr=0, nTr=0; // number of requested units
+    int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters
+    hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
+    int L2depth, idx;
+
+    // check support of extensions ----------------------------------
+    int numa_support = 0, tile_support = 0;
+    if (__kmp_pu_os_idx)
+      hT = hwloc_get_pu_obj_by_os_index(tp,
+                                        __kmp_pu_os_idx[__kmp_avail_proc - 1]);
+    else
+      hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
+    if (hT == NULL) { // something's gone wrong
+      KMP_WARNING(AffHWSubsetUnsupported);
+      goto _exit;
+    }
+    // check NUMA node
+    hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
+    hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
+    if (hN != NULL && hN->depth > hS->depth) {
+      numa_support = 1; // 1 in case socket includes node(s)
+    } else if (__kmp_hws_node.num > 0) {
+      // don't support sockets inside NUMA node (no such HW found for testing)
+      KMP_WARNING(AffHWSubsetUnsupported);
+      goto _exit;
+    }
+    // check L2 cache, get object by depth because of multiple caches
+    L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
+    hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
+    if (hL != NULL && __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
+                                                         &hC) > 1) {
+      tile_support = 1; // no point counting L2 if it has a single core
+    } else if (__kmp_hws_tile.num > 0) {
+      if (__kmp_hws_core.num == 0) {
+        __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
+        __kmp_hws_tile.num = 0;
+      } else {
+        // L2 and core are both requested, but represent same object
+        KMP_WARNING(AffHWSubsetInvalid);
         goto _exit;
       }
-      // check NUMA node
-      hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
-      hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
-      if (hN != NULL && hN->depth > hS->depth) {
-        numa_support = 1; // 1 in case socket includes node(s)
-      } else if (__kmp_hws_node.num > 0) {
-        // don't support sockets inside NUMA node (no such HW found for testing)
-        KMP_WARNING(AffHWSubsetUnsupported);
+    }
+    // end of check of extensions -----------------------------------
+
+    // fill in unset items, validate settings -----------------------
+    if (__kmp_hws_socket.num == 0)
+      __kmp_hws_socket.num = nPackages;    // use all available sockets
+    if (__kmp_hws_socket.offset >= nPackages) {
+      KMP_WARNING(AffHWSubsetManySockets);
+      goto _exit;
+    }
+    if (numa_support) {
+      int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
+                                                  &hN); // num nodes in socket
+      if (__kmp_hws_node.num == 0)
+        __kmp_hws_node.num = NN; // use all available nodes
+      if (__kmp_hws_node.offset >= NN) {
+        KMP_WARNING(AffHWSubsetManyNodes);
         goto _exit;
       }
-      // check L2 cahce, get object by depth because of multiple caches
-      L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
-      hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
-      if (hL != NULL && __kmp_hwloc_count_children_by_type(
-          tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
-        tile_support = 1; // no sense to count L2 if it includes single core
-      } else if (__kmp_hws_tile.num > 0) {
-        if (__kmp_hws_core.num == 0) {
-          __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
-          __kmp_hws_tile.num = 0;
-        } else {
-          // L2 and core are both requested, but represent same object
-          KMP_WARNING(AffHWSubsetInvalid);
+      if (tile_support) {
+        // get num tiles in node
+        int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
+        if (__kmp_hws_tile.num == 0) {
+          __kmp_hws_tile.num = NL + 1;
+        } // use all available tiles, some node may have more tiles, thus +1
+        if (__kmp_hws_tile.offset >= NL) {
+          KMP_WARNING(AffHWSubsetManyTiles);
           goto _exit;
         }
-      }
-      // end of check of extensions -----------------------------------
-
-      // fill in unset items, validate settings -----------------------
-      if (__kmp_hws_socket.num == 0)
-        __kmp_hws_socket.num = nPackages;    // use all available sockets
-      if (__kmp_hws_socket.offset >= nPackages) {
-          KMP_WARNING(AffHWSubsetManySockets);
+        int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in tile
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC;   // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
           goto _exit;
-      }
-      if (numa_support) {
-        int NN = __kmp_hwloc_count_children_by_type(
-          tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in socket
-        if (__kmp_hws_node.num == 0)
-          __kmp_hws_node.num = NN; // use all available nodes
-        if (__kmp_hws_node.offset >= NN) {
-          KMP_WARNING(AffHWSubsetManyNodes);
+        }
+      } else { // tile_support
+        int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in node
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC;   // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
           goto _exit;
         }
-        if (tile_support) {
-          // get num tiles in node
-          int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
-          if (__kmp_hws_tile.num == 0) {
-            __kmp_hws_tile.num = NL + 1;
-          } // use all available tiles, some node may have more tiles, thus +1
-          if (__kmp_hws_tile.offset >= NL) {
-            KMP_WARNING(AffHWSubsetManyTiles);
-            goto _exit;
-          }
-          int NC = __kmp_hwloc_count_children_by_type(
-            tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile
-          if (__kmp_hws_core.num == 0)
-            __kmp_hws_core.num = NC;   // use all available cores
-          if (__kmp_hws_core.offset >= NC) {
-            KMP_WARNING(AffHWSubsetManyCores);
-            goto _exit;
-          }
-        } else { // tile_support
-          int NC = __kmp_hwloc_count_children_by_type(
-            tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in node
-          if (__kmp_hws_core.num == 0)
-            __kmp_hws_core.num = NC;   // use all available cores
-          if (__kmp_hws_core.offset >= NC) {
-            KMP_WARNING(AffHWSubsetManyCores);
-            goto _exit;
-          }
-        } // tile_support
-      } else { // numa_support
-        if (tile_support) {
-          // get num tiles in socket
-          int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
-          if (__kmp_hws_tile.num == 0)
-            __kmp_hws_tile.num = NL; // use all available tiles
-          if (__kmp_hws_tile.offset >= NL) {
-            KMP_WARNING(AffHWSubsetManyTiles);
-            goto _exit;
-          }
-          int NC = __kmp_hwloc_count_children_by_type(
-            tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile
-          if (__kmp_hws_core.num == 0)
-            __kmp_hws_core.num = NC;   // use all available cores
-          if (__kmp_hws_core.offset >= NC) {
-            KMP_WARNING(AffHWSubsetManyCores);
-            goto _exit;
-          }
-        } else { // tile_support
-          int NC = __kmp_hwloc_count_children_by_type(
-            tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket
-          if (__kmp_hws_core.num == 0)
-            __kmp_hws_core.num = NC;   // use all available cores
-          if (__kmp_hws_core.offset >= NC) {
-            KMP_WARNING(AffHWSubsetManyCores);
-            goto _exit;
-          }
-        } // tile_support
-      }
-      if (__kmp_hws_proc.num == 0)
-        __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
-      if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
-        KMP_WARNING(AffHWSubsetManyProcs);
-        goto _exit;
+      } // tile_support
+    } else { // numa_support
+      if (tile_support) {
+        // get num tiles in socket
+        int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
+        if (__kmp_hws_tile.num == 0)
+          __kmp_hws_tile.num = NL; // use all available tiles
+        if (__kmp_hws_tile.offset >= NL) {
+          KMP_WARNING(AffHWSubsetManyTiles);
+          goto _exit;
+        }
+        int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in tile
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC;   // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
+          goto _exit;
+        }
+      } else { // tile_support
+        int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in socket
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC;   // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
+          goto _exit;
+        }
+      } // tile_support
+    }
+    if (__kmp_hws_proc.num == 0)
+      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
+    if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
+      KMP_WARNING(AffHWSubsetManyProcs);
+      goto _exit;
+    }
+    // end of validation --------------------------------------------
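    // Worked example (illustrative, not taken from the patch): with
    // KMP_HW_SUBSET=2s,8c,2t the checks above keep 2 sockets, 8 cores per
    // socket and 2 threads per core; the socket/core/PU loops below then
    // clear the PUs outside that subset from __kmp_affin_fullMask and
    // compact the address array (newAddr) accordingly.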
+
+    if (pAddr) // pAddr is NULL in case of affinity_none
+      newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
+                                              __kmp_avail_proc); // max size
+    // main loop to form HW subset ----------------------------------
+    hS = NULL;
+    int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
+    for (int s = 0; s < NP; ++s) {
+      // Check Socket -----------------------------------------------
+      hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
+      if (!__kmp_hwloc_obj_has_PUs(tp, hS))
+        continue; // skip socket if all PUs are out of fullMask
+      ++nS; // only count objects that have PUs in the affinity mask
+      if (nS <= __kmp_hws_socket.offset ||
+          nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
+        n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
+        continue; // move to next socket
       }
-      // end of validation --------------------------------------------
-
-      if (pAddr) // pAddr is NULL in case of affinity_none
-        newAddr = (AddrUnsPair *)__kmp_allocate(
-          sizeof(AddrUnsPair) * __kmp_avail_proc); // max size
-      // main loop to form HW subset ----------------------------------
-      hS = NULL;
-      int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
-      for (int s = 0; s < NP; ++s) {
-        // Check Socket -----------------------------------------------
-        hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
-        if (!__kmp_hwloc_obj_has_PUs(tp, hS))
-          continue; // skip socket if all PUs are out of fullMask
-        ++nS; // only count objects those have PUs in affinity mask
-        if (nS <= __kmp_hws_socket.offset ||
-            nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
-          n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
-          continue; // move to next socket
-        }
-        nCr = 0; // count number of cores per socket
-        // socket requested, go down the topology tree
-        // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
-        if (numa_support) {
-          nN = 0;
-          hN = NULL;
-          int NN = __kmp_hwloc_count_children_by_type(
-            tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in current socket
-          for (int n = 0; n < NN; ++n) {
-            // Check NUMA Node ----------------------------------------
-            if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
-              hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
-              continue; // skip node if all PUs are out of fullMask
-            }
-            ++nN;
-            if (nN <= __kmp_hws_node.offset ||
-                nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
-              // skip node as not requested
-              n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
-              hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
-              continue; // move to next node
-            }
-            // node requested, go down the topology tree
-            if (tile_support) {
-              nL = 0;
-              hL = NULL;
-              int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
-              for (int l = 0; l < NL; ++l) {
-                // Check L2 (tile) ------------------------------------
-                if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
-                  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
-                  continue; // skip tile if all PUs are out of fullMask
-                }
-                ++nL;
-                if (nL <= __kmp_hws_tile.offset ||
-                    nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
-                  // skip tile as not requested
-                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
-                  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
-                  continue; // move to next tile
-                }
-                // tile requested, go down the topology tree
-                nC = 0;
-                hC = NULL;
-                int NC = __kmp_hwloc_count_children_by_type(
-                  tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in current tile
-                for (int c = 0; c < NC; ++c) {
-                  // Check Core ---------------------------------------
-                  if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
-                    hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
-                    continue; // skip core if all PUs are out of fullMask
-                  }
-                  ++nC;
-                  if (nC <= __kmp_hws_core.offset ||
-                      nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
-                    // skip node as not requested
-                    n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
-                    hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
-                    continue; // move to next node
-                  }
-                  // core requested, go down to PUs
-                  nT = 0;
-                  nTr = 0;
-                  hT = NULL;
-                  int NT = __kmp_hwloc_count_children_by_type(
-                    tp, hC, HWLOC_OBJ_PU, &hT); // num procs in current core
-                  for (int t = 0; t < NT; ++t) {
-                    // Check PU ---------------------------------------
-                    idx = hT->os_index;
-                    if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
-                      hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
-                      continue; // skip PU if not in fullMask
-                    }
-                    ++nT;
-                    if (nT <= __kmp_hws_proc.offset ||
-                        nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
-                      // skip PU
-                      KMP_CPU_CLR(idx, __kmp_affin_fullMask);
-                      ++n_old;
-                      KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
-                      hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
-                      continue; // move to next node
-                    }
-                    ++nTr;
-                    if (pAddr) // collect requested thread's data
-                      newAddr[n_new] = (*pAddr)[n_old];
-                    ++n_new;
-                    ++n_old;
-                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
-                  } // threads loop
-                  if (nTr > 0) {
-                    ++nCr; // num cores per socket
-                    ++nCo; // total num cores
-                    if (nTr > nTpC)
-                      nTpC = nTr; // calc max threads per core
-                  }
-                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
-                } // cores loop
-                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
-              } // tiles loop
-            } else { // tile_support
-              // no tiles, check cores
-              nC = 0;
-              hC = NULL;
-              int NC = __kmp_hwloc_count_children_by_type(
-                tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in current node
-              for (int c = 0; c < NC; ++c) {
-                // Check Core ---------------------------------------
-                if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
-                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
-                  continue; // skip core if all PUs are out of fullMask
-                }
-                ++nC;
-                if (nC <= __kmp_hws_core.offset ||
-                    nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
-                  // skip node as not requested
-                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
-                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
-                  continue; // move to next node
-                }
-                // core requested, go down to PUs
-                nT = 0;
-                nTr = 0;
-                hT = NULL;
-                int NT = __kmp_hwloc_count_children_by_type(
-                  tp, hC, HWLOC_OBJ_PU, &hT);
-                for (int t = 0; t < NT; ++t) {
-                  // Check PU ---------------------------------------
-                  idx = hT->os_index;
-                  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
-                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
-                    continue; // skip PU if not in fullMask
-                  }
-                  ++nT;
-                  if (nT <= __kmp_hws_proc.offset ||
-                      nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
-                    // skip PU
-                    KMP_CPU_CLR(idx, __kmp_affin_fullMask);
-                    ++n_old;
-                    KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
-                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
-                    continue; // move to next node
-                  }
-                  ++nTr;
-                  if (pAddr) // collect requested thread's data
-                    newAddr[n_new] = (*pAddr)[n_old];
-                  ++n_new;
-                  ++n_old;
-                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
-                } // threads loop
-                if (nTr > 0) {
-                  ++nCr; // num cores per socket
-                  ++nCo; // total num cores
-                  if (nTr > nTpC)
-                    nTpC = nTr; // calc max threads per core
-                }
-                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
-              } // cores loop
-            } // tiles support
+      nCr = 0; // count number of cores per socket
+      // socket requested, go down the topology tree
+      // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
+      if (numa_support) {
+        nN = 0;
+        hN = NULL;
+        // num nodes in current socket
+        int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
+                                                    &hN);
+        for (int n = 0; n < NN; ++n) {
+          // Check NUMA Node ----------------------------------------
+          if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
+            hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+            continue; // skip node if all PUs are out of fullMask
+          }
+          ++nN;
+          if (nN <= __kmp_hws_node.offset ||
+              nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
+            // skip node as not requested
+            n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
-          } // nodes loop
-        } else { // numa_support
-          // no NUMA support
+            continue; // move to next node
+          }
+          // node requested, go down the topology tree
           if (tile_support) {
             nL = 0;
             hL = NULL;
-            int NL = __kmp_hwloc_count_children_by_depth(
-              tp, hS, L2depth, &hL); // num tiles in current socket
+            int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
             for (int l = 0; l < NL; ++l) {
               // Check L2 (tile) ------------------------------------
               if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
@@ -3811,8 +3292,9 @@ __kmp_apply_thread_places(AddrUnsPair **
               // tile requested, go down the topology tree
               nC = 0;
               hC = NULL;
-              int NC = __kmp_hwloc_count_children_by_type(
-                tp, hL, HWLOC_OBJ_CORE, &hC); // num cores per tile
+              // num cores in current tile
+              int NC = __kmp_hwloc_count_children_by_type(tp, hL,
+                                                          HWLOC_OBJ_CORE, &hC);
               for (int c = 0; c < NC; ++c) {
                 // Check Core ---------------------------------------
                 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
@@ -3831,8 +3313,9 @@ __kmp_apply_thread_places(AddrUnsPair **
                 nT = 0;
                 nTr = 0;
                 hT = NULL;
-                int NT = __kmp_hwloc_count_children_by_type(
-                  tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core
+                // num procs in current core
+                int NT = __kmp_hwloc_count_children_by_type(tp, hC,
+                                                            HWLOC_OBJ_PU, &hT);
                 for (int t = 0; t < NT; ++t) {
                   // Check PU ---------------------------------------
                   idx = hT->os_index;
@@ -3871,10 +3354,11 @@ __kmp_apply_thread_places(AddrUnsPair **
             // no tiles, check cores
             nC = 0;
             hC = NULL;
-            int NC = __kmp_hwloc_count_children_by_type(
-              tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket
+            // num cores in current node
+            int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
+                                                        &hC);
             for (int c = 0; c < NC; ++c) {
-              // Check Core -------------------------------------------
+              // Check Core ---------------------------------------
               if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                 continue; // skip core if all PUs are out of fullMask
@@ -3891,8 +3375,8 @@ __kmp_apply_thread_places(AddrUnsPair **
               nT = 0;
               nTr = 0;
               hT = NULL;
-              int NT = __kmp_hwloc_count_children_by_type(
-                tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core
+              int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU,
+                                                          &hT);
               for (int t = 0; t < NT; ++t) {
                 // Check PU ---------------------------------------
                 idx = hT->os_index;
@@ -3926,85 +3410,232 @@ __kmp_apply_thread_places(AddrUnsPair **
               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
             } // cores loop
           } // tiles support
-        } // numa_support
-        if (nCr > 0) { // found cores?
-          ++nPkg; // num sockets
-          if (nCr > nCpP)
-            nCpP = nCr; // calc max cores per socket
-        }
-      } // sockets loop
-
-      // check the subset is valid
-      KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
-      KMP_DEBUG_ASSERT(nPkg > 0);
-      KMP_DEBUG_ASSERT(nCpP > 0);
-      KMP_DEBUG_ASSERT(nTpC > 0);
-      KMP_DEBUG_ASSERT(nCo > 0);
-      KMP_DEBUG_ASSERT(nPkg <= nPackages);
-      KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
-      KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
-      KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
-
-      nPackages = nPkg;             // correct num sockets
-      nCoresPerPkg = nCpP;          // correct num cores per socket
-      __kmp_nThreadsPerCore = nTpC; // correct num threads per core
-      __kmp_avail_proc = n_new;     // correct num procs
-      __kmp_ncores = nCo;           // correct num cores
-      // hwloc topology method end
-    } else
-#endif // KMP_USE_HWLOC
-    {
-      int n_old = 0, n_new = 0, proc_num = 0;
-      if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
-        KMP_WARNING(AffHWSubsetNoHWLOC);
-        goto _exit;
+          hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+        } // nodes loop
+      } else { // numa_support
+        // no NUMA support
+        if (tile_support) {
+          nL = 0;
+          hL = NULL;
+          // num tiles in current socket
+          int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
+          for (int l = 0; l < NL; ++l) {
+            // Check L2 (tile) ------------------------------------
+            if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
+              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+              continue; // skip tile if all PUs are out of fullMask
+            }
+            ++nL;
+            if (nL <= __kmp_hws_tile.offset ||
+                nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
+              // skip tile as not requested
+              n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
+              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+              continue; // move to next tile
+            }
+            // tile requested, go down the topology tree
+            nC = 0;
+            hC = NULL;
+            // num cores per tile
+            int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
+                                                        &hC);
+            for (int c = 0; c < NC; ++c) {
+              // Check Core ---------------------------------------
+              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // skip core if all PUs are out of fullMask
+              }
+              ++nC;
+              if (nC <= __kmp_hws_core.offset ||
+                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                // skip core as not requested
+                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // move to next core
+              }
+              // core requested, go down to PUs
+              nT = 0;
+              nTr = 0;
+              hT = NULL;
+              // num procs per core
+              int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU,
+                                                          &hT);
+              for (int t = 0; t < NT; ++t) {
+                // Check PU ---------------------------------------
+                idx = hT->os_index;
+                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // skip PU if not in fullMask
+                }
+                ++nT;
+                if (nT <= __kmp_hws_proc.offset ||
+                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                  // skip PU
+                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                  ++n_old;
+                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // move to next PU
+                }
+                ++nTr;
+                if (pAddr) // collect requested thread's data
+                  newAddr[n_new] = (*pAddr)[n_old];
+                ++n_new;
+                ++n_old;
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+              } // threads loop
+              if (nTr > 0) {
+                ++nCr; // num cores per socket
+                ++nCo; // total num cores
+                if (nTr > nTpC)
+                  nTpC = nTr; // calc max threads per core
+              }
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+            } // cores loop
+            hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+          } // tiles loop
+        } else { // tile_support
+          // no tiles, check cores
+          nC = 0;
+          hC = NULL;
+          // num cores in socket
+          int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
+                                                      &hC);
+          for (int c = 0; c < NC; ++c) {
+            // Check Core -------------------------------------------
+            if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              continue; // skip core if all PUs are out of fullMask
+            }
+            ++nC;
+            if (nC <= __kmp_hws_core.offset ||
+                nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+              // skip core as not requested
+              n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              continue; // move to next core
+            }
+            // core requested, go down to PUs
+            nT = 0;
+            nTr = 0;
+            hT = NULL;
+            // num procs per core
+            int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU,
+                                                        &hT);
+            for (int t = 0; t < NT; ++t) {
+              // Check PU ---------------------------------------
+              idx = hT->os_index;
+              if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                continue; // skip PU if not in fullMask
+              }
+              ++nT;
+              if (nT <= __kmp_hws_proc.offset ||
+                  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                // skip PU
+                KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                ++n_old;
+                KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                continue; // move to next node
+              }
+              ++nTr;
+              if (pAddr) // collect requested thread's data
+                newAddr[n_new] = (*pAddr)[n_old];
+              ++n_new;
+              ++n_old;
+              hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+            } // threads loop
+            if (nTr > 0) {
+              ++nCr; // num cores per socket
+              ++nCo; // total num cores
+              if (nTr > nTpC)
+                nTpC = nTr; // calc max threads per core
+            }
+            hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+          } // cores loop
+        } // tiles support
+      } // numa_support
+      if (nCr > 0) { // found cores?
+        ++nPkg; // num sockets
+        if (nCr > nCpP)
+          nCpP = nCr; // calc max cores per socket
       }
-      if (__kmp_hws_socket.num == 0)
-        __kmp_hws_socket.num = nPackages;    // use all available sockets
-      if (__kmp_hws_core.num == 0)
-        __kmp_hws_core.num = nCoresPerPkg;   // use all available cores
-      if (__kmp_hws_proc.num == 0 ||
+    } // sockets loop
+
+    // check the subset is valid
+    KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
+    KMP_DEBUG_ASSERT(nPkg > 0);
+    KMP_DEBUG_ASSERT(nCpP > 0);
+    KMP_DEBUG_ASSERT(nTpC > 0);
+    KMP_DEBUG_ASSERT(nCo > 0);
+    KMP_DEBUG_ASSERT(nPkg <= nPackages);
+    KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
+    KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
+    KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
+
+    nPackages = nPkg;             // correct num sockets
+    nCoresPerPkg = nCpP;          // correct num cores per socket
+    __kmp_nThreadsPerCore = nTpC; // correct num threads per core
+    __kmp_avail_proc = n_new;     // correct num procs
+    __kmp_ncores = nCo;           // correct num cores
+    // hwloc topology method end
+  } else
+#endif // KMP_USE_HWLOC
+  {
+    int n_old = 0, n_new = 0, proc_num = 0;
+    if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
+      KMP_WARNING(AffHWSubsetNoHWLOC);
+      goto _exit;
+    }
+    if (__kmp_hws_socket.num == 0)
+      __kmp_hws_socket.num = nPackages;    // use all available sockets
+    if (__kmp_hws_core.num == 0)
+      __kmp_hws_core.num = nCoresPerPkg;   // use all available cores
+    if (__kmp_hws_proc.num == 0 ||
         __kmp_hws_proc.num > __kmp_nThreadsPerCore)
-        __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
-      if ( !__kmp_affinity_uniform_topology() ) {
-        KMP_WARNING( AffHWSubsetNonUniform );
-        goto _exit; // don't support non-uniform topology
-      }
-      if ( depth > 3 ) {
-        KMP_WARNING( AffHWSubsetNonThreeLevel );
-        goto _exit; // don't support not-3-level topology
-      }
-      if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
-        KMP_WARNING(AffHWSubsetManySockets);
-        goto _exit;
-      }
-      if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) {
-        KMP_WARNING( AffHWSubsetManyCores );
-        goto _exit;
-      }
-      // Form the requested subset
-      if (pAddr) // pAddr is NULL in case of affinity_none
-        newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
-          __kmp_hws_socket.num * __kmp_hws_core.num * __kmp_hws_proc.num);
-      for (int i = 0; i < nPackages; ++i) {
-        if (i < __kmp_hws_socket.offset ||
-            i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
-          // skip not-requested socket
-          n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
-          if (__kmp_pu_os_idx != NULL) {
-            // walk through skipped socket
-            for (int j = 0; j < nCoresPerPkg; ++j) {
-              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
-                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
-                ++proc_num;
-              }
+      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
+    if (!__kmp_affinity_uniform_topology()) {
+      KMP_WARNING(AffHWSubsetNonUniform);
+      goto _exit; // don't support non-uniform topology
+    }
+    if (depth > 3) {
+      KMP_WARNING(AffHWSubsetNonThreeLevel);
+      goto _exit; // don't support non-3-level topology
+    }
+    if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
+      KMP_WARNING(AffHWSubsetManySockets);
+      goto _exit;
+    }
+    if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
+      KMP_WARNING(AffHWSubsetManyCores);
+      goto _exit;
+    }
+    // Form the requested subset
+    if (pAddr) // pAddr is NULL in case of affinity_none
+      newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
+                                              __kmp_hws_socket.num *
+                                              __kmp_hws_core.num *
+                                              __kmp_hws_proc.num);
+    for (int i = 0; i < nPackages; ++i) {
+      if (i < __kmp_hws_socket.offset ||
+          i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
+        // skip not-requested socket
+        n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
+        if (__kmp_pu_os_idx != NULL) {
+          // walk through skipped socket
+          for (int j = 0; j < nCoresPerPkg; ++j) {
+            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+              KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+              ++proc_num;
             }
           }
-        } else {
-          // walk through requested socket
-          for (int j = 0; j < nCoresPerPkg; ++j) {
-            if (j < __kmp_hws_core.offset ||
-                j >= __kmp_hws_core.offset + __kmp_hws_core.num)
+        }
+      } else {
+        // walk through requested socket
+        for (int j = 0; j < nCoresPerPkg; ++j) {
+          if (j < __kmp_hws_core.offset ||
+              j >= __kmp_hws_core.offset + __kmp_hws_core.num)
             { // skip not-requested core
               n_old += __kmp_nThreadsPerCore;
               if (__kmp_pu_os_idx != NULL) {
@@ -4014,1428 +3645,1350 @@ __kmp_apply_thread_places(AddrUnsPair **
                 }
               }
             } else {
-              // walk through requested core
-              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
-                if (k < __kmp_hws_proc.num) {
-                  if (pAddr) // collect requested thread's data
-                    newAddr[n_new] = (*pAddr)[n_old];
-                  n_new++;
-                } else {
-                  if (__kmp_pu_os_idx != NULL)
-                    KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
-                }
-                n_old++;
-                ++proc_num;
+            // walk through requested core
+            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+              if (k < __kmp_hws_proc.num) {
+                if (pAddr) // collect requested thread's data
+                  newAddr[n_new] = (*pAddr)[n_old];
+                n_new++;
+              } else {
+                if (__kmp_pu_os_idx != NULL)
+                  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
               }
+              n_old++;
+              ++proc_num;
             }
           }
         }
-      }
-      KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
-      KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num *
-                       __kmp_hws_proc.num);
-      nPackages = __kmp_hws_socket.num;           // correct nPackages
-      nCoresPerPkg = __kmp_hws_core.num;          // correct nCoresPerPkg
-      __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
-      __kmp_avail_proc = n_new;                   // correct avail_proc
-      __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
-    } // non-hwloc topology method
-    if (pAddr) {
-      __kmp_free( *pAddr );
-      *pAddr = newAddr;      // replace old topology with new one
-    }
-    if (__kmp_affinity_verbose) {
-      char m[KMP_AFFIN_MASK_PRINT_LEN];
-      __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask);
-      if (__kmp_affinity_respect_mask) {
-        KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
-      } else {
-        KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
-      }
-      KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
-      kmp_str_buf_t buf;
-      __kmp_str_buf_init(&buf);
-      __kmp_str_buf_print(&buf, "%d", nPackages);
-      KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
-        __kmp_nThreadsPerCore, __kmp_ncores);
-      __kmp_str_buf_free(&buf);
+      }
     }
-_exit:
-    if (__kmp_pu_os_idx != NULL) {
-      __kmp_free(__kmp_pu_os_idx);
-      __kmp_pu_os_idx = NULL;
+    KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
+    KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num *
+                     __kmp_hws_proc.num);
+    nPackages = __kmp_hws_socket.num;           // correct nPackages
+    nCoresPerPkg = __kmp_hws_core.num;          // correct nCoresPerPkg
+    __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
+    __kmp_avail_proc = n_new;                   // correct avail_proc
+    __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
+  } // non-hwloc topology method
+  if (pAddr) {
+    __kmp_free(*pAddr);
+    *pAddr = newAddr; // replace old topology with new one
+  }
+  if (__kmp_affinity_verbose) {
+    char m[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
+                              __kmp_affin_fullMask);
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
     }
-}
-
-//
-// This function figures out the deepest level at which there is at least one cluster/core
-// with more than one processing unit bound to it.
-//
-static int
-__kmp_affinity_find_core_level(const AddrUnsPair *address2os, int nprocs, int bottom_level)
-{
-    int core_level = 0;
-
-    for( int i = 0; i < nprocs; i++ ) {
-        for( int j = bottom_level; j > 0; j-- ) {
-            if( address2os[i].first.labels[j] > 0 ) {
-                if( core_level < ( j - 1 ) ) {
-                    core_level = j - 1;
-                }
-            }
+    KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
+    __kmp_str_buf_print(&buf, "%d", nPackages);
+    KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+    __kmp_str_buf_free(&buf);
+  }
+ _exit:
+  if (__kmp_pu_os_idx != NULL) {
+    __kmp_free(__kmp_pu_os_idx);
+    __kmp_pu_os_idx = NULL;
+  }
+}
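
For readers following the KMP_HW_SUBSET filtering above: the loops keep a
socket/tile/core/PU only when its 1-based running count falls inside the
requested offset/num window. A minimal standalone sketch of that test, with an
assumed helper name and assumed example values (not the runtime's API):

#include <cstdio>

// Positive form of the test "count <= offset || count > num + offset" used in
// the skip branches above: keep the object only if it is one of the `num`
// objects that follow the first `offset` objects still present in the mask.
static bool in_requested_window(int count, int offset, int num) {
  return count > offset && count <= num + offset;
}

int main() {
  const int offset = 1, num = 2; // skip one object, then take the next two
  for (int count = 1; count <= 4; ++count)
    printf("object %d: %s\n", count,
           in_requested_window(count, offset, num) ? "keep" : "skip");
  return 0; // prints: skip, keep, keep, skip
}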
+
+// This function figures out the deepest level at which there is at least one
+// cluster/core with more than one processing unit bound to it.
+static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
+                                          int nprocs, int bottom_level) {
+  int core_level = 0;
+
+  for (int i = 0; i < nprocs; i++) {
+    for (int j = bottom_level; j > 0; j--) {
+      if (address2os[i].first.labels[j] > 0) {
+        if (core_level < (j - 1)) {
+          core_level = j - 1;
         }
+      }
     }
-    return core_level;
+  }
+  return core_level;
 }
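
The comment above describes the search in terms of the per-level labels stored
in each Address. A self-contained sketch of the same walk, using a plain 2-D
label array and an assumed 3-level example topology instead of the AddrUnsPair
type:

#include <cstdio>

// Any nonzero label at level j means the parent object at level j - 1 has more
// than one child bound to it, so the core level is at least j - 1.
static int find_core_level(const int labels[][3], int nprocs,
                           int bottom_level) {
  int core_level = 0;
  for (int i = 0; i < nprocs; i++)
    for (int j = bottom_level; j > 0; j--)
      if (labels[i][j] > 0 && core_level < j - 1)
        core_level = j - 1;
  return core_level;
}

int main() {
  // Assumed machine: 1 package, 2 cores, 2 HW threads per core.
  const int labels[4][3] = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}};
  printf("core level = %d\n", find_core_level(labels, 4, 2)); // prints 1
  return 0;
}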
 
-//
 // This function counts the number of clusters/cores at a given level.
-//
-static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
-{
-    int ncores = 0;
-    int i, j;
-
-    j = bottom_level;
-    for( i = 0; i < nprocs; i++ ) {
-        for ( j = bottom_level; j > core_level; j-- ) {
-            if( ( i + 1 ) < nprocs ) {
-                if( address2os[i + 1].first.labels[j] > 0 ) {
-                    break;
-                }
-            }
-        }
-        if( j == core_level ) {
-            ncores++;
+static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
+                                         int nprocs, int bottom_level,
+                                         int core_level) {
+  int ncores = 0;
+  int i, j;
+
+  j = bottom_level;
+  for (i = 0; i < nprocs; i++) {
+    for (j = bottom_level; j > core_level; j--) {
+      if ((i + 1) < nprocs) {
+        if (address2os[i + 1].first.labels[j] > 0) {
+          break;
         }
+      }
     }
-    if( j > core_level ) {
-        //
-        // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one core.
-        // May occur when called from __kmp_affinity_find_core().
-        //
-        ncores++;
+    if (j == core_level) {
+      ncores++;
     }
-    return ncores;
+  }
+  if (j > core_level) {
+    // In case of (nprocs < __kmp_avail_proc) we may end up too deep and miss
+    // one core. May occur when called from __kmp_affinity_find_core().
+    ncores++;
+  }
+  return ncores;
 }
 
-//
 // This function finds to which cluster/core a given processing unit is bound.
-//
-static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, int bottom_level, int core_level)
-{
-    return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, core_level) - 1;
-}
+static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
+                                    int bottom_level, int core_level) {
+  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
+                                       core_level) - 1;
+}
+
+// This function finds the maximal number of processing units bound to a
+// cluster/core at a given level.
+static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
+                                            int nprocs, int bottom_level,
+                                            int core_level) {
+  int maxprocpercore = 0;
+
+  if (core_level < bottom_level) {
+    for (int i = 0; i < nprocs; i++) {
+      int percore = address2os[i].first.labels[core_level + 1] + 1;
 
-//
-// This function finds maximal number of processing units bound to a cluster/core at given level.
-//
-static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
-{
-    int maxprocpercore = 0;
-
-    if( core_level < bottom_level ) {
-        for( int i = 0; i < nprocs; i++ ) {
-            int percore = address2os[i].first.labels[core_level + 1] + 1;
-
-            if( percore > maxprocpercore ) {
-                maxprocpercore = percore;
-            }
-       }
-    } else {
-        maxprocpercore = 1;
+      if (percore > maxprocpercore) {
+        maxprocpercore = percore;
+      }
     }
-    return maxprocpercore;
+  } else {
+    maxprocpercore = 1;
+  }
+  return maxprocpercore;
 }
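
The balanced-affinity case further down combines the two counts above into a
core-major procarr table sized ncores * maxprocpercore, with -1 marking slots
of cores that have fewer usable PUs. A hedged sketch of that layout on an
assumed small, non-uniform machine (all names and values here are
illustrative):

#include <cstdio>
#include <vector>

int main() {
  // Assumed counts: 2 cores, at most 2 PUs bound to any one core; the second
  // core has only one usable PU, so one slot stays -1.
  const int ncores = 2, maxprocpercore = 2;
  std::vector<int> procarr(ncores * maxprocpercore, -1);

  // (core, slot-within-core) -> OS proc id, mimicking the fill loop that the
  // affinity_balanced case performs with __kmp_affinity_find_core().
  procarr[0 * maxprocpercore + 0] = 0; // core 0, first PU
  procarr[0 * maxprocpercore + 1] = 1; // core 0, second PU
  procarr[1 * maxprocpercore + 0] = 2; // core 1, only PU

  for (int c = 0; c < ncores; ++c)
    for (int s = 0; s < maxprocpercore; ++s)
      printf("core %d slot %d -> %d\n", c, s, procarr[c * maxprocpercore + s]);
  return 0;
}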
 
 static AddrUnsPair *address2os = NULL;
-static int           * procarr = NULL;
-static int     __kmp_aff_depth = 0;
+static int *procarr = NULL;
+static int __kmp_aff_depth = 0;
 
-#define KMP_EXIT_AFF_NONE                             \
-    KMP_ASSERT(__kmp_affinity_type == affinity_none); \
-    KMP_ASSERT(address2os == NULL);                   \
-    __kmp_apply_thread_places(NULL, 0);               \
-    return;
-
-static int
-__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
-{
-    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
-      ->first);
-    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
-      ->first);
-    unsigned depth = aa->depth;
-    unsigned i;
-    KMP_DEBUG_ASSERT(depth == bb->depth);
-    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
-    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
-    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
-        int j = depth - i - 1;
-        if (aa->childNums[j] < bb->childNums[j]) return -1;
-        if (aa->childNums[j] > bb->childNums[j]) return 1;
-    }
-    for (; i < depth; i++) {
-        int j = i - __kmp_affinity_compact;
-        if (aa->childNums[j] < bb->childNums[j]) return -1;
-        if (aa->childNums[j] > bb->childNums[j]) return 1;
-    }
-    return 0;
+#define KMP_EXIT_AFF_NONE                                                      \
+  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
+  KMP_ASSERT(address2os == NULL);                                              \
+  __kmp_apply_thread_places(NULL, 0);                                          \
+  return;
+
+static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
+  const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
+  const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
+  unsigned depth = aa->depth;
+  unsigned i;
+  KMP_DEBUG_ASSERT(depth == bb->depth);
+  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
+  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
+  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
+    int j = depth - i - 1;
+    if (aa->childNums[j] < bb->childNums[j])
+      return -1;
+    if (aa->childNums[j] > bb->childNums[j])
+      return 1;
+  }
+  for (; i < depth; i++) {
+    int j = i - __kmp_affinity_compact;
+    if (aa->childNums[j] < bb->childNums[j])
+      return -1;
+    if (aa->childNums[j] > bb->childNums[j])
+      return 1;
+  }
+  return 0;
 }
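
To see the effect of __kmp_affinity_compact on the comparison above: the first
loop compares the innermost `compact` labels, the second the remaining outer
ones, so qsort effectively rotates which topology level is sorted first. A
standalone sketch with plain int labels and a hypothetical cmp_labels helper in
place of the Address machinery (the two-level data is an assumed example):

#include <cstdio>
#include <cstdlib>

static int g_compact = 1; // compare the innermost label first
static int g_depth = 2;   // {package, core} labels

static int cmp_labels(const void *a, const void *b) {
  const int *aa = (const int *)a, *bb = (const int *)b;
  for (int i = 0; i < g_compact; i++) { // innermost g_compact labels first
    int j = g_depth - i - 1;
    if (aa[j] != bb[j]) return aa[j] < bb[j] ? -1 : 1;
  }
  for (int i = g_compact; i < g_depth; i++) { // then the outer labels
    int j = i - g_compact;
    if (aa[j] != bb[j]) return aa[j] < bb[j] ? -1 : 1;
  }
  return 0;
}

int main() {
  int labels[4][2] = {{0, 0}, {0, 1}, {1, 0}, {1, 1}}; // {package, core}
  qsort(labels, 4, sizeof(labels[0]), cmp_labels);
  for (int i = 0; i < 4; i++) // packages now alternate (scatter-like order)
    printf("{%d,%d}\n", labels[i][0], labels[i][1]);
  return 0;
}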
 
-static void
-__kmp_aux_affinity_initialize(void)
-{
-    if (__kmp_affinity_masks != NULL) {
-        KMP_ASSERT(__kmp_affin_fullMask != NULL);
-        return;
-    }
-
-    //
-    // Create the "full" mask - this defines all of the processors that we
-    // consider to be in the machine model.  If respect is set, then it is
-    // the initialization thread's affinity mask.  Otherwise, it is all
-    // processors that we know about on the machine.
-    //
-    if (__kmp_affin_fullMask == NULL) {
-        KMP_CPU_ALLOC(__kmp_affin_fullMask);
-    }
-    if (KMP_AFFINITY_CAPABLE()) {
-        if (__kmp_affinity_respect_mask) {
-            __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
+static void __kmp_aux_affinity_initialize(void) {
+  if (__kmp_affinity_masks != NULL) {
+    KMP_ASSERT(__kmp_affin_fullMask != NULL);
+    return;
+  }
 
-            //
-            // Count the number of available processors.
-            //
-            unsigned i;
-            __kmp_avail_proc = 0;
-            KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
-                if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
-                    continue;
-                }
-                __kmp_avail_proc++;
-            }
-            if (__kmp_avail_proc > __kmp_xproc) {
-                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-                  && (__kmp_affinity_type != affinity_none))) {
-                    KMP_WARNING(ErrorInitializeAffinity);
-                }
-                __kmp_affinity_type = affinity_none;
-                KMP_AFFINITY_DISABLE();
-                return;
-            }
+  // Create the "full" mask - this defines all of the processors that we
+  // consider to be in the machine model. If respect is set, then it is the
+  // initialization thread's affinity mask. Otherwise, it is all processors that
+  // we know about on the machine.
+  if (__kmp_affin_fullMask == NULL) {
+    KMP_CPU_ALLOC(__kmp_affin_fullMask);
+  }
+  if (KMP_AFFINITY_CAPABLE()) {
+    if (__kmp_affinity_respect_mask) {
+      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
+
+      // Count the number of available processors.
+      unsigned i;
+      __kmp_avail_proc = 0;
+      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
+        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
+          continue;
         }
-        else {
-            __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
-            __kmp_avail_proc = __kmp_xproc;
+        __kmp_avail_proc++;
+      }
+      if (__kmp_avail_proc > __kmp_xproc) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none))) {
+          KMP_WARNING(ErrorInitializeAffinity);
         }
+        __kmp_affinity_type = affinity_none;
+        KMP_AFFINITY_DISABLE();
+        return;
+      }
+    } else {
+      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
+      __kmp_avail_proc = __kmp_xproc;
     }
+  }
 
-    int depth = -1;
-    kmp_i18n_id_t msg_id = kmp_i18n_null;
+  int depth = -1;
+  kmp_i18n_id_t msg_id = kmp_i18n_null;
 
-    //
-    // For backward compatibility, setting KMP_CPUINFO_FILE =>
-    // KMP_TOPOLOGY_METHOD=cpuinfo
-    //
-    if ((__kmp_cpuinfo_file != NULL) &&
+  // For backward compatibility, setting KMP_CPUINFO_FILE =>
+  // KMP_TOPOLOGY_METHOD=cpuinfo
+  if ((__kmp_cpuinfo_file != NULL) &&
       (__kmp_affinity_top_method == affinity_top_method_all)) {
-        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
-    }
-
-    if (__kmp_affinity_top_method == affinity_top_method_all) {
-        //
-        // In the default code path, errors are not fatal - we just try using
-        // another method.  We only emit a warning message if affinity is on,
-        // or the verbose flag is set, an the nowarnings flag was not set.
-        //
-        const char *file_name = NULL;
-        int line = 0;
-# if KMP_USE_HWLOC
-        if (depth < 0 && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
-            if (__kmp_affinity_verbose) {
-                KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
-            }
-            if(!__kmp_hwloc_error) {
-                depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
-                if (depth == 0) {
-                    KMP_EXIT_AFF_NONE;
-                } else if(depth < 0 && __kmp_affinity_verbose) {
-                    KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
-                }
-            } else if(__kmp_affinity_verbose) {
-                KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
-            }
-        }
-# endif
-
-# if KMP_ARCH_X86 || KMP_ARCH_X86_64
-
-        if (depth < 0) {
-            if (__kmp_affinity_verbose) {
-                KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
-            }
-
-            file_name = NULL;
-            depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
-            if (depth == 0) {
-                KMP_EXIT_AFF_NONE;
-            }
-
-            if (depth < 0) {
-                if (__kmp_affinity_verbose) {
-                    if (msg_id != kmp_i18n_null) {
-                        KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
-                          KMP_I18N_STR(DecodingLegacyAPIC));
-                    }
-                    else {
-                        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
-                    }
-                }
-
-                file_name = NULL;
-                depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
-                if (depth == 0) {
-                    KMP_EXIT_AFF_NONE;
-                }
-            }
-        }
-
-# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
-
-# if KMP_OS_LINUX
-
-        if (depth < 0) {
-            if (__kmp_affinity_verbose) {
-                if (msg_id != kmp_i18n_null) {
-                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
-                }
-                else {
-                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
-                }
-            }
-
-            FILE *f = fopen("/proc/cpuinfo", "r");
-            if (f == NULL) {
-                msg_id = kmp_i18n_str_CantOpenCpuinfo;
-            }
-            else {
-                file_name = "/proc/cpuinfo";
-                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
-                fclose(f);
-                if (depth == 0) {
-                    KMP_EXIT_AFF_NONE;
-                }
-            }
-        }
-
-# endif /* KMP_OS_LINUX */
-
-# if KMP_GROUP_AFFINITY
-
-        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
-            if (__kmp_affinity_verbose) {
-                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
-            }
-
-            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
-            KMP_ASSERT(depth != 0);
-        }
-
-# endif /* KMP_GROUP_AFFINITY */
-
-        if (depth < 0) {
-            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
-                if (file_name == NULL) {
-                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
-                }
-                else if (line == 0) {
-                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
-                }
-                else {
-                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
-                }
-            }
-            // FIXME - print msg if msg_id = kmp_i18n_null ???
+    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
+  }
 
-            file_name = "";
-            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
-            if (depth == 0) {
-                KMP_EXIT_AFF_NONE;
-            }
-            KMP_ASSERT(depth > 0);
-            KMP_ASSERT(address2os != NULL);
+  if (__kmp_affinity_top_method == affinity_top_method_all) {
+    // In the default code path, errors are not fatal - we just try using
+    // another method. We only emit a warning message if affinity is on, or the
+    // verbose flag is set, and the nowarnings flag was not set.
+    const char *file_name = NULL;
+    int line = 0;
+#if KMP_USE_HWLOC
+    if (depth < 0 &&
+        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+      }
+      if (!__kmp_hwloc_error) {
+        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+        if (depth == 0) {
+          KMP_EXIT_AFF_NONE;
+        } else if (depth < 0 && __kmp_affinity_verbose) {
+          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
         }
+      } else if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+      }
     }
+#endif
 
-    //
-    // If the user has specified that a paricular topology discovery method
-    // is to be used, then we abort if that method fails.  The exception is
-    // group affinity, which might have been implicitly set.
-    //
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
 
-# if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    if (depth < 0) {
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+      }
 
-    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
-              KMP_I18N_STR(Decodingx2APIC));
-        }
+      file_name = NULL;
+      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
+      if (depth == 0) {
+        KMP_EXIT_AFF_NONE;
+      }
 
-        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
-        if (depth == 0) {
-            KMP_EXIT_AFF_NONE;
-        }
-        if (depth < 0) {
-            KMP_ASSERT(msg_id != kmp_i18n_null);
-            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
-        }
-    }
-    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
+      if (depth < 0) {
         if (__kmp_affinity_verbose) {
+          if (msg_id != kmp_i18n_null) {
+            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
+                       __kmp_i18n_catgets(msg_id),
+                       KMP_I18N_STR(DecodingLegacyAPIC));
+          } else {
             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
-              KMP_I18N_STR(DecodingLegacyAPIC));
+                       KMP_I18N_STR(DecodingLegacyAPIC));
+          }
         }
 
+        file_name = NULL;
         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
         if (depth == 0) {
-            KMP_EXIT_AFF_NONE;
-        }
-        if (depth < 0) {
-            KMP_ASSERT(msg_id != kmp_i18n_null);
-            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+          KMP_EXIT_AFF_NONE;
         }
+      }
     }
 
-# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
-    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
-        const char *filename;
-        if (__kmp_cpuinfo_file != NULL) {
-            filename = __kmp_cpuinfo_file;
-        }
-        else {
-            filename = "/proc/cpuinfo";
-        }
+#if KMP_OS_LINUX
 
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
+    if (depth < 0) {
+      if (__kmp_affinity_verbose) {
+        if (msg_id != kmp_i18n_null) {
+          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
+                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
+        } else {
+          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
         }
+      }
 
-        FILE *f = fopen(filename, "r");
-        if (f == NULL) {
-            int code = errno;
-            if (__kmp_cpuinfo_file != NULL) {
-                __kmp_msg(
-                    kmp_ms_fatal,
-                    KMP_MSG(CantOpenFileForReading, filename),
-                    KMP_ERR(code),
-                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
-                    __kmp_msg_null
-                );
-            }
-            else {
-                __kmp_msg(
-                    kmp_ms_fatal,
-                    KMP_MSG(CantOpenFileForReading, filename),
-                    KMP_ERR(code),
-                    __kmp_msg_null
-                );
-            }
-        }
-        int line = 0;
-        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
+      FILE *f = fopen("/proc/cpuinfo", "r");
+      if (f == NULL) {
+        msg_id = kmp_i18n_str_CantOpenCpuinfo;
+      } else {
+        file_name = "/proc/cpuinfo";
+        depth =
+            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
         fclose(f);
-        if (depth < 0) {
-            KMP_ASSERT(msg_id != kmp_i18n_null);
-            if (line > 0) {
-                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
-            }
-            else {
-                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
-            }
-        }
-        if (__kmp_affinity_type == affinity_none) {
-            KMP_ASSERT(depth == 0);
-            KMP_EXIT_AFF_NONE;
-        }
-    }
-
-# if KMP_GROUP_AFFINITY
-
-    else if (__kmp_affinity_top_method == affinity_top_method_group) {
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
-        }
-
-        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
-        KMP_ASSERT(depth != 0);
-        if (depth < 0) {
-            KMP_ASSERT(msg_id != kmp_i18n_null);
-            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
-        }
-    }
-
-# endif /* KMP_GROUP_AFFINITY */
-
-    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
-        }
-
-        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
         if (depth == 0) {
-            KMP_EXIT_AFF_NONE;
+          KMP_EXIT_AFF_NONE;
         }
-        // should not fail
-        KMP_ASSERT(depth > 0);
-        KMP_ASSERT(address2os != NULL);
+      }
     }
 
-# if KMP_USE_HWLOC
-    else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
-        KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
-        if (__kmp_affinity_verbose) {
-            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
-        }
-        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
-        if (depth == 0) {
-            KMP_EXIT_AFF_NONE;
-        }
-    }
-# endif // KMP_USE_HWLOC
+#endif /* KMP_OS_LINUX */
 
-    if (address2os == NULL) {
-        if (KMP_AFFINITY_CAPABLE()
-          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
-          && (__kmp_affinity_type != affinity_none)))) {
-            KMP_WARNING(ErrorInitializeAffinity);
-        }
-        __kmp_affinity_type = affinity_none;
-        KMP_AFFINITY_DISABLE();
-        return;
-    }
+#if KMP_GROUP_AFFINITY
 
-    __kmp_apply_thread_places(&address2os, depth);
+    if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+      }
 
-    //
-    // Create the table of masks, indexed by thread Id.
-    //
-    unsigned maxIndex;
-    unsigned numUnique;
-    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
-      address2os, __kmp_avail_proc);
-    if (__kmp_affinity_gran_levels == 0) {
-        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
+      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
+      KMP_ASSERT(depth != 0);
     }
 
-    //
-    // Set the childNums vector in all Address objects.  This must be done
-    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
-    // which takes into account the setting of __kmp_affinity_compact.
-    //
-    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
-
-    switch (__kmp_affinity_type) {
+#endif /* KMP_GROUP_AFFINITY */
 
-        case affinity_explicit:
-        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
-# if OMP_40_ENABLED
-        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
-# endif
-        {
-            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
-              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
-              maxIndex);
-        }
-# if OMP_40_ENABLED
-        else {
-            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
-              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
-              maxIndex);
-        }
-# endif
-        if (__kmp_affinity_num_masks == 0) {
-            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
-              && (__kmp_affinity_type != affinity_none))) {
-                KMP_WARNING(AffNoValidProcID);
-            }
-            __kmp_affinity_type = affinity_none;
-            return;
-        }
-        break;
-
-        //
-        // The other affinity types rely on sorting the Addresses according
-        // to some permutation of the machine topology tree.  Set
-        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
-        // then jump to a common code fragment to do the sort and create
-        // the array of affinity masks.
-        //
-
-        case affinity_logical:
-        __kmp_affinity_compact = 0;
-        if (__kmp_affinity_offset) {
-            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
-              % __kmp_avail_proc;
-        }
-        goto sortAddresses;
-
-        case affinity_physical:
-        if (__kmp_nThreadsPerCore > 1) {
-            __kmp_affinity_compact = 1;
-            if (__kmp_affinity_compact >= depth) {
-                __kmp_affinity_compact = 0;
-            }
+    if (depth < 0) {
+      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
+        if (file_name == NULL) {
+          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
+        } else if (line == 0) {
+          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
         } else {
-            __kmp_affinity_compact = 0;
-        }
-        if (__kmp_affinity_offset) {
-            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
-              % __kmp_avail_proc;
-        }
-        goto sortAddresses;
-
-        case affinity_scatter:
-        if (__kmp_affinity_compact >= depth) {
-            __kmp_affinity_compact = 0;
-        }
-        else {
-            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
-        }
-        goto sortAddresses;
-
-        case affinity_compact:
-        if (__kmp_affinity_compact >= depth) {
-            __kmp_affinity_compact = depth - 1;
-        }
-        goto sortAddresses;
-
-        case affinity_balanced:
-        if( depth <= 1 ) {
-            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
-                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
-            }
-            __kmp_affinity_type = affinity_none;
-            return;
-        } else if( __kmp_affinity_uniform_topology() ) {
-            break;
-        } else { // Non-uniform topology
-
-            // Save the depth for further usage
-            __kmp_aff_depth = depth;
-
-            int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, depth - 1);
-            int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level);
-            int maxprocpercore = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, depth - 1, core_level);
-
-            int nproc = ncores * maxprocpercore;
-            if( ( nproc < 2 ) || ( nproc < __kmp_avail_proc ) ) {
-                if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
-                    KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
-                }
-                __kmp_affinity_type = affinity_none;
-                return;
-            }
-
-            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
-            for( int i = 0; i < nproc; i++ ) {
-                procarr[ i ] = -1;
-            }
-
-            int lastcore = -1;
-            int inlastcore = 0;
-            for( int i = 0; i < __kmp_avail_proc; i++ ) {
-                int proc = address2os[ i ].second;
-                int core = __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
-
-                if ( core == lastcore ) {
-                    inlastcore++;
-                } else {
-                    inlastcore = 0;
-                }
-                lastcore = core;
-
-                procarr[ core * maxprocpercore + inlastcore ] = proc;
-            }
-
-            break;
-        }
-
-        sortAddresses:
-        //
-        // Allocate the gtid->affinity mask table.
-        //
-        if (__kmp_affinity_dups) {
-            __kmp_affinity_num_masks = __kmp_avail_proc;
-        }
-        else {
-            __kmp_affinity_num_masks = numUnique;
-        }
-
-# if OMP_40_ENABLED
-        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
-          && ( __kmp_affinity_num_places > 0 )
-          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
-            __kmp_affinity_num_masks = __kmp_affinity_num_places;
-        }
-# endif
-
-        KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
-
-        //
-        // Sort the address2os table according to the current setting of
-        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
-        //
-        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
-          __kmp_affinity_cmp_Address_child_num);
-        {
-            int i;
-            unsigned j;
-            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
-                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
-                    continue;
-                }
-                unsigned osId = address2os[i].second;
-                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
-                kmp_affin_mask_t *dest
-                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
-                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
-                KMP_CPU_COPY(dest, src);
-                if (++j >= __kmp_affinity_num_masks) {
-                    break;
-                }
-            }
-            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
-        }
-        break;
-
-        default:
-        KMP_ASSERT2(0, "Unexpected affinity setting");
-    }
-
-    KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex+1);
-    machine_hierarchy.init(address2os, __kmp_avail_proc);
-}
-#undef KMP_EXIT_AFF_NONE
-
-
-void
-__kmp_affinity_initialize(void)
-{
-    //
-    // Much of the code above was written assumming that if a machine was not
-    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
-    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
-    //
-    // There are too many checks for __kmp_affinity_type == affinity_none
-    // in this code.  Instead of trying to change them all, check if
-    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
-    // affinity_none, call the real initialization routine, then restore
-    // __kmp_affinity_type to affinity_disabled.
-    //
-    int disabled = (__kmp_affinity_type == affinity_disabled);
-    if (! KMP_AFFINITY_CAPABLE()) {
-        KMP_ASSERT(disabled);
-    }
-    if (disabled) {
-        __kmp_affinity_type = affinity_none;
-    }
-    __kmp_aux_affinity_initialize();
-    if (disabled) {
-        __kmp_affinity_type = affinity_disabled;
-    }
-}
-
-
-void
-__kmp_affinity_uninitialize(void)
-{
-    if (__kmp_affinity_masks != NULL) {
-        KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
-        __kmp_affinity_masks = NULL;
-    }
-    if (__kmp_affin_fullMask != NULL) {
-        KMP_CPU_FREE(__kmp_affin_fullMask);
-        __kmp_affin_fullMask = NULL;
-    }
-    __kmp_affinity_num_masks = 0;
-    __kmp_affinity_type = affinity_default;
-# if OMP_40_ENABLED
-    __kmp_affinity_num_places = 0;
-# endif
-    if (__kmp_affinity_proclist != NULL) {
-        __kmp_free(__kmp_affinity_proclist);
-        __kmp_affinity_proclist = NULL;
-    }
-    if( address2os != NULL ) {
-        __kmp_free( address2os );
-        address2os = NULL;
-    }
-    if( procarr != NULL ) {
-        __kmp_free( procarr );
-        procarr = NULL;
-    }
-# if KMP_USE_HWLOC
-    if (__kmp_hwloc_topology != NULL) {
-        hwloc_topology_destroy(__kmp_hwloc_topology);
-        __kmp_hwloc_topology = NULL;
-    }
-# endif
-    KMPAffinity::destroy_api();
-}
-
-
-void
-__kmp_affinity_set_init_mask(int gtid, int isa_root)
-{
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return;
-    }
-
-    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
-    if (th->th.th_affin_mask == NULL) {
-        KMP_CPU_ALLOC(th->th.th_affin_mask);
-    }
-    else {
-        KMP_CPU_ZERO(th->th.th_affin_mask);
-    }
-
-    //
-    // Copy the thread mask to the kmp_info_t strucuture.
-    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
-    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
-    // is set, then the full mask is the same as the mask of the initialization
-    // thread.
-    //
-    kmp_affin_mask_t *mask;
-    int i;
-
-# if OMP_40_ENABLED
-    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
-# endif
-    {
-        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
-          ) {
-# if KMP_GROUP_AFFINITY
-            if (__kmp_num_proc_groups > 1) {
-                return;
-            }
-# endif
-            KMP_ASSERT(__kmp_affin_fullMask != NULL);
-            i = KMP_PLACE_ALL;
-            mask = __kmp_affin_fullMask;
-        }
-        else {
-            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
-            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
-            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
+                     __kmp_i18n_catgets(msg_id));
         }
+      }
+      // FIXME - print msg if msg_id = kmp_i18n_null ???
+
+      file_name = "";
+      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
+      if (depth == 0) {
+        KMP_EXIT_AFF_NONE;
+      }
+      KMP_ASSERT(depth > 0);
+      KMP_ASSERT(address2os != NULL);
     }
-# if OMP_40_ENABLED
-    else {
-        if ((! isa_root)
-          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
-#  if KMP_GROUP_AFFINITY
-            if (__kmp_num_proc_groups > 1) {
-                return;
-            }
-#  endif
-            KMP_ASSERT(__kmp_affin_fullMask != NULL);
-            i = KMP_PLACE_ALL;
-            mask = __kmp_affin_fullMask;
-        }
-        else {
-            //
-            // int i = some hash function or just a counter that doesn't
-            // always start at 0.  Use gtid for now.
-            //
-            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
-            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
-            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
-        }
-    }
-# endif
-
-# if OMP_40_ENABLED
-    th->th.th_current_place = i;
-    if (isa_root) {
-        th->th.th_new_place = i;
-        th->th.th_first_place = 0;
-        th->th.th_last_place = __kmp_affinity_num_masks - 1;
-    }
-
-    if (i == KMP_PLACE_ALL) {
-        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
-          gtid));
-    }
-    else {
-        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
-          gtid, i));
-    }
-# else
-    if (i == -1) {
-        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
-          gtid));
-    }
-    else {
-        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
-          gtid, i));
-    }
-# endif /* OMP_40_ENABLED */
+  }
 
-    KMP_CPU_COPY(th->th.th_affin_mask, mask);
+// If the user has specified that a particular topology discovery method is
+// to be used, then we abort if that method fails. The exception is group
+// affinity, which might have been implicitly set.
 
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
     if (__kmp_affinity_verbose) {
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          th->th.th_affin_mask);
-        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),__kmp_gettid(), gtid, buf);
+      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
     }
 
-# if KMP_OS_WINDOWS
-    //
-    // On Windows* OS, the process affinity mask might have changed.
-    // If the user didn't request affinity and this call fails,
-    // just continue silently.  See CQ171393.
-    //
-    if ( __kmp_affinity_type == affinity_none ) {
-        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
+    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+    }
+  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
     }
-    else
-# endif
-    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
-}
 
+    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+    }
+  }
 
-# if OMP_40_ENABLED
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
-void
-__kmp_affinity_set_place(int gtid)
-{
-    int retval;
+  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
+    const char *filename;
+    if (__kmp_cpuinfo_file != NULL) {
+      filename = __kmp_cpuinfo_file;
+    } else {
+      filename = "/proc/cpuinfo";
+    }
 
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
     }
 
-    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+    FILE *f = fopen(filename, "r");
+    if (f == NULL) {
+      int code = errno;
+      if (__kmp_cpuinfo_file != NULL) {
+        __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
+                  KMP_ERR(code), KMP_HNT(NameComesFrom_CPUINFO_FILE),
+                  __kmp_msg_null);
+      } else {
+        __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
+                  KMP_ERR(code), __kmp_msg_null);
+      }
+    }
+    int line = 0;
+    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
+    fclose(f);
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      if (line > 0) {
+        KMP_FATAL(FileLineMsgExiting, filename, line,
+                  __kmp_i18n_catgets(msg_id));
+      } else {
+        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
+      }
+    }
+    if (__kmp_affinity_type == affinity_none) {
+      KMP_ASSERT(depth == 0);
+      KMP_EXIT_AFF_NONE;
+    }
+  }
 
-    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
-      gtid, th->th.th_new_place, th->th.th_current_place));
+#if KMP_GROUP_AFFINITY
 
-    //
-    // Check that the new place is within this thread's partition.
-    //
-    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
-    KMP_ASSERT(th->th.th_new_place >= 0);
-    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
-    if (th->th.th_first_place <= th->th.th_last_place) {
-        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
-         && (th->th.th_new_place <= th->th.th_last_place));
+  else if (__kmp_affinity_top_method == affinity_top_method_group) {
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
     }
-    else {
-        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
-         || (th->th.th_new_place >= th->th.th_last_place));
+
+    depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
+    KMP_ASSERT(depth != 0);
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
     }
+  }
 
-    //
-    // Copy the thread mask to the kmp_info_t strucuture,
-    // and set this thread's affinity.
-    //
-    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
-      th->th.th_new_place);
-    KMP_CPU_COPY(th->th.th_affin_mask, mask);
-    th->th.th_current_place = th->th.th_new_place;
+#endif /* KMP_GROUP_AFFINITY */
 
+  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
     if (__kmp_affinity_verbose) {
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          th->th.th_affin_mask);
-        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
+      KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
     }
-    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
-}
-
-# endif /* OMP_40_ENABLED */
 
+    depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+    // should not fail
+    KMP_ASSERT(depth > 0);
+    KMP_ASSERT(address2os != NULL);
+  }
 
-int
-__kmp_aux_set_affinity(void **mask)
-{
-    int gtid;
-    kmp_info_t *th;
-    int retval;
+#if KMP_USE_HWLOC
+  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+    }
+    depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+  }
+#endif // KMP_USE_HWLOC
 
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return -1;
+  if (address2os == NULL) {
+    if (KMP_AFFINITY_CAPABLE() &&
+        (__kmp_affinity_verbose ||
+         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
+      KMP_WARNING(ErrorInitializeAffinity);
     }
+    __kmp_affinity_type = affinity_none;
+    KMP_AFFINITY_DISABLE();
+    return;
+  }
 
-    gtid = __kmp_entry_gtid();
-    KA_TRACE(1000, ;{
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          (kmp_affin_mask_t *)(*mask));
-        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
-          gtid, buf);
-    });
-
-    if (__kmp_env_consistency_check) {
-        if ((mask == NULL) || (*mask == NULL)) {
-            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
-        }
-        else {
-            unsigned proc;
-            int num_procs = 0;
-
-            KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
-                if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
-                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
-                }
-                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
-                    continue;
-                }
-                num_procs++;
-            }
-            if (num_procs == 0) {
-                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
-            }
+  __kmp_apply_thread_places(&address2os, depth);
 
-# if KMP_GROUP_AFFINITY
-            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
-                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
-            }
-# endif /* KMP_GROUP_AFFINITY */
+  // Create the table of masks, indexed by thread Id.
+  unsigned maxIndex;
+  unsigned numUnique;
+  kmp_affin_mask_t *osId2Mask =
+      __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
+  if (__kmp_affinity_gran_levels == 0) {
+    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
+  }
+
+  // Set the childNums vector in all Address objects. This must be done before
+  // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
+  // account the setting of __kmp_affinity_compact.
+  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
+
+  switch (__kmp_affinity_type) {
+
+  case affinity_explicit:
+    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
+#if OMP_40_ENABLED
+    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
+#endif
+    {
+      __kmp_affinity_process_proclist(
+          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
+          __kmp_affinity_proclist, osId2Mask, maxIndex);
+    }
+#if OMP_40_ENABLED
+    else {
+      __kmp_affinity_process_placelist(
+          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
+          __kmp_affinity_proclist, osId2Mask, maxIndex);
+    }
+#endif
+    if (__kmp_affinity_num_masks == 0) {
+      if (__kmp_affinity_verbose ||
+          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
+        KMP_WARNING(AffNoValidProcID);
+      }
+      __kmp_affinity_type = affinity_none;
+      return;
+    }
+    break;
 
-        }
+  // The other affinity types rely on sorting the Addresses according to some
+  // permutation of the machine topology tree. Set __kmp_affinity_compact and
+  // __kmp_affinity_offset appropriately, then jump to a common code fragment
+  // to do the sort and create the array of affinity masks.
+
+  case affinity_logical:
+    __kmp_affinity_compact = 0;
+    if (__kmp_affinity_offset) {
+      __kmp_affinity_offset =
+          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+    }
+    goto sortAddresses;
+
+  case affinity_physical:
+    if (__kmp_nThreadsPerCore > 1) {
+      __kmp_affinity_compact = 1;
+      if (__kmp_affinity_compact >= depth) {
+        __kmp_affinity_compact = 0;
+      }
+    } else {
+      __kmp_affinity_compact = 0;
+    }
+    if (__kmp_affinity_offset) {
+      __kmp_affinity_offset =
+          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
     }
+    goto sortAddresses;
 
-    th = __kmp_threads[gtid];
-    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
-    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
-    if (retval == 0) {
-        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
+  case affinity_scatter:
+    if (__kmp_affinity_compact >= depth) {
+      __kmp_affinity_compact = 0;
+    } else {
+      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
     }
+    goto sortAddresses;
 
-# if OMP_40_ENABLED
-    th->th.th_current_place = KMP_PLACE_UNDEFINED;
-    th->th.th_new_place = KMP_PLACE_UNDEFINED;
-    th->th.th_first_place = 0;
-    th->th.th_last_place = __kmp_affinity_num_masks - 1;
+  case affinity_compact:
+    if (__kmp_affinity_compact >= depth) {
+      __kmp_affinity_compact = depth - 1;
+    }
+    goto sortAddresses;
 
-    //
-    // Turn off 4.0 affinity for the current tread at this parallel level.
-    //
-    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
-# endif
+  case affinity_balanced:
+    if (depth <= 1) {
+      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
+        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
+      }
+      __kmp_affinity_type = affinity_none;
+      return;
+    } else if (__kmp_affinity_uniform_topology()) {
+      break;
+    } else { // Non-uniform topology
 
-    return retval;
-}
+      // Save the depth for further usage
+      __kmp_aff_depth = depth;
 
+      int core_level = __kmp_affinity_find_core_level(
+          address2os, __kmp_avail_proc, depth - 1);
+      int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
+                                                 depth - 1, core_level);
+      int maxprocpercore = __kmp_affinity_max_proc_per_core(
+          address2os, __kmp_avail_proc, depth - 1, core_level);
+
+      int nproc = ncores * maxprocpercore;
+      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
+        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
+          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
+        }
+        __kmp_affinity_type = affinity_none;
+        return;
+      }
 
-int
-__kmp_aux_get_affinity(void **mask)
-{
-    int gtid;
-    int retval;
-    kmp_info_t *th;
+      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
+      for (int i = 0; i < nproc; i++) {
+        procarr[i] = -1;
+      }
 
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return -1;
-    }
+      int lastcore = -1;
+      int inlastcore = 0;
+      for (int i = 0; i < __kmp_avail_proc; i++) {
+        int proc = address2os[i].second;
+        int core =
+            __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
 
-    gtid = __kmp_entry_gtid();
-    th = __kmp_threads[gtid];
-    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+        if (core == lastcore) {
+          inlastcore++;
+        } else {
+          inlastcore = 0;
+        }
+        lastcore = core;
 
-    KA_TRACE(1000, ;{
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          th->th.th_affin_mask);
-        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
-    });
+        procarr[core * maxprocpercore + inlastcore] = proc;
+      }
 
-    if (__kmp_env_consistency_check) {
-        if ((mask == NULL) || (*mask == NULL)) {
-            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
-        }
+      break;
     }
 
-# if !KMP_OS_WINDOWS
+  sortAddresses:
+    // Allocate the gtid->affinity mask table.
+    if (__kmp_affinity_dups) {
+      __kmp_affinity_num_masks = __kmp_avail_proc;
+    } else {
+      __kmp_affinity_num_masks = numUnique;
+    }
 
-    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
-    KA_TRACE(1000, ;{
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          (kmp_affin_mask_t *)(*mask));
-        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
-    });
-    return retval;
+#if OMP_40_ENABLED
+    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
+        (__kmp_affinity_num_places > 0) &&
+        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
+      __kmp_affinity_num_masks = __kmp_affinity_num_places;
+    }
+#endif
 
-# else
+    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
 
-    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
-    return 0;
+    // Sort the address2os table according to the current setting of
+    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
+    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
+          __kmp_affinity_cmp_Address_child_num);
+    {
+      int i;
+      unsigned j;
+      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
+        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
+          continue;
+        }
+        unsigned osId = address2os[i].second;
+        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
+        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
+        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
+        KMP_CPU_COPY(dest, src);
+        if (++j >= __kmp_affinity_num_masks) {
+          break;
+        }
+      }
+      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
+    }
+    break;
 
-# endif /* KMP_OS_WINDOWS */
+  default:
+    KMP_ASSERT2(0, "Unexpected affinity setting");
+  }
 
+  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
+  machine_hierarchy.init(address2os, __kmp_avail_proc);
 }
+#undef KMP_EXIT_AFF_NONE
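
For illustration, here is a minimal standalone sketch of the sortAddresses step above: sort by the chosen permutation, then keep either every entry (dups allowed) or only group leaders, copying at most __kmp_affinity_num_masks of them. The Entry type, pick_mask_os_ids name, and the comparator are simplified stand-ins, not the runtime's types.

#include <algorithm>
#include <vector>

struct Entry {
  bool leader;   // first entry of a topology-unique group after sorting
  unsigned osId; // OS processor id
};

// Keep either every entry (dups allowed) or only group leaders, up to num_masks.
std::vector<unsigned> pick_mask_os_ids(std::vector<Entry> entries, bool dups,
                                       unsigned num_masks) {
  // The runtime sorts by the compact/offset permutation first; any comparator
  // stands in for __kmp_affinity_cmp_Address_child_num here.
  std::sort(entries.begin(), entries.end(),
            [](const Entry &a, const Entry &b) { return a.osId < b.osId; });
  std::vector<unsigned> picked;
  for (const Entry &e : entries) {
    if (!dups && !e.leader)
      continue;               // skip hardware-thread duplicates of a core/package
    picked.push_back(e.osId); // stands in for KMP_CPU_COPY(dest, src)
    if (picked.size() >= num_masks)
      break;
  }
  return picked;
}
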
 
-int
-__kmp_aux_get_affinity_max_proc() {
-    if (!  KMP_AFFINITY_CAPABLE()) {
-        return 0;
-    }
-#if KMP_GROUP_AFFINITY
-    if ( __kmp_num_proc_groups > 1 ) {
-        return (int)(__kmp_num_proc_groups*sizeof(DWORD_PTR)*CHAR_BIT);
-    }
+void __kmp_affinity_initialize(void) {
+  // Much of the code above was written assuming that if a machine was not
+  // affinity capable, then __kmp_affinity_type == affinity_none.  We now
+  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
+  // There are too many checks for __kmp_affinity_type == affinity_none
+  // in this code.  Instead of trying to change them all, check if
+  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
+  // affinity_none, call the real initialization routine, then restore
+  // __kmp_affinity_type to affinity_disabled.
+  int disabled = (__kmp_affinity_type == affinity_disabled);
+  if (!KMP_AFFINITY_CAPABLE()) {
+    KMP_ASSERT(disabled);
+  }
+  if (disabled) {
+    __kmp_affinity_type = affinity_none;
+  }
+  __kmp_aux_affinity_initialize();
+  if (disabled) {
+    __kmp_affinity_type = affinity_disabled;
+  }
+}
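
The save/restore trick in __kmp_affinity_initialize, reduced to its essentials as a hypothetical sketch; the enum values and function names below are simplified stand-ins for the runtime's globals.

enum affinity_t { affinity_none, affinity_disabled, affinity_compact };

static affinity_t g_type = affinity_disabled;

static void real_initialize() { /* shared path only checks affinity_none */ }

void initialize_affinity() {
  const bool disabled = (g_type == affinity_disabled);
  if (disabled)
    g_type = affinity_none;     // let the many affinity_none checks fire
  real_initialize();
  if (disabled)
    g_type = affinity_disabled; // restore the externally visible state
}
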
+
+void __kmp_affinity_uninitialize(void) {
+  if (__kmp_affinity_masks != NULL) {
+    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
+    __kmp_affinity_masks = NULL;
+  }
+  if (__kmp_affin_fullMask != NULL) {
+    KMP_CPU_FREE(__kmp_affin_fullMask);
+    __kmp_affin_fullMask = NULL;
+  }
+  __kmp_affinity_num_masks = 0;
+  __kmp_affinity_type = affinity_default;
+#if OMP_40_ENABLED
+  __kmp_affinity_num_places = 0;
+#endif
+  if (__kmp_affinity_proclist != NULL) {
+    __kmp_free(__kmp_affinity_proclist);
+    __kmp_affinity_proclist = NULL;
+  }
+  if (address2os != NULL) {
+    __kmp_free(address2os);
+    address2os = NULL;
+  }
+  if (procarr != NULL) {
+    __kmp_free(procarr);
+    procarr = NULL;
+  }
+#if KMP_USE_HWLOC
+  if (__kmp_hwloc_topology != NULL) {
+    hwloc_topology_destroy(__kmp_hwloc_topology);
+    __kmp_hwloc_topology = NULL;
+  }
 #endif
-    return __kmp_xproc;
+  KMPAffinity::destroy_api();
 }
 
-int
-__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
-{
-    int retval;
-
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return -1;
-    }
+void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return;
+  }
 
-    KA_TRACE(1000, ;{
-        int gtid = __kmp_entry_gtid();
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          (kmp_affin_mask_t *)(*mask));
-        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
-          proc, gtid, buf);
-    });
+  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+  if (th->th.th_affin_mask == NULL) {
+    KMP_CPU_ALLOC(th->th.th_affin_mask);
+  } else {
+    KMP_CPU_ZERO(th->th.th_affin_mask);
+  }
+
+  // Copy the thread mask to the kmp_info_t structure. If
+  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
+  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
+  // then the full mask is the same as the mask of the initialization thread.
+  kmp_affin_mask_t *mask;
+  int i;
 
-    if (__kmp_env_consistency_check) {
-        if ((mask == NULL) || (*mask == NULL)) {
-            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
-        }
+#if OMP_40_ENABLED
+  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
+#endif
+  {
+    if ((__kmp_affinity_type == affinity_none) ||
+        (__kmp_affinity_type == affinity_balanced)) {
+#if KMP_GROUP_AFFINITY
+      if (__kmp_num_proc_groups > 1) {
+        return;
+      }
+#endif
+      KMP_ASSERT(__kmp_affin_fullMask != NULL);
+      i = KMP_PLACE_ALL;
+      mask = __kmp_affin_fullMask;
+    } else {
+      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
+      i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+    }
+  }
+#if OMP_40_ENABLED
+  else {
+    if ((!isa_root) ||
+        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
+#if KMP_GROUP_AFFINITY
+      if (__kmp_num_proc_groups > 1) {
+        return;
+      }
+#endif
+      KMP_ASSERT(__kmp_affin_fullMask != NULL);
+      i = KMP_PLACE_ALL;
+      mask = __kmp_affin_fullMask;
+    } else {
+      // int i = some hash function or just a counter that doesn't
+      // always start at 0.  Use gtid for now.
+      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
+      i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
     }
+  }
+#endif
 
-    if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
-        return -1;
-    }
-    if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
-        return -2;
-    }
+#if OMP_40_ENABLED
+  th->th.th_current_place = i;
+  if (isa_root) {
+    th->th.th_new_place = i;
+    th->th.th_first_place = 0;
+    th->th.th_last_place = __kmp_affinity_num_masks - 1;
+  }
 
-    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
-    return 0;
+  if (i == KMP_PLACE_ALL) {
+    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
+                   gtid));
+  } else {
+    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
+                   gtid, i));
+  }
+#else
+  if (i == -1) {
+    KA_TRACE(
+        100,
+        ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
+         gtid));
+  } else {
+    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
+                   gtid, i));
+  }
+#endif /* OMP_40_ENABLED */
+
+  KMP_CPU_COPY(th->th.th_affin_mask, mask);
+
+  if (__kmp_affinity_verbose) {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              th->th.th_affin_mask);
+    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+               __kmp_gettid(), gtid, buf);
+  }
+
+#if KMP_OS_WINDOWS
+  // On Windows* OS, the process affinity mask might have changed. If the user
+  // didn't request affinity and this call fails, just continue silently.
+  // See CQ171393.
+  if (__kmp_affinity_type == affinity_none) {
+    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
+  } else
+#endif
+    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
 }
 
+#if OMP_40_ENABLED
 
-int
-__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
-{
-    int retval;
+void __kmp_affinity_set_place(int gtid) {
+  int retval;
 
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return -1;
-    }
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return;
+  }
 
-    KA_TRACE(1000, ;{
-        int gtid = __kmp_entry_gtid();
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          (kmp_affin_mask_t *)(*mask));
-        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
-          proc, gtid, buf);
-    });
+  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+
+  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
+                 "place = %d)\n",
+                 gtid, th->th.th_new_place, th->th.th_current_place));
+
+  // Check that the new place is within this thread's partition.
+  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+  KMP_ASSERT(th->th.th_new_place >= 0);
+  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
+  if (th->th.th_first_place <= th->th.th_last_place) {
+    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
+               (th->th.th_new_place <= th->th.th_last_place));
+  } else {
+    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
+               (th->th.th_new_place >= th->th.th_last_place));
+  }
+
+  // Copy the thread mask to the kmp_info_t structure,
+  // and set this thread's affinity.
+  kmp_affin_mask_t *mask =
+      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
+  KMP_CPU_COPY(th->th.th_affin_mask, mask);
+  th->th.th_current_place = th->th.th_new_place;
+
+  if (__kmp_affinity_verbose) {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              th->th.th_affin_mask);
+    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
+               __kmp_gettid(), gtid, buf);
+  }
+  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+}
+
+#endif /* OMP_40_ENABLED */
+
+int __kmp_aux_set_affinity(void **mask) {
+  int gtid;
+  kmp_info_t *th;
+  int retval;
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  gtid = __kmp_entry_gtid();
+  KA_TRACE(1000, ; {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf(
+        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
+        buf);
+  });
+
+  if (__kmp_env_consistency_check) {
+    if ((mask == NULL) || (*mask == NULL)) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+    } else {
+      unsigned proc;
+      int num_procs = 0;
 
-    if (__kmp_env_consistency_check) {
-        if ((mask == NULL) || (*mask == NULL)) {
-            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
+      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
+        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
         }
-    }
+        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
+          continue;
+        }
+        num_procs++;
+      }
+      if (num_procs == 0) {
+        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+      }
 
-    if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
-        return -1;
-    }
-    if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
-        return -2;
+#if KMP_GROUP_AFFINITY
+      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
+        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+      }
+#endif /* KMP_GROUP_AFFINITY */
     }
+  }
 
-    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
-    return 0;
-}
+  th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
+  if (retval == 0) {
+    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
+  }
+
+#if OMP_40_ENABLED
+  th->th.th_current_place = KMP_PLACE_UNDEFINED;
+  th->th.th_new_place = KMP_PLACE_UNDEFINED;
+  th->th.th_first_place = 0;
+  th->th.th_last_place = __kmp_affinity_num_masks - 1;
 
+  // Turn off 4.0 affinity for the current thread at this parallel level.
+  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
+#endif
 
-int
-__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
-{
-    int retval;
+  return retval;
+}
 
-    if (! KMP_AFFINITY_CAPABLE()) {
-        return -1;
-    }
+int __kmp_aux_get_affinity(void **mask) {
+  int gtid;
+  int retval;
+  kmp_info_t *th;
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  gtid = __kmp_entry_gtid();
+  th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+
+  KA_TRACE(1000, ; {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              th->th.th_affin_mask);
+    __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
+                 gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check) {
+    if ((mask == NULL) || (*mask == NULL)) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
+    }
+  }
+
+#if !KMP_OS_WINDOWS
+
+  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
+  KA_TRACE(1000, ; {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
+                 gtid, buf);
+  });
+  return retval;
 
-    KA_TRACE(1000, ;{
-        int gtid = __kmp_entry_gtid();
-        char buf[KMP_AFFIN_MASK_PRINT_LEN];
-        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
-          (kmp_affin_mask_t *)(*mask));
-        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
-          proc, gtid, buf);
-    });
+#else
 
-    if (__kmp_env_consistency_check) {
-        if ((mask == NULL) || (*mask == NULL)) {
-            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
-        }
-    }
+  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
+  return 0;
 
-    if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
-        return -1;
-    }
-    if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
-        return 0;
-    }
+#endif /* KMP_OS_WINDOWS */
+}
 
-    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
+int __kmp_aux_get_affinity_max_proc() {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return 0;
+  }
+#if KMP_GROUP_AFFINITY
+  if (__kmp_num_proc_groups > 1) {
+    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
+  }
+#endif
+  return __kmp_xproc;
 }
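
On Windows with multiple processor groups, the bound returned above is __kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT. A small illustrative calculation, assuming a 64-bit build and two groups (the numbers are not from a real machine):

#include <cstdio>

int main() {
  const int num_proc_groups = 2;                        // e.g. a 2-group Windows box
  const int bits_per_group = 8 /*sizeof(DWORD_PTR)*/ * 8 /*CHAR_BIT*/;
  std::printf("proc id space = %d\n", num_proc_groups * bits_per_group); // 128
  return 0;
}
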
 
+int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
+  int retval;
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  KA_TRACE(1000, ; {
+    int gtid = __kmp_entry_gtid();
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
+                       "affinity mask for thread %d = %s\n",
+                       proc, gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check) {
+    if ((mask == NULL) || (*mask == NULL)) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
+    }
+  }
+
+  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
+    return -1;
+  }
+  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+    return -2;
+  }
+
+  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
+  return 0;
+}
+
+int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
+  int retval;
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  KA_TRACE(1000, ; {
+    int gtid = __kmp_entry_gtid();
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
+                       "affinity mask for thread %d = %s\n",
+                       proc, gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check) {
+    if ((mask == NULL) || (*mask == NULL)) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
+    }
+  }
+
+  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
+    return -1;
+  }
+  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+    return -2;
+  }
+
+  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
+  return 0;
+}
+
+int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
+  int retval;
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  KA_TRACE(1000, ; {
+    int gtid = __kmp_entry_gtid();
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
+                       "affinity mask for thread %d = %s\n",
+                       proc, gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check) {
+    if ((mask == NULL) || (*mask == NULL)) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
+    }
+  }
+
+  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
+    return -1;
+  }
+  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+    return 0;
+  }
+
+  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
+}
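
These __kmp_aux_* helpers sit behind the kmp_* affinity entry points named in the trace strings above (kmp_set_affinity, kmp_set_affinity_mask_proc, and so on). A hedged usage sketch, assuming the Intel-extension prototypes declared in this runtime's omp.h; error handling mirrors the return codes above (-1 for an out-of-range proc, -2 for a proc not in the full mask):

#include <omp.h>
#include <cstdio>

int main() {
  kmp_affinity_mask_t mask;
  kmp_create_affinity_mask(&mask);
  if (kmp_set_affinity_mask_proc(0, &mask) != 0)  // add OS proc 0 to the mask
    std::fprintf(stderr, "proc 0 not available\n");
  else if (kmp_set_affinity(&mask) != 0)          // bind the calling thread
    std::fprintf(stderr, "kmp_set_affinity failed\n");
  kmp_destroy_affinity_mask(&mask);
  return 0;
}
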
 
 // Dynamic affinity settings - Affinity balanced
-void __kmp_balanced_affinity( int tid, int nthreads )
-{
-    bool fine_gran = true;
+void __kmp_balanced_affinity(int tid, int nthreads) {
+  bool fine_gran = true;
 
-    switch (__kmp_affinity_gran) {
-        case affinity_gran_fine:
-        case affinity_gran_thread:
-            break;
-        case affinity_gran_core:
-            if( __kmp_nThreadsPerCore > 1) {
-                fine_gran = false;
-            }
-            break;
-        case affinity_gran_package:
-            if( nCoresPerPkg > 1) {
-                fine_gran = false;
-            }
-            break;
-        default:
-            fine_gran = false;
+  switch (__kmp_affinity_gran) {
+  case affinity_gran_fine:
+  case affinity_gran_thread:
+    break;
+  case affinity_gran_core:
+    if (__kmp_nThreadsPerCore > 1) {
+      fine_gran = false;
+    }
+    break;
+  case affinity_gran_package:
+    if (nCoresPerPkg > 1) {
+      fine_gran = false;
+    }
+    break;
+  default:
+    fine_gran = false;
+  }
+
+  if (__kmp_affinity_uniform_topology()) {
+    int coreID;
+    int threadID;
+    // Number of hyper threads per core in HT machine
+    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
+    // Number of cores
+    int ncores = __kmp_ncores;
+    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
+      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
+      ncores = nPackages;
+    }
+    // How many threads will be bound to each core
+    int chunk = nthreads / ncores;
+    // How many cores will have an additional thread bound to it - "big cores"
+    int big_cores = nthreads % ncores;
+    // Number of threads on the big cores
+    int big_nth = (chunk + 1) * big_cores;
+    if (tid < big_nth) {
+      coreID = tid / (chunk + 1);
+      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
+    } else { // tid >= big_nth
+      coreID = (tid - big_cores) / chunk;
+      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
     }
 
-    if( __kmp_affinity_uniform_topology() ) {
-        int coreID;
-        int threadID;
-        // Number of hyper threads per core in HT machine
-        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
-        // Number of cores
-        int ncores = __kmp_ncores;
-        if( ( nPackages > 1 ) && ( __kmp_nth_per_core <= 1 ) ) {
-            __kmp_nth_per_core = __kmp_avail_proc / nPackages;
-            ncores = nPackages;
-        }
-        // How many threads will be bound to each core
-        int chunk = nthreads / ncores;
-        // How many cores will have an additional thread bound to it - "big cores"
-        int big_cores = nthreads % ncores;
-        // Number of threads on the big cores
-        int big_nth = ( chunk + 1 ) * big_cores;
-        if( tid < big_nth ) {
-            coreID = tid / (chunk + 1 );
-            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
-        } else { //tid >= big_nth
-            coreID = ( tid - big_cores ) / chunk;
-            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
-        }
-
-        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
-          "Illegal set affinity operation when not capable");
-
-        kmp_affin_mask_t *mask;
-        KMP_CPU_ALLOC_ON_STACK(mask);
-        KMP_CPU_ZERO(mask);
-
-        if( fine_gran ) {
-            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
-            KMP_CPU_SET( osID, mask);
-        } else {
-            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
-                int osID;
-                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
-                KMP_CPU_SET( osID, mask);
-            }
-        }
-        if (__kmp_affinity_verbose) {
-            char buf[KMP_AFFIN_MASK_PRINT_LEN];
-            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
-            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
-            __kmp_gettid(), tid, buf);
-        }
-        __kmp_set_system_affinity( mask, TRUE );
-        KMP_CPU_FREE_FROM_STACK(mask);
-    } else { // Non-uniform topology
+    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                      "Illegal set affinity operation when not capable");
 
-        kmp_affin_mask_t *mask;
-        KMP_CPU_ALLOC_ON_STACK(mask);
-        KMP_CPU_ZERO(mask);
-
-        int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
-        int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
-        int nth_per_core = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
-
-        // For performance gain consider the special case nthreads == __kmp_avail_proc
-        if( nthreads == __kmp_avail_proc ) {
-            if( fine_gran ) {
-                int osID = address2os[ tid ].second;
-                KMP_CPU_SET( osID, mask);
-            } else {
-                int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level);
-                for( int i = 0; i < __kmp_avail_proc; i++ ) {
-                    int osID = address2os[ i ].second;
-                    if( __kmp_affinity_find_core(address2os, i,  __kmp_aff_depth - 1, core_level) == core ) {
-                        KMP_CPU_SET( osID, mask);
-                    }
-                }
-            }
-        } else if( nthreads <= ncores ) {
+    kmp_affin_mask_t *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
+    KMP_CPU_ZERO(mask);
 
-            int core = 0;
-            for( int i = 0; i < ncores; i++ ) {
-                // Check if this core from procarr[] is in the mask
-                int in_mask = 0;
-                for( int j = 0; j < nth_per_core; j++ ) {
-                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
-                        in_mask = 1;
-                        break;
-                    }
-                }
-                if( in_mask ) {
-                    if( tid == core ) {
-                        for( int j = 0; j < nth_per_core; j++ ) {
-                            int osID = procarr[ i * nth_per_core + j ];
-                            if( osID != -1 ) {
-                                KMP_CPU_SET( osID, mask );
-                                // For fine granularity it is enough to set the first available osID for this core
-                                if( fine_gran) {
-                                    break;
-                                }
-                            }
-                        }
-                        break;
-                    } else {
-                        core++;
-                    }
-                }
-            }
+    if (fine_gran) {
+      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
+      KMP_CPU_SET(osID, mask);
+    } else {
+      for (int i = 0; i < __kmp_nth_per_core; i++) {
+        int osID;
+        osID = address2os[coreID * __kmp_nth_per_core + i].second;
+        KMP_CPU_SET(osID, mask);
+      }
+    }
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
+      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+                 __kmp_gettid(), tid, buf);
+    }
+    __kmp_set_system_affinity(mask, TRUE);
+    KMP_CPU_FREE_FROM_STACK(mask);
+  } else { // Non-uniform topology
 
-        } else { // nthreads > ncores
+    kmp_affin_mask_t *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
+    KMP_CPU_ZERO(mask);
 
-            // Array to save the number of processors at each core
-            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
-            // Array to save the number of cores with "x" available processors;
-            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
-            // Array to save the number of cores with # procs from x to nth_per_core
-            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
-
-            for( int i = 0; i <= nth_per_core; i++ ) {
-                ncores_with_x_procs[ i ] = 0;
-                ncores_with_x_to_max_procs[ i ] = 0;
-            }
+    int core_level = __kmp_affinity_find_core_level(
+        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
+    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
+                                               __kmp_aff_depth - 1, core_level);
+    int nth_per_core = __kmp_affinity_max_proc_per_core(
+        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
+
+    // For a performance gain, consider the special case nthreads ==
+    // __kmp_avail_proc
+    if (nthreads == __kmp_avail_proc) {
+      if (fine_gran) {
+        int osID = address2os[tid].second;
+        KMP_CPU_SET(osID, mask);
+      } else {
+        int core = __kmp_affinity_find_core(address2os, tid,
+                                            __kmp_aff_depth - 1, core_level);
+        for (int i = 0; i < __kmp_avail_proc; i++) {
+          int osID = address2os[i].second;
+          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
+                                       core_level) == core) {
+            KMP_CPU_SET(osID, mask);
+          }
+        }
+      }
+    } else if (nthreads <= ncores) {
 
-            for( int i = 0; i < ncores; i++ ) {
-                int cnt = 0;
-                for( int j = 0; j < nth_per_core; j++ ) {
-                    if( procarr[ i * nth_per_core + j ] != -1 ) {
-                        cnt++;
-                    }
+      int core = 0;
+      for (int i = 0; i < ncores; i++) {
+        // Check if this core from procarr[] is in the mask
+        int in_mask = 0;
+        for (int j = 0; j < nth_per_core; j++) {
+          if (procarr[i * nth_per_core + j] != -1) {
+            in_mask = 1;
+            break;
+          }
+        }
+        if (in_mask) {
+          if (tid == core) {
+            for (int j = 0; j < nth_per_core; j++) {
+              int osID = procarr[i * nth_per_core + j];
+              if (osID != -1) {
+                KMP_CPU_SET(osID, mask);
+                // For fine granularity it is enough to set the first available
+                // osID for this core
+                if (fine_gran) {
+                  break;
                 }
-                nproc_at_core[ i ] = cnt;
-                ncores_with_x_procs[ cnt ]++;
+              }
             }
+            break;
+          } else {
+            core++;
+          }
+        }
+      }
+    } else { // nthreads > ncores
+      // Array to save the number of processors at each core
+      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
+      // Array to save the number of cores with "x" available processors;
+      int *ncores_with_x_procs =
+          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
+      // Array to save the number of cores with # procs from x to nth_per_core
+      int *ncores_with_x_to_max_procs =
+          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
+
+      for (int i = 0; i <= nth_per_core; i++) {
+        ncores_with_x_procs[i] = 0;
+        ncores_with_x_to_max_procs[i] = 0;
+      }
 
-            for( int i = 0; i <= nth_per_core; i++ ) {
-                for( int j = i; j <= nth_per_core; j++ ) {
-                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
-                }
-            }
+      for (int i = 0; i < ncores; i++) {
+        int cnt = 0;
+        for (int j = 0; j < nth_per_core; j++) {
+          if (procarr[i * nth_per_core + j] != -1) {
+            cnt++;
+          }
+        }
+        nproc_at_core[i] = cnt;
+        ncores_with_x_procs[cnt]++;
+      }
 
-            // Max number of processors
-            int nproc = nth_per_core * ncores;
-            // An array to keep number of threads per each context
-            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
-            for( int i = 0; i < nproc; i++ ) {
-                newarr[ i ] = 0;
-            }
+      for (int i = 0; i <= nth_per_core; i++) {
+        for (int j = i; j <= nth_per_core; j++) {
+          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
+        }
+      }
 
-            int nth = nthreads;
-            int flag = 0;
-            while( nth > 0 ) {
-                for( int j = 1; j <= nth_per_core; j++ ) {
-                    int cnt = ncores_with_x_to_max_procs[ j ];
-                    for( int i = 0; i < ncores; i++ ) {
-                        // Skip the core with 0 processors
-                        if( nproc_at_core[ i ] == 0 ) {
-                            continue;
-                        }
-                        for( int k = 0; k < nth_per_core; k++ ) {
-                            if( procarr[ i * nth_per_core + k ] != -1 ) {
-                                if( newarr[ i * nth_per_core + k ] == 0 ) {
-                                    newarr[ i * nth_per_core + k ] = 1;
-                                    cnt--;
-                                    nth--;
-                                    break;
-                                } else {
-                                    if( flag != 0 ) {
-                                        newarr[ i * nth_per_core + k ] ++;
-                                        cnt--;
-                                        nth--;
-                                        break;
-                                    }
-                                }
-                            }
-                        }
-                        if( cnt == 0 || nth == 0 ) {
-                            break;
-                        }
-                    }
-                    if( nth == 0 ) {
-                        break;
-                    }
-                }
-                flag = 1;
-            }
-            int sum = 0;
-            for( int i = 0; i < nproc; i++ ) {
-                sum += newarr[ i ];
-                if( sum > tid ) {
-                    if( fine_gran) {
-                        int osID = procarr[ i ];
-                        KMP_CPU_SET( osID, mask);
-                    } else {
-                        int coreID = i / nth_per_core;
-                        for( int ii = 0; ii < nth_per_core; ii++ ) {
-                            int osID = procarr[ coreID * nth_per_core + ii ];
-                            if( osID != -1 ) {
-                                KMP_CPU_SET( osID, mask);
-                            }
-                        }
-                    }
+      // Max number of processors
+      int nproc = nth_per_core * ncores;
+      // An array to keep number of threads per each context
+      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
+      for (int i = 0; i < nproc; i++) {
+        newarr[i] = 0;
+      }
+
+      int nth = nthreads;
+      int flag = 0;
+      while (nth > 0) {
+        for (int j = 1; j <= nth_per_core; j++) {
+          int cnt = ncores_with_x_to_max_procs[j];
+          for (int i = 0; i < ncores; i++) {
+            // Skip the core with 0 processors
+            if (nproc_at_core[i] == 0) {
+              continue;
+            }
+            for (int k = 0; k < nth_per_core; k++) {
+              if (procarr[i * nth_per_core + k] != -1) {
+                if (newarr[i * nth_per_core + k] == 0) {
+                  newarr[i * nth_per_core + k] = 1;
+                  cnt--;
+                  nth--;
+                  break;
+                } else {
+                  if (flag != 0) {
+                    newarr[i * nth_per_core + k]++;
+                    cnt--;
+                    nth--;
                     break;
+                  }
                 }
+              }
+            }
+            if (cnt == 0 || nth == 0) {
+              break;
             }
-            __kmp_free( newarr );
+          }
+          if (nth == 0) {
+            break;
+          }
         }
-
-        if (__kmp_affinity_verbose) {
-            char buf[KMP_AFFIN_MASK_PRINT_LEN];
-            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
-            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
-            __kmp_gettid(), tid, buf);
+        flag = 1;
+      }
+      int sum = 0;
+      for (int i = 0; i < nproc; i++) {
+        sum += newarr[i];
+        if (sum > tid) {
+          if (fine_gran) {
+            int osID = procarr[i];
+            KMP_CPU_SET(osID, mask);
+          } else {
+            int coreID = i / nth_per_core;
+            for (int ii = 0; ii < nth_per_core; ii++) {
+              int osID = procarr[coreID * nth_per_core + ii];
+              if (osID != -1) {
+                KMP_CPU_SET(osID, mask);
+              }
+            }
+          }
+          break;
         }
-        __kmp_set_system_affinity( mask, TRUE );
-        KMP_CPU_FREE_FROM_STACK(mask);
+      }
+      __kmp_free(newarr);
     }
+
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
+      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+                 __kmp_gettid(), tid, buf);
+    }
+    __kmp_set_system_affinity(mask, TRUE);
+    KMP_CPU_FREE_FROM_STACK(mask);
+  }
 }
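
The uniform-topology branch above distributes threads with chunk = nthreads / ncores and gives the first nthreads % ncores cores ("big cores") one extra thread. A standalone sketch with made-up numbers (10 threads on 4 cores), omitting the threadID / hyper-thread part of the real code:

#include <cstdio>

int main() {
  const int nthreads = 10, ncores = 4;
  const int chunk = nthreads / ncores;         // 2 threads per core
  const int big_cores = nthreads % ncores;     // 2 cores get one extra thread
  const int big_nth = (chunk + 1) * big_cores; // first 6 tids land on big cores
  for (int tid = 0; tid < nthreads; ++tid) {
    int coreID = (tid < big_nth) ? tid / (chunk + 1)
                                 : (tid - big_cores) / chunk;
    std::printf("tid %d -> core %d\n", tid, coreID);
  }
  return 0; // cores 0 and 1 get 3 threads each, cores 2 and 3 get 2 each
}
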
 
 #if KMP_OS_LINUX
@@ -5451,28 +5004,29 @@ void __kmp_balanced_affinity( int tid, i
 #ifdef __cplusplus
 extern "C"
 #endif
-int
-kmp_set_thread_affinity_mask_initial()
+    int
+    kmp_set_thread_affinity_mask_initial()
 // the function returns 0 on success,
 //   -1 if we cannot bind thread
 //   >0 (errno) if an error happened during binding
 {
-    int gtid = __kmp_get_gtid();
-    if (gtid < 0) {
-        // Do not touch non-omp threads
-        KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
-            "non-omp thread, returning\n"));
-        return -1;
-    }
-    if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
-        KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
-            "affinity not initialized, returning\n"));
-        return -1;
-    }
-    KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
-        "set full mask for thread %d\n", gtid));
-    KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
-    return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
+  int gtid = __kmp_get_gtid();
+  if (gtid < 0) {
+    // Do not touch non-omp threads
+    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
+                  "non-omp thread, returning\n"));
+    return -1;
+  }
+  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
+    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
+                  "affinity not initialized, returning\n"));
+    return -1;
+  }
+  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
+                "set full mask for thread %d\n",
+                gtid));
+  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
+  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
 }
 #endif
 

Modified: openmp/trunk/runtime/src/kmp_affinity.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_affinity.h?rev=302929&r1=302928&r2=302929&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_affinity.h (original)
+++ openmp/trunk/runtime/src/kmp_affinity.h Fri May 12 13:01:32 2017
@@ -12,765 +12,827 @@
 //
 //===----------------------------------------------------------------------===//
 
+
 #ifndef KMP_AFFINITY_H
 #define KMP_AFFINITY_H
 
-#include "kmp_os.h"
 #include "kmp.h"
+#include "kmp_os.h"
 
 #if KMP_AFFINITY_SUPPORTED
 #if KMP_USE_HWLOC
-class KMPHwlocAffinity: public KMPAffinity {
+class KMPHwlocAffinity : public KMPAffinity {
 public:
-    class Mask : public KMPAffinity::Mask {
-        hwloc_cpuset_t mask;
-    public:
-        Mask() { mask = hwloc_bitmap_alloc(); this->zero(); }
-        ~Mask() { hwloc_bitmap_free(mask); }
-        void set(int i) override { hwloc_bitmap_set(mask, i); }
-        bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
-        void clear(int i) override { hwloc_bitmap_clr(mask, i); }
-        void zero() override { hwloc_bitmap_zero(mask); }
-        void copy(const KMPAffinity::Mask* src) override {
-            const Mask* convert = static_cast<const Mask*>(src);
-            hwloc_bitmap_copy(mask, convert->mask);
-        }
-        void bitwise_and(const KMPAffinity::Mask* rhs) override {
-            const Mask* convert = static_cast<const Mask*>(rhs);
-            hwloc_bitmap_and(mask, mask, convert->mask);
-        }
-        void bitwise_or(const KMPAffinity::Mask * rhs) override {
-            const Mask* convert = static_cast<const Mask*>(rhs);
-            hwloc_bitmap_or(mask, mask, convert->mask);
-        }
-        void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
-        int begin() const override { return hwloc_bitmap_first(mask); }
-        int end() const override { return -1; }
-        int next(int previous) const override { return hwloc_bitmap_next(mask, previous); }
-        int get_system_affinity(bool abort_on_error) override {
-            KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
-              "Illegal get affinity operation when not capable");
-            int retval = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
-            if (retval >= 0) {
-                return 0;
-            }
-            int error = errno;
-            if (abort_on_error) {
-                __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
-            }
-            return error;
-        }
-        int set_system_affinity(bool abort_on_error) const override {
-            KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
-              "Illegal get affinity operation when not capable");
-            int retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
-            if (retval >= 0) {
-                return 0;
-            }
-            int error = errno;
-            if (abort_on_error) {
-                __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
-            }
-            return error;
-        }
-        int get_proc_group() const override {
-            int i;
-            int group = -1;
-# if KMP_OS_WINDOWS
-            if (__kmp_num_proc_groups == 1) {
-                return 1;
-            }
-            for (i = 0; i < __kmp_num_proc_groups; i++) {
-                // On windows, the long type is always 32 bits
-                unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i*2);
-                unsigned long second_32_bits = hwloc_bitmap_to_ith_ulong(mask, i*2+1);
-                if (first_32_bits == 0 && second_32_bits == 0) {
-                    continue;
-                }
-                if (group >= 0) {
-                    return -1;
-                }
-                group = i;
-            }
-# endif /* KMP_OS_WINDOWS */
-            return group;
-        }
-    };
-    void determine_capable(const char* var) override {
-        const hwloc_topology_support* topology_support;
-        if(__kmp_hwloc_topology == NULL) {
-            if(hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
-                __kmp_hwloc_error = TRUE;
-                if(__kmp_affinity_verbose)
-                    KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
-            }
-            if(hwloc_topology_load(__kmp_hwloc_topology) < 0) {
-                __kmp_hwloc_error = TRUE;
-                if(__kmp_affinity_verbose)
-                    KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
-            }
+  class Mask : public KMPAffinity::Mask {
+    hwloc_cpuset_t mask;
+
+  public:
+    Mask() {
+      mask = hwloc_bitmap_alloc();
+      this->zero();
+    }
+    ~Mask() { hwloc_bitmap_free(mask); }
+    void set(int i) override { hwloc_bitmap_set(mask, i); }
+    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
+    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
+    void zero() override { hwloc_bitmap_zero(mask); }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      hwloc_bitmap_copy(mask, convert->mask);
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      hwloc_bitmap_and(mask, mask, convert->mask);
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      hwloc_bitmap_or(mask, mask, convert->mask);
+    }
+    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
+    int begin() const override { return hwloc_bitmap_first(mask); }
+    int end() const override { return -1; }
+    int next(int previous) const override {
+      return hwloc_bitmap_next(mask, previous);
+    }
+    int get_system_affinity(bool abort_on_error) override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+      int retval =
+          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_msg(kmp_ms_fatal, KMP_MSG(FatalSysError), KMP_ERR(error),
+                  __kmp_msg_null);
+      }
+      return error;
+    }
+    int set_system_affinity(bool abort_on_error) const override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+      int retval =
+          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_msg(kmp_ms_fatal, KMP_MSG(FatalSysError), KMP_ERR(error),
+                  __kmp_msg_null);
+      }
+      return error;
+    }
+    int get_proc_group() const override {
+      int i;
+      int group = -1;
+#if KMP_OS_WINDOWS
+      if (__kmp_num_proc_groups == 1) {
+        return 1;
+      }
+      for (i = 0; i < __kmp_num_proc_groups; i++) {
+        // On Windows, the long type is always 32 bits
+        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
+        unsigned long second_32_bits =
+            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
+        if (first_32_bits == 0 && second_32_bits == 0) {
+          continue;
         }
-        topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
-        // Is the system capable of setting/getting this thread's affinity?
-        // also, is topology discovery possible? (pu indicates ability to discover processing units)
-        // and finally, were there no errors when calling any hwloc_* API functions?
-        if(topology_support && topology_support->cpubind->set_thisthread_cpubind &&
-           topology_support->cpubind->get_thisthread_cpubind &&
-           topology_support->discovery->pu &&
-           !__kmp_hwloc_error)
-        {
-            // enables affinity according to KMP_AFFINITY_CAPABLE() macro
-            KMP_AFFINITY_ENABLE(TRUE);
-        } else {
-            // indicate that hwloc didn't work and disable affinity
-            __kmp_hwloc_error = TRUE;
-            KMP_AFFINITY_DISABLE();
+        if (group >= 0) {
+          return -1;
         }
+        group = i;
+      }
+#endif /* KMP_OS_WINDOWS */
+      return group;
     }
-    void bind_thread(int which) override {
-        KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
-          "Illegal set affinity operation when not capable");
-        KMPAffinity::Mask *mask;
-        KMP_CPU_ALLOC_ON_STACK(mask);
-        KMP_CPU_ZERO(mask);
-        KMP_CPU_SET(which, mask);
-        __kmp_set_system_affinity(mask, TRUE);
-        KMP_CPU_FREE_FROM_STACK(mask);
-    }
-    KMPAffinity::Mask* allocate_mask() override { return new Mask();  }
-    void deallocate_mask(KMPAffinity::Mask* m) override { delete m; }
-    KMPAffinity::Mask* allocate_mask_array(int num) override { return new Mask[num]; }
-    void deallocate_mask_array(KMPAffinity::Mask* array) override {
-        Mask* hwloc_array = static_cast<Mask*>(array);
-        delete[] hwloc_array;
-    }
-    KMPAffinity::Mask* index_mask_array(KMPAffinity::Mask* array, int index) override {
-        Mask* hwloc_array = static_cast<Mask*>(array);
-        return &(hwloc_array[index]);
-    }
-    api_type get_api_type() const override { return HWLOC; }
+  };
+  void determine_capable(const char *var) override {
+    const hwloc_topology_support *topology_support;
+    if (__kmp_hwloc_topology == NULL) {
+      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
+        __kmp_hwloc_error = TRUE;
+        if (__kmp_affinity_verbose)
+          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
+      }
+      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
+        __kmp_hwloc_error = TRUE;
+        if (__kmp_affinity_verbose)
+          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
+      }
+    }
+    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
+    // Is the system capable of setting/getting this thread's affinity?
+    // Also, is topology discovery possible? (pu indicates ability to discover
+    // processing units). And finally, were there no errors when calling any
+    // hwloc_* API functions?
+    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
+        topology_support->cpubind->get_thisthread_cpubind &&
+        topology_support->discovery->pu && !__kmp_hwloc_error) {
+      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
+      KMP_AFFINITY_ENABLE(TRUE);
+    } else {
+      // indicate that hwloc didn't work and disable affinity
+      __kmp_hwloc_error = TRUE;
+      KMP_AFFINITY_DISABLE();
+    }
+  }
+  void bind_thread(int which) override {
+    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                "Illegal set affinity operation when not capable");
+    KMPAffinity::Mask *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
+    KMP_CPU_ZERO(mask);
+    KMP_CPU_SET(which, mask);
+    __kmp_set_system_affinity(mask, TRUE);
+    KMP_CPU_FREE_FROM_STACK(mask);
+  }
+  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
+  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *hwloc_array = static_cast<Mask *>(array);
+    delete[] hwloc_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *hwloc_array = static_cast<Mask *>(array);
+    return &(hwloc_array[index]);
+  }
+  api_type get_api_type() const override { return HWLOC; }
 };
 #endif /* KMP_USE_HWLOC */
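
KMPHwlocAffinity::Mask is a thin wrapper over hwloc bitmaps plus hwloc_get_cpubind/hwloc_set_cpubind. A minimal sketch of the underlying hwloc calls outside the runtime (error handling elided; links with -lhwloc):

#include <hwloc.h>

int bind_self_to_pu(unsigned pu_os_index) {
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);

  hwloc_cpuset_t set = hwloc_bitmap_alloc(); // ~ Mask::Mask() + zero()
  hwloc_bitmap_set(set, pu_os_index);        // ~ Mask::set(i)
  int rc = hwloc_set_cpubind(topo, set,      // ~ Mask::set_system_affinity()
                             HWLOC_CPUBIND_THREAD);

  hwloc_bitmap_free(set);
  hwloc_topology_destroy(topo);
  return rc;
}
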
 
 #if KMP_OS_LINUX
-/*
- * On some of the older OS's that we build on, these constants aren't present
- * in <asm/unistd.h> #included from <sys.syscall.h>.  They must be the same on
- * all systems of the same arch where they are defined, and they cannot change.
- * stone forever.
- */
+/* On some of the older OSes that we build on, these constants aren't present
+   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
+   all systems of the same arch where they are defined, and they cannot change;
+   they are set in stone forever. */
 #include <sys/syscall.h>
-# if KMP_ARCH_X86 || KMP_ARCH_ARM
-#  ifndef __NR_sched_setaffinity
-#   define __NR_sched_setaffinity  241
-#  elif __NR_sched_setaffinity != 241
-#   error Wrong code for setaffinity system call.
-#  endif /* __NR_sched_setaffinity */
-#  ifndef __NR_sched_getaffinity
-#   define __NR_sched_getaffinity  242
-#  elif __NR_sched_getaffinity != 242
-#   error Wrong code for getaffinity system call.
-#  endif /* __NR_sched_getaffinity */
-# elif KMP_ARCH_AARCH64
-#  ifndef __NR_sched_setaffinity
-#   define __NR_sched_setaffinity  122
-#  elif __NR_sched_setaffinity != 122
-#   error Wrong code for setaffinity system call.
-#  endif /* __NR_sched_setaffinity */
-#  ifndef __NR_sched_getaffinity
-#   define __NR_sched_getaffinity  123
-#  elif __NR_sched_getaffinity != 123
-#   error Wrong code for getaffinity system call.
-#  endif /* __NR_sched_getaffinity */
-# elif KMP_ARCH_X86_64
-#  ifndef __NR_sched_setaffinity
-#   define __NR_sched_setaffinity  203
-#  elif __NR_sched_setaffinity != 203
-#   error Wrong code for setaffinity system call.
-#  endif /* __NR_sched_setaffinity */
-#  ifndef __NR_sched_getaffinity
-#   define __NR_sched_getaffinity  204
-#  elif __NR_sched_getaffinity != 204
-#   error Wrong code for getaffinity system call.
-#  endif /* __NR_sched_getaffinity */
-# elif KMP_ARCH_PPC64
-#  ifndef __NR_sched_setaffinity
-#   define __NR_sched_setaffinity  222
-#  elif __NR_sched_setaffinity != 222
-#   error Wrong code for setaffinity system call.
-#  endif /* __NR_sched_setaffinity */
-#  ifndef __NR_sched_getaffinity
-#   define __NR_sched_getaffinity  223
-#  elif __NR_sched_getaffinity != 223
-#   error Wrong code for getaffinity system call.
-#  endif /* __NR_sched_getaffinity */
-#  elif KMP_ARCH_MIPS
-#   ifndef __NR_sched_setaffinity
-#    define __NR_sched_setaffinity  4239
-#   elif __NR_sched_setaffinity != 4239
-#    error Wrong code for setaffinity system call.
-#   endif /* __NR_sched_setaffinity */
-#   ifndef __NR_sched_getaffinity
-#    define __NR_sched_getaffinity  4240
-#   elif __NR_sched_getaffinity != 4240
-#    error Wrong code for getaffinity system call.
-#   endif /* __NR_sched_getaffinity */
-#  elif KMP_ARCH_MIPS64
-#   ifndef __NR_sched_setaffinity
-#    define __NR_sched_setaffinity  5195
-#   elif __NR_sched_setaffinity != 5195
-#    error Wrong code for setaffinity system call.
-#   endif /* __NR_sched_setaffinity */
-#   ifndef __NR_sched_getaffinity
-#    define __NR_sched_getaffinity  5196
-#   elif __NR_sched_getaffinity != 5196
-#    error Wrong code for getaffinity system call.
-#   endif /* __NR_sched_getaffinity */
-#  error Unknown or unsupported architecture
-# endif /* KMP_ARCH_* */
+#if KMP_ARCH_X86 || KMP_ARCH_ARM
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 241
+#elif __NR_sched_setaffinity != 241
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 242
+#elif __NR_sched_getaffinity != 242
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_AARCH64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 122
+#elif __NR_sched_setaffinity != 122
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 123
+#elif __NR_sched_getaffinity != 123
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_X86_64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 203
+#elif __NR_sched_setaffinity != 203
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 204
+#elif __NR_sched_getaffinity != 204
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_PPC64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 222
+#elif __NR_sched_setaffinity != 222
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 223
+#elif __NR_sched_getaffinity != 223
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_MIPS
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 4239
+#elif __NR_sched_setaffinity != 4239
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 4240
+#elif __NR_sched_getaffinity != 4240
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_MIPS64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 5195
+#elif __NR_sched_setaffinity != 5195
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 5196
+#elif __NR_sched_getaffinity != 5196
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#else
+#error Unknown or unsupported architecture
+#endif /* KMP_ARCH_* */
 class KMPNativeAffinity : public KMPAffinity {
-    class Mask : public KMPAffinity::Mask {
-        typedef unsigned char mask_t;
-        static const int BITS_PER_MASK_T = sizeof(mask_t)*CHAR_BIT;
-    public:
-        mask_t* mask;
-        Mask() { mask = (mask_t*)__kmp_allocate(__kmp_affin_mask_size); }
-        ~Mask() { if (mask) __kmp_free(mask); }
-        void set(int i) override { mask[i/BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); }
-        bool is_set(int i) const override { return (mask[i/BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); }
-        void clear(int i) override { mask[i/BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); }
-        void zero() override {
-            for (size_t i=0; i<__kmp_affin_mask_size; ++i)
-                mask[i] = 0;
-        }
-        void copy(const KMPAffinity::Mask* src) override {
-            const Mask * convert = static_cast<const Mask*>(src);
-            for (size_t i=0; i<__kmp_affin_mask_size; ++i)
-                mask[i] = convert->mask[i];
-        }
-        void bitwise_and(const KMPAffinity::Mask* rhs) override {
-            const Mask * convert = static_cast<const Mask*>(rhs);
-            for (size_t i=0; i<__kmp_affin_mask_size; ++i)
-                mask[i] &= convert->mask[i];
-        }
-        void bitwise_or(const KMPAffinity::Mask* rhs) override {
-            const Mask * convert = static_cast<const Mask*>(rhs);
-            for (size_t i=0; i<__kmp_affin_mask_size; ++i)
-                mask[i] |= convert->mask[i];
-        }
-        void bitwise_not() override {
-            for (size_t i=0; i<__kmp_affin_mask_size; ++i)
-                mask[i] = ~(mask[i]);
-        }
-        int begin() const override {
-            int retval = 0;
-            while (retval < end() && !is_set(retval))
-                ++retval;
-            return retval;
-        }
-        int end() const override { return __kmp_affin_mask_size*BITS_PER_MASK_T; }
-        int next(int previous) const override {
-            int retval = previous+1;
-            while (retval < end() && !is_set(retval))
-                ++retval;
-            return retval;
-        }
-        int get_system_affinity(bool abort_on_error) override {
-            KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
-              "Illegal get affinity operation when not capable");
-            int retval = syscall( __NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask );
-            if (retval >= 0) {
-                return 0;
-            }
-            int error = errno;
-            if (abort_on_error) {
-                __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
-            }
-            return error;
-        }
-        int set_system_affinity(bool abort_on_error) const override {
-            KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
-              "Illegal get affinity operation when not capable");
-            int retval = syscall( __NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask );
-            if (retval >= 0) {
-                return 0;
-            }
-            int error = errno;
-            if (abort_on_error) {
-                __kmp_msg(kmp_ms_fatal, KMP_MSG( FatalSysError ), KMP_ERR( error ), __kmp_msg_null);
-            }
-            return error;
-        }
-    };
-    void determine_capable(const char* env_var) override {
-        __kmp_affinity_determine_capable(env_var);
-    }
-    void bind_thread(int which) override {
-        __kmp_affinity_bind_thread(which);
-    }
-    KMPAffinity::Mask* allocate_mask() override {
-        KMPNativeAffinity::Mask* retval = new Mask();
-        return retval;
-    }
-    void deallocate_mask(KMPAffinity::Mask* m) override {
-        KMPNativeAffinity::Mask* native_mask = static_cast<KMPNativeAffinity::Mask*>(m);
-        delete m;
-    }
-    KMPAffinity::Mask* allocate_mask_array(int num) override { return new Mask[num]; }
-    void deallocate_mask_array(KMPAffinity::Mask* array) override {
-        Mask* linux_array = static_cast<Mask*>(array);
-        delete[] linux_array;
-    }
-    KMPAffinity::Mask* index_mask_array(KMPAffinity::Mask* array, int index) override {
-        Mask* linux_array = static_cast<Mask*>(array);
-        return &(linux_array[index]);
-    }
-    api_type get_api_type() const override { return NATIVE_OS; }
+  class Mask : public KMPAffinity::Mask {
+    typedef unsigned char mask_t;
+    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
+
+  public:
+    mask_t *mask;
+    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
+    ~Mask() {
+      if (mask)
+        __kmp_free(mask);
+    }
+    void set(int i) override {
+      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    bool is_set(int i) const override {
+      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
+    }
+    void clear(int i) override {
+      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    void zero() override {
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] = 0;
+    }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] = convert->mask[i];
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] &= convert->mask[i];
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] |= convert->mask[i];
+    }
+    void bitwise_not() override {
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] = ~(mask[i]);
+    }
+    int begin() const override {
+      int retval = 0;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
+    int next(int previous) const override {
+      int retval = previous + 1;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int get_system_affinity(bool abort_on_error) override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+      int retval =
+          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_msg(kmp_ms_fatal, KMP_MSG(FatalSysError), KMP_ERR(error),
+                  __kmp_msg_null);
+      }
+      return error;
+    }
+    int set_system_affinity(bool abort_on_error) const override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+      int retval =
+          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_msg(kmp_ms_fatal, KMP_MSG(FatalSysError), KMP_ERR(error),
+                  __kmp_msg_null);
+      }
+      return error;
+    }
+  };
+  void determine_capable(const char *env_var) override {
+    __kmp_affinity_determine_capable(env_var);
+  }
+  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
+  KMPAffinity::Mask *allocate_mask() override {
+    KMPNativeAffinity::Mask *retval = new Mask();
+    return retval;
+  }
+  void deallocate_mask(KMPAffinity::Mask *m) override {
+    KMPNativeAffinity::Mask *native_mask =
+        static_cast<KMPNativeAffinity::Mask *>(m);
+    delete native_mask;
+  }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *linux_array = static_cast<Mask *>(array);
+    delete[] linux_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *linux_array = static_cast<Mask *>(array);
+    return &(linux_array[index]);
+  }
+  api_type get_api_type() const override { return NATIVE_OS; }
 };
 #endif /* KMP_OS_LINUX */
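For reference, the pattern the Linux Mask class above wraps can be exercised on its own: invoke the raw sched_getaffinity syscall into a byte buffer, then test bits with the same i / BITS_PER_MASK_T and i % BITS_PER_MASK_T arithmetic. This is a minimal standalone sketch, not runtime code; the fixed 128-byte buffer and the variable names are illustrative assumptions, whereas the runtime sizes its buffer from __kmp_affin_mask_size.

// Sketch (Linux-only): query the calling thread's affinity through the raw
// syscall and walk the set bits the way Mask::is_set() does above.
#include <sys/syscall.h>
#include <unistd.h>
#include <climits>
#include <cstdio>

int main() {
  typedef unsigned char mask_t;
  const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
  const size_t mask_size = 128; // bytes; room for 1024 logical CPUs (assumed)
  mask_t mask_bytes[128] = {0};

  // pid 0 means the calling thread; a negative return means the call failed.
  long retval = syscall(__NR_sched_getaffinity, 0, mask_size, mask_bytes);
  if (retval < 0) {
    perror("sched_getaffinity");
    return 1;
  }
  for (size_t i = 0; i < mask_size * BITS_PER_MASK_T; ++i)
    if (mask_bytes[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)))
      printf("CPU %zu is in the calling thread's affinity mask\n", i);
  return 0;
}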
 
 #if KMP_OS_WINDOWS
 class KMPNativeAffinity : public KMPAffinity {
-    class Mask : public KMPAffinity::Mask {
-        typedef ULONG_PTR mask_t;
-        static const int BITS_PER_MASK_T = sizeof(mask_t)*CHAR_BIT;
-        mask_t* mask;
-    public:
-        Mask() { mask = (mask_t*)__kmp_allocate(sizeof(mask_t)*__kmp_num_proc_groups); }
-        ~Mask() { if (mask) __kmp_free(mask); }
-        void set(int i) override { mask[i/BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); }
-        bool is_set(int i) const override { return (mask[i/BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); }
-        void clear(int i) override { mask[i/BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); }
-        void zero() override {
-            for (size_t i=0; i<__kmp_num_proc_groups; ++i)
-                mask[i] = 0;
-        }
-        void copy(const KMPAffinity::Mask* src) override {
-            const Mask * convert = static_cast<const Mask*>(src);
-            for (size_t i=0; i<__kmp_num_proc_groups; ++i)
-                mask[i] = convert->mask[i];
-        }
-        void bitwise_and(const KMPAffinity::Mask* rhs) override {
-            const Mask * convert = static_cast<const Mask*>(rhs);
-            for (size_t i=0; i<__kmp_num_proc_groups; ++i)
-                mask[i] &= convert->mask[i];
-        }
-        void bitwise_or(const KMPAffinity::Mask* rhs) override {
-            const Mask * convert = static_cast<const Mask*>(rhs);
-            for (size_t i=0; i<__kmp_num_proc_groups; ++i)
-                mask[i] |= convert->mask[i];
-        }
-        void bitwise_not() override {
-            for (size_t i=0; i<__kmp_num_proc_groups; ++i)
-                mask[i] = ~(mask[i]);
-        }
-        int begin() const override {
-            int retval = 0;
-            while (retval < end() && !is_set(retval))
-                ++retval;
-            return retval;
-        }
-        int end() const override { return __kmp_num_proc_groups*BITS_PER_MASK_T; }
-        int next(int previous) const override {
-            int retval = previous+1;
-            while (retval < end() && !is_set(retval))
-                ++retval;
-            return retval;
-        }
-        int set_system_affinity(bool abort_on_error) const override {
-            if (__kmp_num_proc_groups > 1) {
-                // Check for a valid mask.
-                GROUP_AFFINITY ga;
-                int group = get_proc_group();
-                if (group < 0) {
-                    if (abort_on_error) {
-                        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
-                    }
-                    return -1;
-                }
-                // Transform the bit vector into a GROUP_AFFINITY struct
-                // and make the system call to set affinity.
-                ga.Group = group;
-                ga.Mask = mask[group];
-                ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
-
-                KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
-                if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
-                    DWORD error = GetLastError();
-                    if (abort_on_error) {
-                        __kmp_msg(kmp_ms_fatal, KMP_MSG( CantSetThreadAffMask ),
-                                  KMP_ERR( error ), __kmp_msg_null);
-                    }
-                    return error;
-                }
-            } else {
-                if (!SetThreadAffinityMask( GetCurrentThread(), *mask )) {
-                    DWORD error = GetLastError();
-                    if (abort_on_error) {
-                        __kmp_msg(kmp_ms_fatal, KMP_MSG( CantSetThreadAffMask ),
-                                  KMP_ERR( error ), __kmp_msg_null);
-                    }
-                    return error;
-                }
-            }
-            return 0;
-        }
-        int get_system_affinity(bool abort_on_error) override {
-            if (__kmp_num_proc_groups > 1) {
-                this->zero();
-                GROUP_AFFINITY ga;
-                KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
-                if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
-                    DWORD error = GetLastError();
-                    if (abort_on_error) {
-                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
-                                  KMP_ERR(error), __kmp_msg_null);
-                    }
-                    return error;
-                }
-                if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) || (ga.Mask == 0)) {
-                    return -1;
-                }
-                mask[ga.Group] = ga.Mask;
-            } else {
-                mask_t newMask, sysMask, retval;
-                if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
-                    DWORD error = GetLastError();
-                    if (abort_on_error) {
-                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
-                                  KMP_ERR(error), __kmp_msg_null);
-                    }
-                    return error;
-                }
-                retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
-                if (! retval) {
-                    DWORD error = GetLastError();
-                    if (abort_on_error) {
-                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
-                                  KMP_ERR(error), __kmp_msg_null);
-                    }
-                    return error;
-                }
-                newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
-                if (! newMask) {
-                    DWORD error = GetLastError();
-                    if (abort_on_error) {
-                        __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
-                                  KMP_ERR(error), __kmp_msg_null);
-                    }
-                }
-                *mask = retval;
-            }
-            return 0;
-        }
-        int get_proc_group() const override {
-            int group = -1;
-            if (__kmp_num_proc_groups == 1) {
-                return 1;
-            }
-            for (int i = 0; i < __kmp_num_proc_groups; i++) {
-                if (mask[i] == 0)
-                    continue;
-                if (group >= 0)
-                    return -1;
-                group = i;
-            }
-            return group;
-        }
-    };
-    void determine_capable(const char* env_var) override {
-        __kmp_affinity_determine_capable(env_var);
-    }
-    void bind_thread(int which) override {
-        __kmp_affinity_bind_thread(which);
-    }
-    KMPAffinity::Mask* allocate_mask() override { return new Mask();  }
-    void deallocate_mask(KMPAffinity::Mask* m) override { delete m; }
-    KMPAffinity::Mask* allocate_mask_array(int num) override { return new Mask[num]; }
-    void deallocate_mask_array(KMPAffinity::Mask* array) override {
-        Mask* windows_array = static_cast<Mask*>(array);
-        delete[] windows_array;
-    }
-    KMPAffinity::Mask* index_mask_array(KMPAffinity::Mask* array, int index) override {
-        Mask* windows_array = static_cast<Mask*>(array);
-        return &(windows_array[index]);
-    }
-    api_type get_api_type() const override { return NATIVE_OS; }
+  class Mask : public KMPAffinity::Mask {
+    typedef ULONG_PTR mask_t;
+    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
+    mask_t *mask;
+
+  public:
+    Mask() {
+      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
+    }
+    ~Mask() {
+      if (mask)
+        __kmp_free(mask);
+    }
+    void set(int i) override {
+      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    bool is_set(int i) const override {
+      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
+    }
+    void clear(int i) override {
+      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    void zero() override {
+      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = 0;
+    }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = convert->mask[i];
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] &= convert->mask[i];
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] |= convert->mask[i];
+    }
+    void bitwise_not() override {
+      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = ~(mask[i]);
+    }
+    int begin() const override {
+      int retval = 0;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
+    int next(int previous) const override {
+      int retval = previous + 1;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int set_system_affinity(bool abort_on_error) const override {
+      if (__kmp_num_proc_groups > 1) {
+        // Check for a valid mask.
+        GROUP_AFFINITY ga;
+        int group = get_proc_group();
+        if (group < 0) {
+          if (abort_on_error) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+          }
+          return -1;
+        }
+        // Transform the bit vector into a GROUP_AFFINITY struct
+        // and make the system call to set affinity.
+        ga.Group = group;
+        ga.Mask = mask[group];
+        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
+
+        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
+        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_msg(kmp_ms_fatal, KMP_MSG(CantSetThreadAffMask),
+                      KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+      } else {
+        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_msg(kmp_ms_fatal, KMP_MSG(CantSetThreadAffMask),
+                      KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+      }
+      return 0;
+    }
+    int get_system_affinity(bool abort_on_error) override {
+      if (__kmp_num_proc_groups > 1) {
+        this->zero();
+        GROUP_AFFINITY ga;
+        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
+        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_msg(kmp_ms_fatal,
+                      KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
+                      KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
+            (ga.Mask == 0)) {
+          return -1;
+        }
+        mask[ga.Group] = ga.Mask;
+      } else {
+        mask_t newMask, sysMask, retval;
+        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_msg(kmp_ms_fatal,
+                      KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
+                      KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
+        if (!retval) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_msg(kmp_ms_fatal,
+                      KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
+                      KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
+        if (!newMask) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_msg(kmp_ms_fatal,
+                      KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
+                      KMP_ERR(error), __kmp_msg_null);
+          }
+        }
+        *mask = retval;
+      }
+      return 0;
+    }
+    int get_proc_group() const override {
+      int group = -1;
+      if (__kmp_num_proc_groups == 1) {
+        return 1;
+      }
+      for (int i = 0; i < __kmp_num_proc_groups; i++) {
+        if (mask[i] == 0)
+          continue;
+        if (group >= 0)
+          return -1;
+        group = i;
+      }
+      return group;
+    }
+  };
+  void determine_capable(const char *env_var) override {
+    __kmp_affinity_determine_capable(env_var);
+  }
+  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
+  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
+  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *windows_array = static_cast<Mask *>(array);
+    delete[] windows_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *windows_array = static_cast<Mask *>(array);
+    return &(windows_array[index]);
+  }
+  api_type get_api_type() const override { return NATIVE_OS; }
 };
 #endif /* KMP_OS_WINDOWS */
 #endif /* KMP_AFFINITY_SUPPORTED */
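Both native classes share the same bookkeeping: a flat logical-CPU index is split into a mask word (a Windows processor group when the word type is ULONG_PTR) and a bit within that word. Below is a small hedged sketch of that arithmetic, using a 64-bit word as a stand-in for ULONG_PTR on 64-bit Windows; the variable names are illustrative, not runtime identifiers.

// Sketch only: the index -> (word, bit) split used by Mask::set/is_set above.
#include <climits>
#include <cstdint>
#include <cstdio>

typedef uint64_t word_t; // stand-in for ULONG_PTR (Windows) or unsigned char (Linux)
static const int BITS_PER_WORD = sizeof(word_t) * CHAR_BIT; // 64 here

int main() {
  int cpu = 70; // a logical CPU on a machine with more than 64 processors
  int word = cpu / BITS_PER_WORD; // which mask word / processor group
  int bit = cpu % BITS_PER_WORD; // which bit inside that word
  printf("CPU %d -> word %d, bit %d\n", cpu, word, bit); // word 1, bit 6
  return 0;
}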
 
 class Address {
 public:
-    static const unsigned maxDepth = 32;
-    unsigned labels[maxDepth];
-    unsigned childNums[maxDepth];
-    unsigned depth;
-    unsigned leader;
-    Address(unsigned _depth)
-      : depth(_depth), leader(FALSE) {
-    }
-    Address &operator=(const Address &b) {
-        depth = b.depth;
-        for (unsigned i = 0; i < depth; i++) {
-            labels[i] = b.labels[i];
-            childNums[i] = b.childNums[i];
-        }
-        leader = FALSE;
-        return *this;
-    }
-    bool operator==(const Address &b) const {
-        if (depth != b.depth)
-            return false;
-        for (unsigned i = 0; i < depth; i++)
-            if(labels[i] != b.labels[i])
-                return false;
-        return true;
-    }
-    bool isClose(const Address &b, int level) const {
-        if (depth != b.depth)
-            return false;
-        if ((unsigned)level >= depth)
-            return true;
-        for (unsigned i = 0; i < (depth - level); i++)
-            if(labels[i] != b.labels[i])
-                return false;
-        return true;
-    }
-    bool operator!=(const Address &b) const {
-        return !operator==(b);
-    }
-    void print() const {
-        unsigned i;
-        printf("Depth: %u --- ", depth);
-        for(i=0;i<depth;i++) {
-            printf("%u ", labels[i]);
-        }
+  static const unsigned maxDepth = 32;
+  unsigned labels[maxDepth];
+  unsigned childNums[maxDepth];
+  unsigned depth;
+  unsigned leader;
+  Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
+  Address &operator=(const Address &b) {
+    depth = b.depth;
+    for (unsigned i = 0; i < depth; i++) {
+      labels[i] = b.labels[i];
+      childNums[i] = b.childNums[i];
+    }
+    leader = FALSE;
+    return *this;
+  }
+  bool operator==(const Address &b) const {
+    if (depth != b.depth)
+      return false;
+    for (unsigned i = 0; i < depth; i++)
+      if (labels[i] != b.labels[i])
+        return false;
+    return true;
+  }
+  bool isClose(const Address &b, int level) const {
+    if (depth != b.depth)
+      return false;
+    if ((unsigned)level >= depth)
+      return true;
+    for (unsigned i = 0; i < (depth - level); i++)
+      if (labels[i] != b.labels[i])
+        return false;
+    return true;
+  }
+  bool operator!=(const Address &b) const { return !operator==(b); }
+  void print() const {
+    unsigned i;
+    printf("Depth: %u --- ", depth);
+    for (i = 0; i < depth; i++) {
+      printf("%u ", labels[i]);
     }
+  }
 };
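A hedged usage fragment for the Address class above (it assumes the surrounding header context, and the label values are made up): two addresses on the same core but on different hardware threads compare unequal, yet isClose() treats them as equal once the bottom level is ignored.

// Sketch only: exercises Address::operator== and Address::isClose; the
// three-level labels (package, core, thread) are illustrative values and
// the helper name is not a runtime identifier.
static void __kmp_address_example() {
  Address a(3), b(3);
  a.labels[0] = 0; a.labels[1] = 2; a.labels[2] = 0; // pkg 0, core 2, thr 0
  b.labels[0] = 0; b.labels[1] = 2; b.labels[2] = 1; // same core, other thread
  KMP_DEBUG_ASSERT(!(a == b)); // thread labels differ
  KMP_DEBUG_ASSERT(a.isClose(b, 1)); // equal once the last level is ignored
}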
 
 class AddrUnsPair {
 public:
-    Address first;
-    unsigned second;
-    AddrUnsPair(Address _first, unsigned _second)
-      : first(_first), second(_second) {
-    }
-    AddrUnsPair &operator=(const AddrUnsPair &b)
-    {
-        first = b.first;
-        second = b.second;
-        return *this;
-    }
-    void print() const {
-        printf("first = "); first.print();
-        printf(" --- second = %u", second);
-    }
-    bool operator==(const AddrUnsPair &b) const {
-        if(first != b.first) return false;
-        if(second != b.second) return false;
-        return true;
-    }
-    bool operator!=(const AddrUnsPair &b) const {
-        return !operator==(b);
-    }
+  Address first;
+  unsigned second;
+  AddrUnsPair(Address _first, unsigned _second)
+      : first(_first), second(_second) {}
+  AddrUnsPair &operator=(const AddrUnsPair &b) {
+    first = b.first;
+    second = b.second;
+    return *this;
+  }
+  void print() const {
+    printf("first = ");
+    first.print();
+    printf(" --- second = %u", second);
+  }
+  bool operator==(const AddrUnsPair &b) const {
+    if (first != b.first)
+      return false;
+    if (second != b.second)
+      return false;
+    return true;
+  }
+  bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
 };
 
-
-static int
-__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
-{
-    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
-      ->first);
-    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
-      ->first);
-    unsigned depth = aa->depth;
-    unsigned i;
-    KMP_DEBUG_ASSERT(depth == bb->depth);
-    for (i  = 0; i < depth; i++) {
-        if (aa->labels[i] < bb->labels[i]) return -1;
-        if (aa->labels[i] > bb->labels[i]) return 1;
-    }
-    return 0;
+static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
+  const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
+  const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
+  unsigned depth = aa->depth;
+  unsigned i;
+  KMP_DEBUG_ASSERT(depth == bb->depth);
+  for (i = 0; i < depth; i++) {
+    if (aa->labels[i] < bb->labels[i])
+      return -1;
+    if (aa->labels[i] > bb->labels[i])
+      return 1;
+  }
+  return 0;
 }
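The comparator is written for qsort(), which is exactly how hierarchy_info::init() below uses it. A hedged fragment follows (adr2os and num_addrs are assumed to come from the topology-detection code elsewhere in the runtime, and <stdlib.h> is assumed to be available in the translation unit, as it is via kmp.h):

// Sketch only: sort (Address, OS-proc) pairs by their topology labels so that
// threads sharing leading labels (same package, same core, ...) end up
// adjacent, which deriveLevels() relies on. Illustrative helper name.
static void __kmp_sort_by_labels_example(AddrUnsPair *adr2os, int num_addrs) {
  qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
}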
 
-
-/** A structure for holding machine-specific hierarchy info to be computed once at init.
-    This structure represents a mapping of threads to the actual machine hierarchy, or to
-    our best guess at what the hierarchy might be, for the purpose of performing an
-    efficient barrier.  In the worst case, when there is no machine hierarchy information,
-    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
+/* A structure for holding machine-specific hierarchy info to be computed once
+   at init. This structure represents a mapping of threads to the actual machine
+   hierarchy, or to our best guess at what the hierarchy might be, for the
+   purpose of performing an efficient barrier. In the worst case, when there is
+   no machine hierarchy information, it produces a tree suitable for a barrier,
+   similar to the tree used in the hyper barrier. */
 class hierarchy_info {
 public:
-    /** Good default values for number of leaves and branching factor, given no affinity information.
-	Behaves a bit like hyper barrier. */
-    static const kmp_uint32 maxLeaves=4;
-    static const kmp_uint32 minBranch=4;
-    /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
-	or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
-	nomenclature.  When the machine is oversubscribed we add levels to duplicate the
-	hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
-    kmp_uint32 maxLevels;
-
-    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
-        number of levels along the longest path from root to any leaf. It corresponds to the
-        number of entries in numPerLevel if we exclude all but one trailing 1. */
-    kmp_uint32 depth;
-    kmp_uint32 base_num_threads;
-    enum init_status { initialized=0, not_initialized=1, initializing=2 };
-    volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, 2=initialization in progress
-    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
-
-    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
-        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
-        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
-    kmp_uint32 *numPerLevel;
-    kmp_uint32 *skipPerLevel;
-
-    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
-        int hier_depth = adr2os[0].first.depth;
-        int level = 0;
-        for (int i=hier_depth-1; i>=0; --i) {
-            int max = -1;
-            for (int j=0; j<num_addrs; ++j) {
-                int next = adr2os[j].first.childNums[i];
-                if (next > max) max = next;
-            }
-            numPerLevel[level] = max+1;
-            ++level;
-        }
-    }
-
-    hierarchy_info() : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
-
-    void fini() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
-
-    void init(AddrUnsPair *adr2os, int num_addrs)
-    {
-        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, not_initialized, initializing);
-        if (bool_result == 0) { // Wait for initialization
-            while (TCR_1(uninitialized) != initialized) KMP_CPU_PAUSE();
-            return;
-        }
-        KMP_DEBUG_ASSERT(bool_result==1);
-
-        /* Added explicit initialization of the data fields here to prevent usage of dirty value
-           observed when static library is re-initialized multiple times (e.g. when
-           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
-        depth = 1;
-        resizing = 0;
-        maxLevels = 7;
-        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
-        skipPerLevel = &(numPerLevel[maxLevels]);
-        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
-            numPerLevel[i] = 1;
-            skipPerLevel[i] = 1;
-        }
+  /* Good default values for number of leaves and branching factor, given no
+     affinity information. Behaves a bit like hyper barrier. */
+  static const kmp_uint32 maxLeaves = 4;
+  static const kmp_uint32 minBranch = 4;
+  /** Number of levels in the hierarchy. Typical levels are threads/core,
+      cores/package or socket, packages/node, nodes/machine, etc. We don't want
+      to get specific with nomenclature. When the machine is oversubscribed we
+      add levels to duplicate the hierarchy, doubling the thread capacity of the
+      hierarchy each time we add a level. */
+  kmp_uint32 maxLevels;
+
+  /** This is specifically the depth of the machine configuration hierarchy, in
+      terms of the number of levels along the longest path from root to any
+      leaf. It corresponds to the number of entries in numPerLevel if we exclude
+      all but one trailing 1. */
+  kmp_uint32 depth;
+  kmp_uint32 base_num_threads;
+  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
+  // 0=initialized, 1=not initialized, 2=initialization in progress
+  volatile kmp_int8 uninitialized;
+  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
+
+  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
+      the parent of a node at level i has. For example, if we have a machine
+      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
+      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
+  kmp_uint32 *numPerLevel;
+  kmp_uint32 *skipPerLevel;
+
+  void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
+    int hier_depth = adr2os[0].first.depth;
+    int level = 0;
+    for (int i = hier_depth - 1; i >= 0; --i) {
+      int max = -1;
+      for (int j = 0; j < num_addrs; ++j) {
+        int next = adr2os[j].first.childNums[i];
+        if (next > max)
+          max = next;
+      }
+      numPerLevel[level] = max + 1;
+      ++level;
+    }
+  }
+
+  hierarchy_info()
+      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
+
+  void fini() {
+    if (!uninitialized && numPerLevel)
+      __kmp_free(numPerLevel);
+  }
+
+  void init(AddrUnsPair *adr2os, int num_addrs) {
+    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
+        &uninitialized, not_initialized, initializing);
+    if (bool_result == 0) { // Wait for initialization
+      while (TCR_1(uninitialized) != initialized)
+        KMP_CPU_PAUSE();
+      return;
+    }
+    KMP_DEBUG_ASSERT(bool_result == 1);
+
+    /* Added explicit initialization of the data fields here to prevent usage of
+       dirty value observed when static library is re-initialized multiple times
+       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
+       OpenMP). */
+    depth = 1;
+    resizing = 0;
+    maxLevels = 7;
+    numPerLevel =
+        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
+    skipPerLevel = &(numPerLevel[maxLevels]);
+    // init numPerLevel[*] to 1 item per level
+    for (kmp_uint32 i = 0; i < maxLevels; ++i) {
+      numPerLevel[i] = 1;
+      skipPerLevel[i] = 1;
+    }
+
+    // Sort table by physical ID
+    if (adr2os) {
+      qsort(adr2os, num_addrs, sizeof(*adr2os),
+            __kmp_affinity_cmp_Address_labels);
+      deriveLevels(adr2os, num_addrs);
+    } else {
+      numPerLevel[0] = maxLeaves;
+      numPerLevel[1] = num_addrs / maxLeaves;
+      if (num_addrs % maxLeaves)
+        numPerLevel[1]++;
+    }
+
+    base_num_threads = num_addrs;
+    // count non-empty levels to get depth; only count one top-level '1'
+    for (int i = maxLevels - 1; i >= 0; --i)
+      if (numPerLevel[i] != 1 || depth > 1)
+        depth++;
+
+    kmp_uint32 branch = minBranch;
+    if (numPerLevel[0] == 1)
+      branch = num_addrs / maxLeaves;
+    if (branch < minBranch)
+      branch = minBranch;
+    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
+      while (numPerLevel[d] > branch ||
+             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
+        if (numPerLevel[d] & 1)
+          numPerLevel[d]++;
+        numPerLevel[d] = numPerLevel[d] >> 1;
+        if (numPerLevel[d + 1] == 1)
+          depth++;
+        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
+      }
+      if (numPerLevel[0] == 1) {
+        branch = branch >> 1;
+        if (branch < 4)
+          branch = minBranch;
+      }
+    }
+
+    for (kmp_uint32 i = 1; i < depth; ++i)
+      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
+    // Fill in hierarchy in the case of oversubscription
+    for (kmp_uint32 i = depth; i < maxLevels; ++i)
+      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
+
+    uninitialized = initialized; // One writer
+
+  }
+
+  // Resize the hierarchy if nproc changes to something larger than before
+  void resize(kmp_uint32 nproc) {
+    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
+    while (bool_result == 0) { // someone else is trying to resize
+      KMP_CPU_PAUSE();
+      if (nproc <= base_num_threads) // happy with other thread's resize
+        return;
+      else // try to resize
+        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
+    }
+    KMP_DEBUG_ASSERT(bool_result != 0);
+    if (nproc <= base_num_threads)
+      return; // happy with other thread's resize
+
+    // Calculate new maxLevels
+    kmp_uint32 old_sz = skipPerLevel[depth - 1];
+    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
+    // First see if old maxLevels is enough to contain new size
+    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
+      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
+      numPerLevel[i - 1] *= 2;
+      old_sz *= 2;
+      depth++;
+    }
+    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
+      while (nproc > old_sz) {
+        old_sz *= 2;
+        incs++;
+        depth++;
+      }
+      maxLevels += incs;
+
+      // Resize arrays
+      kmp_uint32 *old_numPerLevel = numPerLevel;
+      kmp_uint32 *old_skipPerLevel = skipPerLevel;
+      numPerLevel = skipPerLevel = NULL;
+      numPerLevel =
+          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
+      skipPerLevel = &(numPerLevel[maxLevels]);
+
+      // Copy old elements from old arrays
+      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
+        numPerLevel[i] = old_numPerLevel[i];
+        skipPerLevel[i] = old_skipPerLevel[i];
+      }
+
+      // Init new elements in arrays to 1
+      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
+        numPerLevel[i] = 1;
+        skipPerLevel[i] = 1;
+      }
+
+      // Free old arrays
+      __kmp_free(old_numPerLevel);
+    }
+
+    // Fill in oversubscription levels of hierarchy
+    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
+      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 
-        // Sort table by physical ID
-        if (adr2os) {
-            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
-            deriveLevels(adr2os, num_addrs);
-        }
-        else {
-            numPerLevel[0] = maxLeaves;
-            numPerLevel[1] = num_addrs/maxLeaves;
-            if (num_addrs%maxLeaves) numPerLevel[1]++;
-        }
-
-        base_num_threads = num_addrs;
-        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
-            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
-                depth++;
-
-        kmp_uint32 branch = minBranch;
-        if (numPerLevel[0] == 1) branch = num_addrs/maxLeaves;
-        if (branch<minBranch) branch=minBranch;
-        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
-            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>maxLeaves)) { // max 4 on level 0!
-                if (numPerLevel[d] & 1) numPerLevel[d]++;
-                numPerLevel[d] = numPerLevel[d] >> 1;
-                if (numPerLevel[d+1] == 1) depth++;
-                numPerLevel[d+1] = numPerLevel[d+1] << 1;
-            }
-            if(numPerLevel[0] == 1) {
-                branch = branch >> 1;
-                if (branch<4) branch = minBranch;
-            }
-        }
+    base_num_threads = nproc;
+    resizing = 0; // One writer
 
-        for (kmp_uint32 i=1; i<depth; ++i)
-            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
-        // Fill in hierarchy in the case of oversubscription
-        for (kmp_uint32 i=depth; i<maxLevels; ++i)
-            skipPerLevel[i] = 2*skipPerLevel[i-1];
-
-        uninitialized = initialized; // One writer
-
-    }
-
-    // Resize the hierarchy if nproc changes to something larger than before
-    void resize(kmp_uint32 nproc)
-    {
-        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
-        while (bool_result == 0) { // someone else is trying to resize
-            KMP_CPU_PAUSE();
-            if (nproc <= base_num_threads)  // happy with other thread's resize
-                return;
-            else // try to resize
-                bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
-        }
-        KMP_DEBUG_ASSERT(bool_result!=0);
-        if (nproc <= base_num_threads) return; // happy with other thread's resize
-
-        // Calculate new maxLevels
-        kmp_uint32 old_sz = skipPerLevel[depth-1];
-        kmp_uint32 incs = 0, old_maxLevels = maxLevels;
-        // First see if old maxLevels is enough to contain new size
-        for (kmp_uint32 i=depth; i<maxLevels && nproc>old_sz; ++i) {
-            skipPerLevel[i] = 2*skipPerLevel[i-1];
-            numPerLevel[i-1] *= 2;
-            old_sz *= 2;
-            depth++;
-        }
-        if (nproc > old_sz) { // Not enough space, need to expand hierarchy
-            while (nproc > old_sz) {
-                old_sz *=2;
-                incs++;
-                depth++;
-            }
-            maxLevels += incs;
-
-            // Resize arrays
-            kmp_uint32 *old_numPerLevel = numPerLevel;
-            kmp_uint32 *old_skipPerLevel = skipPerLevel;
-            numPerLevel = skipPerLevel = NULL;
-            numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
-            skipPerLevel = &(numPerLevel[maxLevels]);
-
-            // Copy old elements from old arrays
-            for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
-                numPerLevel[i] = old_numPerLevel[i];
-                skipPerLevel[i] = old_skipPerLevel[i];
-            }
-
-            // Init new elements in arrays to 1
-            for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
-                numPerLevel[i] = 1;
-                skipPerLevel[i] = 1;
-            }
-
-            // Free old arrays
-            __kmp_free(old_numPerLevel);
-        }
-
-        // Fill in oversubscription levels of hierarchy
-        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
-            skipPerLevel[i] = 2*skipPerLevel[i-1];
-
-        base_num_threads = nproc;
-        resizing = 0; // One writer
-
-    }
+  }
 };
 #endif // KMP_AFFINITY_H
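To make the numPerLevel / skipPerLevel relationship concrete, here is a hedged, standalone worked example for the machine from the comment above (4 packages, 4 cores per package, 2 hardware threads per core). The values follow init(): skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1] up to the depth, then doubling for the oversubscription levels; the width-optimization pass leaves this particular shape untouched because no level exceeds the branching factor of 4.

// Sketch only (not runtime code): reproduce the numPerLevel example from the
// comment above and derive skipPerLevel the way hierarchy_info::init() does.
#include <cstdio>

int main() {
  const int maxLevels = 7, depth = 4; // three real levels plus one top-level '1'
  unsigned numPerLevel[maxLevels] = {2, 4, 4, 1, 1, 1, 1}; // HT, cores, packages
  unsigned skipPerLevel[maxLevels] = {1, 1, 1, 1, 1, 1, 1};
  for (int i = 1; i < depth; ++i) // leaves spanned by one subtree at level i
    skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
  for (int i = depth; i < maxLevels; ++i) // oversubscription levels double
    skipPerLevel[i] = 2 * skipPerLevel[i - 1];
  // Expected: skipPerLevel = {1, 2, 8, 32, 64, 128, 256} -- a core spans 2
  // threads, a package 8, the whole machine 32.
  for (int i = 0; i < maxLevels; ++i)
    printf("level %d: numPerLevel=%u skipPerLevel=%u\n", i, numPerLevel[i],
           skipPerLevel[i]);
  return 0;
}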

Modified: openmp/trunk/runtime/src/kmp_alloc.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_alloc.cpp?rev=302929&r1=302928&r2=302929&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_alloc.cpp (original)
+++ openmp/trunk/runtime/src/kmp_alloc.cpp Fri May 12 13:01:32 2017
@@ -14,742 +14,679 @@
 
 
 #include "kmp.h"
-#include "kmp_wrapper_malloc.h"
 #include "kmp_io.h"
+#include "kmp_wrapper_malloc.h"
 
 // Disable bget when it is not used
 #if KMP_USE_BGET
 
 /* Thread private buffer management code */
 
-typedef int   (*bget_compact_t)(size_t, int);
+typedef int (*bget_compact_t)(size_t, int);
 typedef void *(*bget_acquire_t)(size_t);
-typedef void  (*bget_release_t)(void *);
+typedef void (*bget_release_t)(void *);
 
 /* NOTE: bufsize must be a signed datatype */
 
 #if KMP_OS_WINDOWS
-# if KMP_ARCH_X86 || KMP_ARCH_ARM
-   typedef kmp_int32 bufsize;
-# else
-   typedef kmp_int64 bufsize;
-# endif
+#if KMP_ARCH_X86 || KMP_ARCH_ARM
+typedef kmp_int32 bufsize;
+#else
+typedef kmp_int64 bufsize;
+#endif
 #else
-  typedef ssize_t bufsize;
+typedef ssize_t bufsize;
 #endif
 
 /* The three modes of operation are, fifo search, lifo search, and best-fit */
 
 typedef enum bget_mode {
-    bget_mode_fifo = 0,
-    bget_mode_lifo = 1,
-    bget_mode_best = 2
+  bget_mode_fifo = 0,
+  bget_mode_lifo = 1,
+  bget_mode_best = 2
 } bget_mode_t;
 
-
-static void    bpool( kmp_info_t *th, void *buffer, bufsize len);
-static void   *bget( kmp_info_t *th, bufsize size);
-static void   *bgetz( kmp_info_t *th, bufsize size);
-static void   *bgetr( kmp_info_t *th, void *buffer, bufsize newsize);
-static void    brel( kmp_info_t *th, void *buf);
-static void    bectl(  kmp_info_t *th, bget_compact_t compact, bget_acquire_t acquire, bget_release_t release, bufsize pool_incr );
+static void bpool(kmp_info_t *th, void *buffer, bufsize len);
+static void *bget(kmp_info_t *th, bufsize size);
+static void *bgetz(kmp_info_t *th, bufsize size);
+static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
+static void brel(kmp_info_t *th, void *buf);
+static void bectl(kmp_info_t *th, bget_compact_t compact,
+                  bget_acquire_t acquire, bget_release_t release,
+                  bufsize pool_incr);
 
 #ifdef KMP_DEBUG
-static void    bstats( kmp_info_t *th, bufsize *curalloc, bufsize *totfree, bufsize *maxfree, long *nget, long *nrel);
-static void    bstatse( kmp_info_t *th, bufsize *pool_incr, long *npool, long *npget, long *nprel, long *ndget, long *ndrel);
-static void    bufdump( kmp_info_t *th, void *buf);
-static void    bpoold( kmp_info_t *th, void *pool, int dumpalloc, int dumpfree);
-static int     bpoolv( kmp_info_t *th, void *pool);
+static void bstats(kmp_info_t *th, bufsize *curalloc, bufsize *totfree,
+                   bufsize *maxfree, long *nget, long *nrel);
+static void bstatse(kmp_info_t *th, bufsize *pool_incr, long *npool,
+                    long *npget, long *nprel, long *ndget, long *ndrel);
+static void bufdump(kmp_info_t *th, void *buf);
+static void bpoold(kmp_info_t *th, void *pool, int dumpalloc, int dumpfree);
+static int bpoolv(kmp_info_t *th, void *pool);
 #endif
 
 /* BGET CONFIGURATION */
-                                      /* Buffer allocation size quantum:
-                                         all buffers allocated are a
-                                         multiple of this size.  This
-                                         MUST be a power of two. */
-
-                                      /* On IA-32 architecture with  Linux* OS,
-                                         malloc() does not
-                                         ensure 16 byte alignmnent */
+/* Buffer allocation size quantum: all buffers allocated are a
+   multiple of this size.  This MUST be a power of two. */
+
+/* On IA-32 architecture with Linux* OS, malloc() does not
+   ensure 16 byte alignment */
 
 #if KMP_ARCH_X86 || !KMP_HAVE_QUAD
 
-#define SizeQuant   8
-#define AlignType   double
+#define SizeQuant 8
+#define AlignType double
 
 #else
 
-#define SizeQuant   16
-#define AlignType   _Quad
+#define SizeQuant 16
+#define AlignType _Quad
 
 #endif
 
-#define BufStats    1                 /* Define this symbol to enable the
-                                         bstats() function which calculates
-                                         the total free space in the buffer
-                                         pool, the largest available
-                                         buffer, and the total space
-                                         currently allocated. */
+// Define this symbol to enable the bstats() function which calculates the
+// total free space in the buffer pool, the largest available buffer, and the
+// total space currently allocated.
+#define BufStats 1
 
 #ifdef KMP_DEBUG
 
-#define BufDump     1                 /* Define this symbol to enable the
-                                         bpoold() function which dumps the
-                                         buffers in a buffer pool. */
-
-#define BufValid    1                 /* Define this symbol to enable the
-                                         bpoolv() function for validating
-                                         a buffer pool. */
-
-#define DumpData    1                 /* Define this symbol to enable the
-                                         bufdump() function which allows
-                                         dumping the contents of an allocated
-                                         or free buffer. */
+// Define this symbol to enable the bpoold() function which dumps the buffers
+// in a buffer pool.
+#define BufDump 1
+
+// Define this symbol to enable the bpoolv() function for validating a buffer
+// pool.
+#define BufValid 1
+
+// Define this symbol to enable the bufdump() function which allows dumping the
+// contents of an allocated or free buffer.
+#define DumpData 1
+
 #ifdef NOT_USED_NOW
 
-#define FreeWipe    1                 /* Wipe free buffers to a guaranteed
-                                         pattern of garbage to trip up
-                                         miscreants who attempt to use
-                                         pointers into released buffers. */
-
-#define BestFit     1                 /* Use a best fit algorithm when
-                                         searching for space for an
-                                         allocation request.  This uses
-                                         memory more efficiently, but
-                                         allocation will be much slower. */
+// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
+// who attempt to use pointers into released buffers.
+#define FreeWipe 1
+
+// Use a best fit algorithm when searching for space for an allocation request.
+// This uses memory more efficiently, but allocation will be much slower.
+#define BestFit 1
+
 #endif /* NOT_USED_NOW */
 #endif /* KMP_DEBUG */
 
-
-static bufsize bget_bin_size[ ] = {
+static bufsize bget_bin_size[] = {
     0,
-//    1 << 6,    /* .5 Cache line */
-    1 << 7,    /* 1 Cache line, new */
-    1 << 8,    /* 2 Cache lines */
-    1 << 9,    /* 4 Cache lines, new */
-    1 << 10,   /* 8 Cache lines */
-    1 << 11,   /* 16 Cache lines, new */
-    1 << 12,
-    1 << 13,   /* new */
-    1 << 14,
-    1 << 15,   /* new */
-    1 << 16,
-    1 << 17,
-    1 << 18,
-    1 << 19,
-    1 << 20,    /*  1MB */
-    1 << 21,    /*  2MB */
-    1 << 22,    /*  4MB */
-    1 << 23,    /*  8MB */
-    1 << 24,    /* 16MB */
-    1 << 25,    /* 32MB */
+    //    1 << 6,    /* .5 Cache line */
+    1 << 7, /* 1 Cache line, new */
+    1 << 8, /* 2 Cache lines */
+    1 << 9, /* 4 Cache lines, new */
+    1 << 10, /* 8 Cache lines */
+    1 << 11, /* 16 Cache lines, new */
+    1 << 12,
+    1 << 13, /* new */
+    1 << 14,
+    1 << 15, /* new */
+    1 << 16,
+    1 << 17,
+    1 << 18,
+    1 << 19,
+    1 << 20, /*  1MB */
+    1 << 21, /*  2MB */
+    1 << 22, /*  4MB */
+    1 << 23, /*  8MB */
+    1 << 24, /* 16MB */
+    1 << 25, /* 32MB */
 };
 
-#define MAX_BGET_BINS   (int)(sizeof(bget_bin_size) / sizeof(bufsize))
+#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
 
 struct bfhead;
 
-/*  Declare the interface, including the requested buffer size type,
-    bufsize.  */
+//  Declare the interface, including the requested buffer size type, bufsize.
 
 /* Queue links */
-
 typedef struct qlinks {
-    struct bfhead *flink;             /* Forward link */
-    struct bfhead *blink;             /* Backward link */
+  struct bfhead *flink; /* Forward link */
+  struct bfhead *blink; /* Backward link */
 } qlinks_t;
 
 /* Header in allocated and free buffers */
-
 typedef struct bhead2 {
-    kmp_info_t *bthr;                 /* The thread which owns the buffer pool */
-    bufsize     prevfree;             /* Relative link back to previous
-                                         free buffer in memory or 0 if
-                                         previous buffer is allocated.  */
-    bufsize     bsize;                /* Buffer size: positive if free,
-                                         negative if allocated. */
+  kmp_info_t *bthr; /* The thread which owns the buffer pool */
+  bufsize prevfree; /* Relative link back to previous free buffer in memory or
+                       0 if previous buffer is allocated.  */
+  bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
 } bhead2_t;
 
 /* Make sure the bhead structure is a multiple of SizeQuant in size. */
-
 typedef union bhead {
-    KMP_ALIGN( SizeQuant )
-    AlignType           b_align;
-    char                b_pad[ sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant)) ];
-    bhead2_t            bb;
+  KMP_ALIGN(SizeQuant)
+  AlignType b_align;
+  char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
+  bhead2_t bb;
 } bhead_t;
-#define BH(p)   ((bhead_t *) (p))
+#define BH(p) ((bhead_t *)(p))
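The b_pad member exists only to round the header up to the allocation quantum. A hedged compile-time check of that property follows (plain C++11 static_assert, placed here purely for illustration; the real header does not carry it):

// Sketch only: verify the sizing guarantee stated in the comment above.
static_assert(sizeof(bhead_t) % SizeQuant == 0,
              "bhead_t must be a multiple of the allocation quantum");
static_assert(sizeof(bhead_t) >= sizeof(bhead2_t),
              "padding may only grow the header");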
 
 /*  Header in directly allocated buffers (by acqfcn) */
-
-typedef struct bdhead
-{
-    bufsize tsize;                    /* Total size, including overhead */
-    bhead_t bh;                       /* Common header */
+typedef struct bdhead {
+  bufsize tsize; /* Total size, including overhead */
+  bhead_t bh; /* Common header */
 } bdhead_t;
-#define BDH(p)  ((bdhead_t *) (p))
+#define BDH(p) ((bdhead_t *)(p))
 
 /* Header in free buffers */
-
 typedef struct bfhead {
-    bhead_t  bh;                      /* Common allocated/free header */
-    qlinks_t ql;                      /* Links on free list */
+  bhead_t bh; /* Common allocated/free header */
+  qlinks_t ql; /* Links on free list */
 } bfhead_t;
-#define BFH(p)  ((bfhead_t *) (p))
+#define BFH(p) ((bfhead_t *)(p))
 
 typedef struct thr_data {
-    bfhead_t freelist[ MAX_BGET_BINS ];
+  bfhead_t freelist[MAX_BGET_BINS];
 #if BufStats
-    size_t totalloc;               /* Total space currently allocated */
-    long numget, numrel;           /* Number of bget() and brel() calls */
-    long numpblk;                  /* Number of pool blocks */
-    long numpget, numprel;         /* Number of block gets and rels */
-    long numdget, numdrel;         /* Number of direct gets and rels */
+  size_t totalloc; /* Total space currently allocated */
+  long numget, numrel; /* Number of bget() and brel() calls */
+  long numpblk; /* Number of pool blocks */
+  long numpget, numprel; /* Number of block gets and rels */
+  long numdget, numdrel; /* Number of direct gets and rels */
 #endif /* BufStats */
 
-    /* Automatic expansion block management functions */
-    bget_compact_t compfcn;
-    bget_acquire_t acqfcn;
-    bget_release_t relfcn;
-
-    bget_mode_t    mode;              /* what allocation mode to use? */
-
-    bufsize exp_incr;                 /* Expansion block size */
-    bufsize pool_len;                 /* 0: no bpool calls have been made
-                                         -1: not all pool blocks are
-                                             the same size
-                                         >0: (common) block size for all
-                                             bpool calls made so far
-                                      */
-    bfhead_t * last_pool;             /* Last pool owned by this thread (delay dealocation) */
+  /* Automatic expansion block management functions */
+  bget_compact_t compfcn;
+  bget_acquire_t acqfcn;
+  bget_release_t relfcn;
+
+  bget_mode_t mode; /* what allocation mode to use? */
+
+  bufsize exp_incr; /* Expansion block size */
+  bufsize pool_len; /* 0: no bpool calls have been made
+                       -1: not all pool blocks are the same size
+                       >0: (common) block size for all bpool calls made so far
+                    */
+  bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */
 } thr_data_t;
 
 /*  Minimum allocation quantum: */
-
-#define QLSize  (sizeof(qlinks_t))
-#define SizeQ   ((SizeQuant > QLSize) ? SizeQuant : QLSize)
-#define MaxSize (bufsize)( ~ ( ( (bufsize)( 1 ) << ( sizeof( bufsize ) * CHAR_BIT - 1 ) ) | ( SizeQuant - 1 ) ) )
-    // Maximun for the requested size.
+#define QLSize (sizeof(qlinks_t))
+#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
+#define MaxSize                                                                \
+  (bufsize)(                                                                   \
+      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
+// Maximum for the requested size.
 
 /* End sentinel: value placed in bsize field of dummy block delimiting
    end of pool block.  The most negative number which will  fit  in  a
    bufsize, defined in a way that the compiler will accept. */
 
-#define ESent   ((bufsize) (-(((((bufsize)1)<<((int)sizeof(bufsize)*8-2))-1)*2)-2))
-
-/* ------------------------------------------------------------------------ */
+#define ESent                                                                  \
+  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
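
Concretely, the expression builds the most negative representable value
without writing an out-of-range literal: shift 1 into the next-to-top bit,
subtract one, double, negate, and subtract two. A compile-time sketch of what
it expands to, modelling bufsize as a plain int (an assumption made only for
the illustration):

#include <climits>

// Stand-in for bufsize, used only to make the arithmetic concrete.
typedef int demo_bufsize;
#define DEMO_ESENT                                                             \
  ((demo_bufsize)(                                                             \
      -(((((demo_bufsize)1) << ((int)sizeof(demo_bufsize) * 8 - 2)) - 1) * 2) -\
      2))
static_assert(DEMO_ESENT == INT_MIN,
              "ESent is the most negative value the type can hold");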
 
 /* Thread Data management routines */
+static int bget_get_bin(bufsize size) {
+  // binary chop bins
+  int lo = 0, hi = MAX_BGET_BINS - 1;
 
-static int
-bget_get_bin( bufsize size )
-{
-    // binary chop bins
-    int lo = 0, hi = MAX_BGET_BINS - 1;
-
-    KMP_DEBUG_ASSERT( size > 0 );
+  KMP_DEBUG_ASSERT(size > 0);
 
-    while ( (hi - lo) > 1 ) {
-        int mid = (lo + hi) >> 1;
-        if (size < bget_bin_size[ mid ])
-            hi = mid - 1;
-        else
-            lo = mid;
-    }
+  while ((hi - lo) > 1) {
+    int mid = (lo + hi) >> 1;
+    if (size < bget_bin_size[mid])
+      hi = mid - 1;
+    else
+      lo = mid;
+  }
 
-    KMP_DEBUG_ASSERT( (lo >= 0) && (lo < MAX_BGET_BINS) );
+  KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));
 
-    return lo;
+  return lo;
 }
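
For reference, bget_get_bin() above is a plain binary chop over the sorted
bget_bin_size table rather than a linear scan. A self-contained sketch with a
hypothetical four-entry table (not the runtime's real one) shows the same
lo/hi narrowing:

#include <cassert>

// Shortened, made-up bin table purely for illustration.
static const long demo_bins[] = {64, 128, 256, 512};
enum { DEMO_NBINS = sizeof(demo_bins) / sizeof(demo_bins[0]) };

// Same lo/hi narrowing as bget_get_bin(): home in on the bin for a request of
// the given size without scanning the table linearly.
static int demo_get_bin(long size) {
  int lo = 0, hi = DEMO_NBINS - 1;
  assert(size > 0);
  while ((hi - lo) > 1) {
    int mid = (lo + hi) >> 1;
    if (size < demo_bins[mid])
      hi = mid - 1;
    else
      lo = mid;
  }
  return lo;
}

int main() {
  assert(demo_get_bin(300) == 2); // 300 bytes lands in the 256-byte bin
  return 0;
}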
 
-static void
-set_thr_data( kmp_info_t *th )
-{
-    int i;
-    thr_data_t *data;
+static void set_thr_data(kmp_info_t *th) {
+  int i;
+  thr_data_t *data;
 
-    data =
-        (thr_data_t *)(
-            ( ! th->th.th_local.bget_data ) ? __kmp_allocate( sizeof( *data ) ) : th->th.th_local.bget_data
-        );
+  data = (thr_data_t *)((!th->th.th_local.bget_data)
+                            ? __kmp_allocate(sizeof(*data))
+                            : th->th.th_local.bget_data);
 
-    memset( data, '\0', sizeof( *data ) );
+  memset(data, '\0', sizeof(*data));
 
-    for (i = 0; i < MAX_BGET_BINS; ++i) {
-        data->freelist[ i ].ql.flink = & data->freelist[ i ];
-        data->freelist[ i ].ql.blink = & data->freelist[ i ];
-    }
+  for (i = 0; i < MAX_BGET_BINS; ++i) {
+    data->freelist[i].ql.flink = &data->freelist[i];
+    data->freelist[i].ql.blink = &data->freelist[i];
+  }
 
-    th->th.th_local.bget_data = data;
-    th->th.th_local.bget_list = 0;
-#if ! USE_CMP_XCHG_FOR_BGET
+  th->th.th_local.bget_data = data;
+  th->th.th_local.bget_list = 0;
+#if !USE_CMP_XCHG_FOR_BGET
 #ifdef USE_QUEUING_LOCK_FOR_BGET
-    __kmp_init_lock( & th->th.th_local.bget_lock );
+  __kmp_init_lock(&th->th.th_local.bget_lock);
 #else
-    __kmp_init_bootstrap_lock( & th->th.th_local.bget_lock );
+  __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
 #endif /* USE_LOCK_FOR_BGET */
 #endif /* ! USE_CMP_XCHG_FOR_BGET */
 }
 
-static thr_data_t *
-get_thr_data( kmp_info_t *th )
-{
-    thr_data_t *data;
+static thr_data_t *get_thr_data(kmp_info_t *th) {
+  thr_data_t *data;
 
-    data = (thr_data_t *) th->th.th_local.bget_data;
+  data = (thr_data_t *)th->th.th_local.bget_data;
 
-    KMP_DEBUG_ASSERT( data != 0 );
+  KMP_DEBUG_ASSERT(data != 0);
 
-    return data;
+  return data;
 }
 
-
 #ifdef KMP_DEBUG
 
-static void
-__kmp_bget_validate_queue( kmp_info_t *th )
-{
-    /* NOTE: assume that the global_lock is held */
+static void __kmp_bget_validate_queue(kmp_info_t *th) {
+  /* NOTE: assume that the global_lock is held */
 
-    void *p = (void *) th->th.th_local.bget_list;
+  void *p = (void *)th->th.th_local.bget_list;
 
-    while (p != 0) {
-        bfhead_t *b = BFH(((char *) p) - sizeof(bhead_t));
+  while (p != 0) {
+    bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
 
-        KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
-        p = (void *) b->ql.flink;
-    }
+    KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+    p = (void *)b->ql.flink;
+  }
 }
 
 #endif
 
 /* Walk the free list and release the enqueued buffers */
+static void __kmp_bget_dequeue(kmp_info_t *th) {
+  void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
 
-static void
-__kmp_bget_dequeue( kmp_info_t *th )
-{
-    void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
-
-    if (p != 0) {
-        #if USE_CMP_XCHG_FOR_BGET
-            {
-                volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
-                while ( ! KMP_COMPARE_AND_STORE_PTR(
-                    & th->th.th_local.bget_list, old_value, NULL ) )
-                {
-                    KMP_CPU_PAUSE();
-                    old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
-                }
-                p = (void *) old_value;
-            }
-        #else /* ! USE_CMP_XCHG_FOR_BGET */
-            #ifdef USE_QUEUING_LOCK_FOR_BGET
-                __kmp_acquire_lock( & th->th.th_local.bget_lock,
-                                    __kmp_gtid_from_thread(th) );
-            #else
-                __kmp_acquire_bootstrap_lock( & th->th.th_local.bget_lock );
-            #endif /* USE_QUEUING_LOCK_FOR_BGET */
-
-             p = (void *) th->th.th_local.bget_list;
-             th->th.th_local.bget_list = 0;
-
-            #ifdef USE_QUEUING_LOCK_FOR_BGET
-                __kmp_release_lock( & th->th.th_local.bget_lock,
-                                    __kmp_gtid_from_thread(th) );
-            #else
-                __kmp_release_bootstrap_lock( & th->th.th_local.bget_lock );
-            #endif
-        #endif /* USE_CMP_XCHG_FOR_BGET */
-
-        /* Check again to make sure the list is not empty */
-
-        while (p != 0) {
-            void *buf = p;
-            bfhead_t *b = BFH(((char *) p) - sizeof(bhead_t));
-
-            KMP_DEBUG_ASSERT( b->bh.bb.bsize != 0 );
-            KMP_DEBUG_ASSERT( ( (kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1 ) ==
-                                (kmp_uintptr_t)th ); // clear possible mark
-            KMP_DEBUG_ASSERT( b->ql.blink == 0 );
+  if (p != 0) {
+#if USE_CMP_XCHG_FOR_BGET
+    {
+      volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
+      while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, old_value,
+                                        NULL)) {
+        KMP_CPU_PAUSE();
+        old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
+      }
+      p = (void *)old_value;
+    }
+#else /* ! USE_CMP_XCHG_FOR_BGET */
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
+#else
+    __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif /* USE_QUEUING_LOCK_FOR_BGET */
 
-            p = (void *) b->ql.flink;
+    p = (void *)th->th.th_local.bget_list;
+    th->th.th_local.bget_list = 0;
 
-            brel( th, buf );
-        }
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
+#else
+    __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif
+#endif /* USE_CMP_XCHG_FOR_BGET */
+
+    /* Check again to make sure the list is not empty */
+    while (p != 0) {
+      void *buf = p;
+      bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
+
+      KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+      KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
+                       (kmp_uintptr_t)th); // clear possible mark
+      KMP_DEBUG_ASSERT(b->ql.blink == 0);
+
+      p = (void *)b->ql.flink;
+
+      brel(th, buf);
     }
+  }
 }
 
 /* Chain together the free buffers by using the thread owner field */
-
-static void
-__kmp_bget_enqueue( kmp_info_t *th, void *buf
+static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
 #ifdef USE_QUEUING_LOCK_FOR_BGET
-                    , kmp_int32 rel_gtid
+                               ,
+                               kmp_int32 rel_gtid
 #endif
-                  )
-{
-    bfhead_t *b = BFH(((char *) buf) - sizeof(bhead_t));
+                               ) {
+  bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
 
-    KMP_DEBUG_ASSERT( b->bh.bb.bsize != 0 );
-    KMP_DEBUG_ASSERT( ( (kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1 ) ==
-                        (kmp_uintptr_t)th ); // clear possible mark
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
+                   (kmp_uintptr_t)th); // clear possible mark
 
-    b->ql.blink = 0;
+  b->ql.blink = 0;
 
-    KC_TRACE( 10, ( "__kmp_bget_enqueue: moving buffer to T#%d list\n",
-                    __kmp_gtid_from_thread( th ) ) );
+  KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
+                __kmp_gtid_from_thread(th)));
 
 #if USE_CMP_XCHG_FOR_BGET
-    {
-        volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
-        /* the next pointer must be set before setting bget_list to buf to avoid
-           exposing a broken list to other threads, even for an instant. */
-        b->ql.flink = BFH( old_value );
-
-        while ( ! KMP_COMPARE_AND_STORE_PTR(
-            & th->th.th_local.bget_list, old_value, buf ) )
-        {
-            KMP_CPU_PAUSE();
-            old_value = TCR_PTR(th->th.th_local.bget_list);
-            /* the next pointer must be set before setting bget_list to buf to avoid
-               exposing a broken list to other threads, even for an instant. */
-            b->ql.flink = BFH( old_value );
-        }
+  {
+    volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
+    /* the next pointer must be set before setting bget_list to buf to avoid
+       exposing a broken list to other threads, even for an instant. */
+    b->ql.flink = BFH(old_value);
+
+    while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, old_value,
+                                      buf)) {
+      KMP_CPU_PAUSE();
+      old_value = TCR_PTR(th->th.th_local.bget_list);
+      /* the next pointer must be set before setting bget_list to buf to avoid
+         exposing a broken list to other threads, even for an instant. */
+      b->ql.flink = BFH(old_value);
     }
+  }
 #else /* ! USE_CMP_XCHG_FOR_BGET */
-# ifdef USE_QUEUING_LOCK_FOR_BGET
-    __kmp_acquire_lock( & th->th.th_local.bget_lock, rel_gtid );
-# else
-    __kmp_acquire_bootstrap_lock( & th->th.th_local.bget_lock );
- # endif
-
-    b->ql.flink = BFH( th->th.th_local.bget_list );
-    th->th.th_local.bget_list = (void *) buf;
-
-# ifdef USE_QUEUING_LOCK_FOR_BGET
-    __kmp_release_lock( & th->th.th_local.bget_lock, rel_gtid );
-# else
-    __kmp_release_bootstrap_lock( & th->th.th_local.bget_lock );
-# endif
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+  __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
+#else
+  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif
+
+  b->ql.flink = BFH(th->th.th_local.bget_list);
+  th->th.th_local.bget_list = (void *)buf;
+
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+  __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
+#else
+  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif
 #endif /* USE_CMP_XCHG_FOR_BGET */
 }
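
The USE_CMP_XCHG_FOR_BGET path above is the standard lock-free list push: link
the new node to the current head first, then try to swing the head with a
compare-and-swap, retrying on contention. A generic sketch of the same pattern
in portable C++11 atomics (illustration only; the runtime itself uses its
TCR_PTR / KMP_COMPARE_AND_STORE_PTR wrappers):

#include <atomic>

struct demo_node {
  demo_node *next;
};

// Push one node onto a lock-free singly linked list, mirroring
// __kmp_bget_enqueue(): the node's next pointer is published before the head
// is updated, so other threads never observe a broken list.
static void demo_push(std::atomic<demo_node *> &head, demo_node *n) {
  demo_node *old_head = head.load(std::memory_order_relaxed);
  do {
    n->next = old_head; // set the forward link first
  } while (!head.compare_exchange_weak(old_head, n, std::memory_order_release,
                                       std::memory_order_relaxed));
}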
 
 /* insert buffer back onto a new freelist */
+static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
+  int bin;
 
-static void
-__kmp_bget_insert_into_freelist( thr_data_t *thr, bfhead_t *b )
-{
-    int bin;
+  KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
 
-    KMP_DEBUG_ASSERT( ((size_t)b ) % SizeQuant == 0 );
-    KMP_DEBUG_ASSERT( b->bh.bb.bsize % SizeQuant == 0 );
+  bin = bget_get_bin(b->bh.bb.bsize);
 
-    bin = bget_get_bin( b->bh.bb.bsize );
+  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
+                   &thr->freelist[bin]);
+  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
+                   &thr->freelist[bin]);
 
-    KMP_DEBUG_ASSERT(thr->freelist[ bin ].ql.blink->ql.flink == &thr->freelist[ bin ]);
-    KMP_DEBUG_ASSERT(thr->freelist[ bin ].ql.flink->ql.blink == &thr->freelist[ bin ]);
+  b->ql.flink = &thr->freelist[bin];
+  b->ql.blink = thr->freelist[bin].ql.blink;
 
-    b->ql.flink = &thr->freelist[ bin ];
-    b->ql.blink = thr->freelist[ bin ].ql.blink;
-
-    thr->freelist[ bin ].ql.blink = b;
-    b->ql.blink->ql.flink = b;
+  thr->freelist[bin].ql.blink = b;
+  b->ql.blink->ql.flink = b;
 }
 
 /* unlink the buffer from the old freelist */
+static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
+  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
+  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
 
-static void
-__kmp_bget_remove_from_freelist( bfhead_t *b )
-{
-    KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
-    KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
-
-    b->ql.blink->ql.flink = b->ql.flink;
-    b->ql.flink->ql.blink = b->ql.blink;
+  b->ql.blink->ql.flink = b->ql.flink;
+  b->ql.flink->ql.blink = b->ql.blink;
 }
 
-/* ------------------------------------------------------------------------ */
-
 /*  GET STATS -- check info on free list */
+static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
+  thr_data_t *thr = get_thr_data(th);
+  int bin;
 
-static void
-bcheck(  kmp_info_t *th, bufsize *max_free, bufsize *total_free )
-{
-    thr_data_t *thr = get_thr_data( th );
-    int bin;
-
-    *total_free = *max_free = 0;
+  *total_free = *max_free = 0;
 
-    for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
-        bfhead_t *b, *best;
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *b, *best;
 
-        best = &thr->freelist[ bin ];
-        b = best->ql.flink;
+    best = &thr->freelist[bin];
+    b = best->ql.flink;
 
-        while (b != &thr->freelist[ bin ]) {
-            *total_free += (b->bh.bb.bsize - sizeof( bhead_t ));
-            if ((best == &thr->freelist[ bin ]) || (b->bh.bb.bsize < best->bh.bb.bsize))
-                best = b;
-
-            /* Link to next buffer */
-            b = b->ql.flink;
-        }
+    while (b != &thr->freelist[bin]) {
+      *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
+      if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
+        best = b;
 
-        if (*max_free < best->bh.bb.bsize)
-            *max_free = best->bh.bb.bsize;
+      /* Link to next buffer */
+      b = b->ql.flink;
     }
 
-    if (*max_free > (bufsize)sizeof( bhead_t ))
-        *max_free -= sizeof( bhead_t );
-}
+    if (*max_free < best->bh.bb.bsize)
+      *max_free = best->bh.bb.bsize;
+  }
 
-/* ------------------------------------------------------------------------ */
+  if (*max_free > (bufsize)sizeof(bhead_t))
+    *max_free -= sizeof(bhead_t);
+}
 
 /*  BGET  --  Allocate a buffer.  */
+static void *bget(kmp_info_t *th, bufsize requested_size) {
+  thr_data_t *thr = get_thr_data(th);
+  bufsize size = requested_size;
+  bfhead_t *b;
+  void *buf;
+  int compactseq = 0;
+  int use_blink = 0;
+  /* For BestFit */
+  bfhead_t *best;
 
-static void *
-bget(  kmp_info_t *th, bufsize requested_size )
-{
-    thr_data_t *thr = get_thr_data( th );
-    bufsize size = requested_size;
-    bfhead_t *b;
-    void *buf;
-    int compactseq = 0;
-    int use_blink = 0;
-/* For BestFit */
-    bfhead_t *best;
-
-    if ( size < 0 || size + sizeof( bhead_t ) > MaxSize ) {
-        return NULL;
-    }; // if
-
-    __kmp_bget_dequeue( th );         /* Release any queued buffers */
-
-    if (size < (bufsize)SizeQ) {      /* Need at least room for the */
-        size = SizeQ;                 /*    queue links.  */
-    }
-    #if defined( SizeQuant ) && ( SizeQuant > 1 )
-        size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
-    #endif
-
-    size += sizeof(bhead_t);     /* Add overhead in allocated buffer
-                                         to size required. */
-    KMP_DEBUG_ASSERT( size >= 0 );
-    KMP_DEBUG_ASSERT( size % SizeQuant == 0 );
-
-    use_blink = ( thr->mode == bget_mode_lifo );
-
-    /* If a compact function was provided in the call to bectl(), wrap
-       a loop around the allocation process  to  allow  compaction  to
-       intervene in case we don't find a suitable buffer in the chain. */
-
-    for (;;) {
-        int bin;
-
-        for (bin = bget_get_bin( size ); bin < MAX_BGET_BINS; ++bin) {
-            /* Link to next buffer */
-            b = ( use_blink ? thr->freelist[ bin ].ql.blink : thr->freelist[ bin ].ql.flink );
-
-            if (thr->mode == bget_mode_best) {
-                best = &thr->freelist[ bin ];
-
-                /* Scan the free list searching for the first buffer big enough
-                   to hold the requested size buffer. */
-
-                while (b != &thr->freelist[ bin ]) {
-                    if (b->bh.bb.bsize >= (bufsize) size) {
-                        if ((best == &thr->freelist[ bin ]) || (b->bh.bb.bsize < best->bh.bb.bsize)) {
-                            best = b;
-                        }
-                    }
-
-                    /* Link to next buffer */
-                    b = ( use_blink ? b->ql.blink : b->ql.flink );
-                }
-                b = best;
+  if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
+    return NULL;
+  }; // if
+
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
+
+  if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
+    size = SizeQ;
+  }
+#if defined(SizeQuant) && (SizeQuant > 1)
+  size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
+#endif
+
+  size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
+  KMP_DEBUG_ASSERT(size >= 0);
+  KMP_DEBUG_ASSERT(size % SizeQuant == 0);
+
+  use_blink = (thr->mode == bget_mode_lifo);
+
+  /* If a compact function was provided in the call to bectl(), wrap
+     a loop around the allocation process  to  allow  compaction  to
+     intervene in case we don't find a suitable buffer in the chain. */
+
+  for (;;) {
+    int bin;
+
+    for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
+      /* Link to next buffer */
+      b = (use_blink ? thr->freelist[bin].ql.blink
+                     : thr->freelist[bin].ql.flink);
+
+      if (thr->mode == bget_mode_best) {
+        best = &thr->freelist[bin];
+
+        /* Scan the free list searching for the first buffer big enough
+           to hold the requested size buffer. */
+        while (b != &thr->freelist[bin]) {
+          if (b->bh.bb.bsize >= (bufsize)size) {
+            if ((best == &thr->freelist[bin]) ||
+                (b->bh.bb.bsize < best->bh.bb.bsize)) {
+              best = b;
             }
+          }
 
-            while (b != &thr->freelist[ bin ]) {
-                if ((bufsize) b->bh.bb.bsize >= (bufsize) size) {
+          /* Link to next buffer */
+          b = (use_blink ? b->ql.blink : b->ql.flink);
+        }
+        b = best;
+      }
 
-                    /* Buffer  is big enough to satisfy  the request.  Allocate it
-                       to the caller.  We must decide whether the buffer is  large
-                       enough  to  split  into  the part given to the caller and a
-                       free buffer that remains on the free list, or  whether  the
-                       entire  buffer  should  be  removed  from the free list and
-                       given to the caller in its entirety.   We  only  split  the
-                       buffer if enough room remains for a header plus the minimum
-                       quantum of allocation. */
-
-                    if ((b->bh.bb.bsize - (bufsize) size) > (bufsize)(SizeQ + (sizeof(bhead_t)))) {
-                        bhead_t *ba, *bn;
-
-                        ba = BH(((char *) b) + (b->bh.bb.bsize - (bufsize) size));
-                        bn = BH(((char *) ba) + size);
-
-                        KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
-
-                        /* Subtract size from length of free block. */
-                        b->bh.bb.bsize -= (bufsize) size;
-
-                        /* Link allocated buffer to the previous free buffer. */
-                        ba->bb.prevfree = b->bh.bb.bsize;
-
-                        /* Plug negative size into user buffer. */
-                        ba->bb.bsize = -size;
-
-                        /* Mark this buffer as owned by this thread. */
-                        TCW_PTR(ba->bb.bthr, th);   // not an allocated address (do not mark it)
-                        /* Mark buffer after this one not preceded by free block. */
-                        bn->bb.prevfree = 0;
-
-                        /* unlink the buffer from the old freelist, and reinsert it into the new freelist */
-                        __kmp_bget_remove_from_freelist( b );
-                        __kmp_bget_insert_into_freelist( thr, b );
+      while (b != &thr->freelist[bin]) {
+        if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
+
+          // Buffer is big enough to satisfy the request. Allocate it to the
+          // caller. We must decide whether the buffer is large enough to split
+          // into the part given to the caller and a free buffer that remains
+          // on the free list, or whether the entire buffer should be removed
+          // from the free list and given to the caller in its entirety. We
+          // only split the buffer if enough room remains for a header plus the
+          // minimum quantum of allocation.
+          if ((b->bh.bb.bsize - (bufsize)size) >
+              (bufsize)(SizeQ + (sizeof(bhead_t)))) {
+            bhead_t *ba, *bn;
+
+            ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
+            bn = BH(((char *)ba) + size);
+
+            KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
+
+            /* Subtract size from length of free block. */
+            b->bh.bb.bsize -= (bufsize)size;
+
+            /* Link allocated buffer to the previous free buffer. */
+            ba->bb.prevfree = b->bh.bb.bsize;
+
+            /* Plug negative size into user buffer. */
+            ba->bb.bsize = -size;
+
+            /* Mark this buffer as owned by this thread. */
+            TCW_PTR(ba->bb.bthr,
+                    th); // not an allocated address (do not mark it)
+            /* Mark buffer after this one not preceded by free block. */
+            bn->bb.prevfree = 0;
+
+            // unlink buffer from old freelist, and reinsert into new freelist
+            __kmp_bget_remove_from_freelist(b);
+            __kmp_bget_insert_into_freelist(thr, b);
 #if BufStats
-                        thr->totalloc += (size_t) size;
-                        thr->numget++;        /* Increment number of bget() calls */
+            thr->totalloc += (size_t)size;
+            thr->numget++; /* Increment number of bget() calls */
 #endif
-                        buf = (void *) ((((char *) ba) + sizeof(bhead_t)));
-                        KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
-                        return buf;
-                    } else {
-                        bhead_t *ba;
+            buf = (void *)((((char *)ba) + sizeof(bhead_t)));
+            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+            return buf;
+          } else {
+            bhead_t *ba;
 
-                        ba = BH(((char *) b) + b->bh.bb.bsize);
+            ba = BH(((char *)b) + b->bh.bb.bsize);
 
-                        KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
+            KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
 
-                        /* The buffer isn't big enough to split.  Give  the  whole
-                           shebang to the caller and remove it from the free list. */
+            /* The buffer isn't big enough to split.  Give  the  whole
+               shebang to the caller and remove it from the free list. */
 
-                       __kmp_bget_remove_from_freelist( b );
+            __kmp_bget_remove_from_freelist(b);
 #if BufStats
-                        thr->totalloc += (size_t) b->bh.bb.bsize;
-                        thr->numget++;        /* Increment number of bget() calls */
+            thr->totalloc += (size_t)b->bh.bb.bsize;
+            thr->numget++; /* Increment number of bget() calls */
 #endif
-                        /* Negate size to mark buffer allocated. */
-                        b->bh.bb.bsize = -(b->bh.bb.bsize);
+            /* Negate size to mark buffer allocated. */
+            b->bh.bb.bsize = -(b->bh.bb.bsize);
 
-                        /* Mark this buffer as owned by this thread. */
-                        TCW_PTR(ba->bb.bthr, th);   // not an allocated address (do not mark it)
-                        /* Zero the back pointer in the next buffer in memory
-                           to indicate that this buffer is allocated. */
-                        ba->bb.prevfree = 0;
-
-                        /* Give user buffer starting at queue links. */
-                        buf =  (void *) &(b->ql);
-                        KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
-                        return buf;
-                    }
-                }
+            /* Mark this buffer as owned by this thread. */
+            TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
+            /* Zero the back pointer in the next buffer in memory
+               to indicate that this buffer is allocated. */
+            ba->bb.prevfree = 0;
 
-                /* Link to next buffer */
-                b = ( use_blink ? b->ql.blink : b->ql.flink );
-            }
+            /* Give user buffer starting at queue links. */
+            buf = (void *)&(b->ql);
+            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+            return buf;
+          }
         }
 
-        /* We failed to find a buffer.  If there's a compact  function
-           defined,  notify  it  of the size requested.  If it returns
-           TRUE, try the allocation again. */
-
-        if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
-            break;
-        }
+        /* Link to next buffer */
+        b = (use_blink ? b->ql.blink : b->ql.flink);
+      }
     }
 
-    /* No buffer available with requested size free. */
-
-    /* Don't give up yet -- look in the reserve supply. */
+    /* We failed to find a buffer. If there's a compact function defined,
+       notify it of the size requested. If it returns TRUE, try the allocation
+       again. */
 
-    if (thr->acqfcn != 0) {
-        if (size > (bufsize) (thr->exp_incr - sizeof(bhead_t))) {
+    if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
+      break;
+    }
+  }
 
-            /* Request  is  too  large  to  fit in a single expansion
-               block.  Try to satisy it by a direct buffer acquisition. */
+  /* No buffer available with requested size free. */
 
-            bdhead_t *bdh;
+  /* Don't give up yet -- look in the reserve supply. */
+  if (thr->acqfcn != 0) {
+    if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
+      /* Request is too large to fit in a single expansion block.
+         Try to satisfy it by a direct buffer acquisition. */
+      bdhead_t *bdh;
 
-            size += sizeof(bdhead_t) - sizeof(bhead_t);
+      size += sizeof(bdhead_t) - sizeof(bhead_t);
 
-            KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", (int) size ) );
+      KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
 
-            /* richryan */
-            bdh = BDH((*thr->acqfcn)((bufsize) size));
-            if (bdh != NULL) {
+      /* richryan */
+      bdh = BDH((*thr->acqfcn)((bufsize)size));
+      if (bdh != NULL) {
 
-                /*  Mark the buffer special by setting the size field
-                    of its header to zero.  */
-                bdh->bh.bb.bsize = 0;
+        // Mark the buffer special by setting size field of its header to zero.
+        bdh->bh.bb.bsize = 0;
 
-                /* Mark this buffer as owned by this thread. */
-                TCW_PTR(bdh->bh.bb.bthr, th);  // don't mark buffer as allocated,
-                                               // because direct buffer never goes to free list
-                bdh->bh.bb.prevfree = 0;
-                bdh->tsize = size;
+        /* Mark this buffer as owned by this thread. */
+        TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
+        // because direct buffer never goes to free list
+        bdh->bh.bb.prevfree = 0;
+        bdh->tsize = size;
 #if BufStats
-                thr->totalloc += (size_t) size;
-                thr->numget++;        /* Increment number of bget() calls */
-                thr->numdget++;       /* Direct bget() call count */
-#endif
-                buf =  (void *) (bdh + 1);
-                KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
-                return buf;
-            }
+        thr->totalloc += (size_t)size;
+        thr->numget++; /* Increment number of bget() calls */
+        thr->numdget++; /* Direct bget() call count */
+#endif
+        buf = (void *)(bdh + 1);
+        KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+        return buf;
+      }
 
-        } else {
-
-            /*  Try to obtain a new expansion block */
+    } else {
 
-            void *newpool;
+      /*  Try to obtain a new expansion block */
+      void *newpool;
 
-            KE_TRACE( 10, ("%%%%%% MALLOCB( %d )\n", (int) thr->exp_incr ) );
+      KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
 
-            /* richryan */
-            newpool = (*thr->acqfcn)((bufsize) thr->exp_incr);
-            KMP_DEBUG_ASSERT( ((size_t)newpool) % SizeQuant == 0 );
-            if (newpool != NULL) {
-                bpool( th, newpool, thr->exp_incr);
-                buf =  bget( th, requested_size);  /* This can't, I say, can't get into a loop. */
-                return buf;
-            }
-        }
+      /* richryan */
+      newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
+      KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
+      if (newpool != NULL) {
+        bpool(th, newpool, thr->exp_incr);
+        buf = bget(
+            th, requested_size); /* This can't, I say, can't get into a loop. */
+        return buf;
+      }
     }
+  }
 
-    /*  Still no buffer available */
+  /*  Still no buffer available */
 
-    return NULL;
+  return NULL;
 }
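
One detail of bget() worth spelling out is the split test: a free block is
carved in two only when the remainder can still hold a header plus the minimum
quantum; otherwise the whole block goes to the caller. A small numeric sketch
of that decision, with made-up sizes rather than the runtime's real constants:

// Mirrors the split condition in bget(): split only when what remains after
// satisfying the request can still form a usable free block (a header plus
// the minimum allocation quantum).
static bool demo_should_split(long free_bsize, long need, long size_q,
                              long header) {
  return (free_bsize - need) > (size_q + header);
}
// Example: a 256-byte free block asked for 192 bytes, with a 16-byte header
// and a 16-byte quantum: 256 - 192 = 64 > 32, so the block is split.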
 
 /*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
                the  entire  contents  of  the buffer to zero, not just the
                region requested by the caller. */
 
-static void *
-bgetz(  kmp_info_t *th, bufsize size )
-{
-    char *buf = (char *) bget( th, size);
-
-    if (buf != NULL) {
-        bhead_t *b;
-        bufsize rsize;
-
-        b = BH(buf - sizeof(bhead_t));
-        rsize = -(b->bb.bsize);
-        if (rsize == 0) {
-            bdhead_t *bd;
-
-            bd = BDH(buf - sizeof(bdhead_t));
-            rsize = bd->tsize - (bufsize) sizeof(bdhead_t);
-        } else {
-            rsize -= sizeof(bhead_t);
-        }
+static void *bgetz(kmp_info_t *th, bufsize size) {
+  char *buf = (char *)bget(th, size);
+
+  if (buf != NULL) {
+    bhead_t *b;
+    bufsize rsize;
 
-        KMP_DEBUG_ASSERT(rsize >= size);
+    b = BH(buf - sizeof(bhead_t));
+    rsize = -(b->bb.bsize);
+    if (rsize == 0) {
+      bdhead_t *bd;
 
-        (void) memset(buf, 0, (bufsize) rsize);
+      bd = BDH(buf - sizeof(bdhead_t));
+      rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
+    } else {
+      rsize -= sizeof(bhead_t);
     }
-    return ((void *) buf);
+
+    KMP_DEBUG_ASSERT(rsize >= size);
+
+    (void)memset(buf, 0, (bufsize)rsize);
+  }
+  return ((void *)buf);
 }
 
 /*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
@@ -757,392 +694,372 @@ bgetz(  kmp_info_t *th, bufsize size )
                enhanced to allow the buffer to grow into adjacent free
                blocks and to avoid moving data unnecessarily.  */
 
-static void *
-bgetr(  kmp_info_t *th, void *buf, bufsize size)
-{
-    void *nbuf;
-    bufsize osize;                    /* Old size of buffer */
-    bhead_t *b;
-
-    nbuf = bget( th, size );
-    if ( nbuf == NULL ) { /* Acquire new buffer */
-        return NULL;
-    }
-    if ( buf == NULL ) {
-        return nbuf;
-    }
-    b = BH(((char *) buf) - sizeof(bhead_t));
-    osize = -b->bb.bsize;
-    if (osize == 0) {
-        /*  Buffer acquired directly through acqfcn. */
-        bdhead_t *bd;
-
-        bd = BDH(((char *) buf) - sizeof(bdhead_t));
-        osize = bd->tsize - (bufsize) sizeof(bdhead_t);
-    } else {
-        osize -= sizeof(bhead_t);
-    };
-
-    KMP_DEBUG_ASSERT(osize > 0);
-
-    (void) KMP_MEMCPY((char *) nbuf, (char *) buf, /* Copy the data */
-             (size_t) ((size < osize) ? size : osize));
-    brel( th, buf );
+static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
+  void *nbuf;
+  bufsize osize; /* Old size of buffer */
+  bhead_t *b;
 
+  nbuf = bget(th, size);
+  if (nbuf == NULL) { /* Acquire new buffer */
+    return NULL;
+  }
+  if (buf == NULL) {
     return nbuf;
+  }
+  b = BH(((char *)buf) - sizeof(bhead_t));
+  osize = -b->bb.bsize;
+  if (osize == 0) {
+    /*  Buffer acquired directly through acqfcn. */
+    bdhead_t *bd;
+
+    bd = BDH(((char *)buf) - sizeof(bdhead_t));
+    osize = bd->tsize - (bufsize)sizeof(bdhead_t);
+  } else {
+    osize -= sizeof(bhead_t);
+  };
+
+  KMP_DEBUG_ASSERT(osize > 0);
+
+  (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
+                   (size_t)((size < osize) ? size : osize));
+  brel(th, buf);
+
+  return nbuf;
 }
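
bgetr() above follows the minimal reallocation pattern: acquire a new buffer,
copy the smaller of the old and new sizes, release the old buffer, and leave
the old buffer untouched when the new allocation fails. A generic sketch of
that pattern with malloc/free standing in for bget/brel (illustration only):

#include <cstdlib>
#include <cstring>

// Allocate-copy-free reallocation, mirroring bgetr(); no attempt is made to
// grow the buffer in place. Returns NULL (and keeps old_buf valid) when the
// new allocation fails.
static void *demo_realloc(void *old_buf, std::size_t old_size,
                          std::size_t new_size) {
  void *new_buf = std::malloc(new_size);
  if (new_buf == NULL)
    return NULL;
  if (old_buf != NULL) {
    std::memcpy(new_buf, old_buf, old_size < new_size ? old_size : new_size);
    std::free(old_buf);
  }
  return new_buf;
}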
 
 /*  BREL  --  Release a buffer.  */
+static void brel(kmp_info_t *th, void *buf) {
+  thr_data_t *thr = get_thr_data(th);
+  bfhead_t *b, *bn;
+  kmp_info_t *bth;
 
-static void
-brel(  kmp_info_t *th, void *buf )
-{
-    thr_data_t *thr = get_thr_data( th );
-    bfhead_t *b, *bn;
-    kmp_info_t *bth;
-
-    KMP_DEBUG_ASSERT(buf != NULL);
-    KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
+  KMP_DEBUG_ASSERT(buf != NULL);
+  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
 
-    b = BFH(((char *) buf) - sizeof(bhead_t));
+  b = BFH(((char *)buf) - sizeof(bhead_t));
 
-    if (b->bh.bb.bsize == 0) {        /* Directly-acquired buffer? */
-        bdhead_t *bdh;
+  if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
+    bdhead_t *bdh;
 
-        bdh = BDH(((char *) buf) - sizeof(bdhead_t));
-        KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+    bdh = BDH(((char *)buf) - sizeof(bdhead_t));
+    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
 #if BufStats
-        thr->totalloc -= (size_t) bdh->tsize;
-        thr->numdrel++;               /* Number of direct releases */
-        thr->numrel++;                /* Increment number of brel() calls */
+    thr->totalloc -= (size_t)bdh->tsize;
+    thr->numdrel++; /* Number of direct releases */
+    thr->numrel++; /* Increment number of brel() calls */
 #endif /* BufStats */
 #ifdef FreeWipe
-        (void) memset((char *) buf, 0x55,
-                 (size_t) (bdh->tsize - sizeof(bdhead_t)));
+    (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
 #endif /* FreeWipe */
 
-        KE_TRACE( 10, ("%%%%%% FREE( %p )\n", (void *) bdh ) );
+    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
 
-        KMP_DEBUG_ASSERT( thr->relfcn != 0 );
-        (*thr->relfcn)((void *) bdh);      /* Release it directly. */
-        return;
-    }
-
-    bth = (kmp_info_t *)( (kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1 ); // clear possible mark before comparison
-    if ( bth != th ) {
-        /* Add this buffer to be released by the owning thread later */
-        __kmp_bget_enqueue( bth, buf
+    KMP_DEBUG_ASSERT(thr->relfcn != 0);
+    (*thr->relfcn)((void *)bdh); /* Release it directly. */
+    return;
+  }
+
+  bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
+                       ~1); // clear possible mark before comparison
+  if (bth != th) {
+    /* Add this buffer to be released by the owning thread later */
+    __kmp_bget_enqueue(bth, buf
 #ifdef USE_QUEUING_LOCK_FOR_BGET
-                            , __kmp_gtid_from_thread( th )
+                       ,
+                       __kmp_gtid_from_thread(th)
 #endif
-        );
-        return;
-    }
+                           );
+    return;
+  }
+
+  /* Buffer size must be negative, indicating that the buffer is allocated. */
+  if (b->bh.bb.bsize >= 0) {
+    bn = NULL;
+  }
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
 
-    /* Buffer size must be negative, indicating that the buffer is
-       allocated. */
-
-    if (b->bh.bb.bsize >= 0) {
-        bn = NULL;
-    }
-    KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
+  /*  Back pointer in next buffer must be zero, indicating the same thing: */
 
-    /*  Back pointer in next buffer must be zero, indicating the
-        same thing: */
-
-    KMP_DEBUG_ASSERT(BH((char *) b - b->bh.bb.bsize)->bb.prevfree == 0);
+  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
 
 #if BufStats
-    thr->numrel++;                    /* Increment number of brel() calls */
-    thr->totalloc += (size_t) b->bh.bb.bsize;
+  thr->numrel++; /* Increment number of brel() calls */
+  thr->totalloc += (size_t)b->bh.bb.bsize;
 #endif
 
-    /* If the back link is nonzero, the previous buffer is free.  */
-
-    if (b->bh.bb.prevfree != 0) {
-        /* The previous buffer is free.  Consolidate this buffer  with  it
-           by  adding  the  length  of  this  buffer  to the previous free
-           buffer.  Note that we subtract the size  in  the  buffer  being
-           released,  since  it's  negative to indicate that the buffer is
-           allocated. */
-
-        register bufsize size = b->bh.bb.bsize;
-
-        /* Make the previous buffer the one we're working on. */
-        KMP_DEBUG_ASSERT(BH((char *) b - b->bh.bb.prevfree)->bb.bsize == b->bh.bb.prevfree);
-        b = BFH(((char *) b) - b->bh.bb.prevfree);
-        b->bh.bb.bsize -= size;
-
-        /* unlink the buffer from the old freelist */
-        __kmp_bget_remove_from_freelist( b );
-    }
-    else {
-        /* The previous buffer isn't allocated.  Mark this buffer
-           size as positive (i.e. free) and fall through to place
-           the buffer on the free list as an isolated free block. */
-
-        b->bh.bb.bsize = -b->bh.bb.bsize;
-    }
-
-    /* insert buffer back onto a new freelist */
-    __kmp_bget_insert_into_freelist( thr, b );
-
+  /* If the back link is nonzero, the previous buffer is free.  */
 
-    /* Now we look at the next buffer in memory, located by advancing from
-       the  start  of  this  buffer  by its size, to see if that buffer is
-       free.  If it is, we combine  this  buffer  with  the  next  one  in
-       memory, dechaining the second buffer from the free list. */
-
-    bn =  BFH(((char *) b) + b->bh.bb.bsize);
-    if (bn->bh.bb.bsize > 0) {
-
-        /* The buffer is free.  Remove it from the free list and add
-           its size to that of our buffer. */
-
-        KMP_DEBUG_ASSERT(BH((char *) bn + bn->bh.bb.bsize)->bb.prevfree == bn->bh.bb.bsize);
-
-        __kmp_bget_remove_from_freelist( bn );
-
-        b->bh.bb.bsize += bn->bh.bb.bsize;
-
-        /* unlink the buffer from the old freelist, and reinsert it into the new freelist */
-
-        __kmp_bget_remove_from_freelist( b );
-        __kmp_bget_insert_into_freelist( thr, b );
-
-        /* Finally,  advance  to   the  buffer  that   follows  the  newly
-           consolidated free block.  We must set its  backpointer  to  the
-           head  of  the  consolidated free block.  We know the next block
-           must be an allocated block because the process of recombination
-           guarantees  that  two  free  blocks will never be contiguous in
-           memory.  */
-
-        bn = BFH(((char *) b) + b->bh.bb.bsize);
-    }
+  if (b->bh.bb.prevfree != 0) {
+    /* The previous buffer is free. Consolidate this buffer with it by adding
+       the length of this buffer to the previous free buffer. Note that we
+       subtract the size in the buffer being released, since it's negative to
+       indicate that the buffer is allocated. */
+    register bufsize size = b->bh.bb.bsize;
+
+    /* Make the previous buffer the one we're working on. */
+    KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
+                     b->bh.bb.prevfree);
+    b = BFH(((char *)b) - b->bh.bb.prevfree);
+    b->bh.bb.bsize -= size;
+
+    /* unlink the buffer from the old freelist */
+    __kmp_bget_remove_from_freelist(b);
+  } else {
+    /* The previous buffer is allocated. Mark this buffer size as positive
+       (i.e. free) and fall through to place the buffer on the free list as an
+       isolated free block. */
+    b->bh.bb.bsize = -b->bh.bb.bsize;
+  }
+
+  /* insert buffer back onto a new freelist */
+  __kmp_bget_insert_into_freelist(thr, b);
+
+  /* Now we look at the next buffer in memory, located by advancing from
+     the  start  of  this  buffer  by its size, to see if that buffer is
+     free.  If it is, we combine  this  buffer  with  the  next  one  in
+     memory, dechaining the second buffer from the free list. */
+  bn = BFH(((char *)b) + b->bh.bb.bsize);
+  if (bn->bh.bb.bsize > 0) {
+
+    /* The buffer is free.  Remove it from the free list and add
+       its size to that of our buffer. */
+    KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
+                     bn->bh.bb.bsize);
+
+    __kmp_bget_remove_from_freelist(bn);
+
+    b->bh.bb.bsize += bn->bh.bb.bsize;
+
+    /* unlink the buffer from the old freelist, and reinsert it into the new
+     * freelist */
+    __kmp_bget_remove_from_freelist(b);
+    __kmp_bget_insert_into_freelist(thr, b);
+
+    /* Finally,  advance  to   the  buffer  that   follows  the  newly
+       consolidated free block.  We must set its  backpointer  to  the
+       head  of  the  consolidated free block.  We know the next block
+       must be an allocated block because the process of recombination
+       guarantees  that  two  free  blocks will never be contiguous in
+       memory.  */
+    bn = BFH(((char *)b) + b->bh.bb.bsize);
+  }
 #ifdef FreeWipe
-    (void) memset(((char *) b) + sizeof(bfhead_t), 0x55,
-            (size_t) (b->bh.bb.bsize - sizeof(bfhead_t)));
+  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
+               (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
 #endif
-    KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
-
-    /* The next buffer is allocated.  Set the backpointer in it  to  point
-       to this buffer; the previous free buffer in memory. */
-
-    bn->bh.bb.prevfree = b->bh.bb.bsize;
+  KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
 
-    /*  If  a  block-release function is defined, and this free buffer
-        constitutes the entire block, release it.  Note that  pool_len
-        is  defined  in  such a way that the test will fail unless all
-        pool blocks are the same size.  */
+  /* The next buffer is allocated.  Set the backpointer in it  to  point
+     to this buffer, the previous free buffer in memory. */
 
-    if (thr->relfcn != 0 &&
-        b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t)))
-    {
+  bn->bh.bb.prevfree = b->bh.bb.bsize;
+
+  /*  If  a  block-release function is defined, and this free buffer
+      constitutes the entire block, release it.  Note that  pool_len
+      is  defined  in  such a way that the test will fail unless all
+      pool blocks are the same size.  */
+  if (thr->relfcn != 0 &&
+      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
 #if BufStats
-        if (thr->numpblk != 1) {        /* Do not release the last buffer until finalization time */
+    if (thr->numpblk !=
+        1) { /* Do not release the last buffer until finalization time */
 #endif
 
-            KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
-            KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.bsize == ESent);
-            KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.prevfree == b->bh.bb.bsize);
+      KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
+      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
+                       b->bh.bb.bsize);
 
-            /*  Unlink the buffer from the free list  */
-            __kmp_bget_remove_from_freelist( b );
+      /*  Unlink the buffer from the free list  */
+      __kmp_bget_remove_from_freelist(b);
 
-            KE_TRACE( 10, ("%%%%%% FREE( %p )\n", (void *) b ) );
+      KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
 
-            (*thr->relfcn)(b);
+      (*thr->relfcn)(b);
 #if BufStats
-            thr->numprel++;               /* Nr of expansion block releases */
-            thr->numpblk--;               /* Total number of blocks */
-            KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
-
-            /* avoid leaving stale last_pool pointer around if it is being dealloced */
-            if (thr->last_pool == b) thr->last_pool = 0;
-        }
-        else {
-            thr->last_pool = b;
-        }
-#endif /* BufStats */
+      thr->numprel++; /* Nr of expansion block releases */
+      thr->numpblk--; /* Total number of blocks */
+      KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+
+      // avoid leaving stale last_pool pointer around if it is being dealloced
+      if (thr->last_pool == b)
+        thr->last_pool = 0;
+    } else {
+      thr->last_pool = b;
     }
+#endif /* BufStats */
+  }
 }
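
To summarize the coalescing in brel() above: a released buffer first absorbs a
free predecessor, located through prevfree, then absorbs a free successor,
located by stepping forward bsize bytes, so two free blocks are never left
adjacent in memory. A compact sketch of just that size bookkeeping, with plain
integers standing in for the block headers (illustration only):

// prev_free: length of a free block immediately before the released one, or
// 0 when the preceding block is allocated (the meaning of prevfree above).
// next_bsize: bsize of the following block, positive only when it is free.
static long demo_coalesced_size(long released, long prev_free,
                                long next_bsize) {
  long merged = released; // the released block's own length
  if (prev_free > 0)
    merged += prev_free; // absorb the free predecessor
  if (next_bsize > 0)
    merged += next_bsize; // absorb the free successor
  return merged; // length of the single resulting free block
}
// Example: demo_coalesced_size(64, 128, 0) == 192, merged with the free block
// that precedes it in memory.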
 
 /*  BECTL  --  Establish automatic pool expansion control  */
-
-static void
-bectl(  kmp_info_t *th, bget_compact_t compact, bget_acquire_t acquire, bget_release_t release, bufsize pool_incr)
-{
-    thr_data_t *thr = get_thr_data( th );
-
-    thr->compfcn = compact;
-    thr->acqfcn = acquire;
-    thr->relfcn = release;
-    thr->exp_incr = pool_incr;
+static void bectl(kmp_info_t *th, bget_compact_t compact,
+                  bget_acquire_t acquire, bget_release_t release,
+                  bufsize pool_incr) {
+  thr_data_t *thr = get_thr_data(th);
+
+  thr->compfcn = compact;
+  thr->acqfcn = acquire;
+  thr->relfcn = release;
+  thr->exp_incr = pool_incr;
 }
 
 /*  BPOOL  --  Add a region of memory to the buffer pool.  */
+static void bpool(kmp_info_t *th, void *buf, bufsize len) {
+  /*    int bin = 0; */
+  thr_data_t *thr = get_thr_data(th);
+  bfhead_t *b = BFH(buf);
+  bhead_t *bn;
 
-static void
-bpool(  kmp_info_t *th, void *buf, bufsize len)
-{
-/*    int bin = 0; */
-    thr_data_t *thr = get_thr_data( th );
-    bfhead_t *b = BFH(buf);
-    bhead_t *bn;
-
-    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
 
 #ifdef SizeQuant
-    len &= ~(SizeQuant - 1);
+  len &= ~(SizeQuant - 1);
 #endif
-    if (thr->pool_len == 0) {
-        thr->pool_len = len;
-    } else if (len != thr->pool_len) {
-        thr->pool_len = -1;
-    }
+  if (thr->pool_len == 0) {
+    thr->pool_len = len;
+  } else if (len != thr->pool_len) {
+    thr->pool_len = -1;
+  }
 #if BufStats
-    thr->numpget++;                   /* Number of block acquisitions */
-    thr->numpblk++;                   /* Number of blocks total */
-    KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+  thr->numpget++; /* Number of block acquisitions */
+  thr->numpblk++; /* Number of blocks total */
+  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
 #endif /* BufStats */
 
-    /* Since the block is initially occupied by a single free  buffer,
-       it  had  better  not  be  (much) larger than the largest buffer
-       whose size we can store in bhead.bb.bsize. */
-
-    KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize) ESent + 1));
-
-    /* Clear  the  backpointer at  the start of the block to indicate that
-       there  is  no  free  block  prior  to  this   one.    That   blocks
-       recombination when the first block in memory is released. */
-
-    b->bh.bb.prevfree = 0;
-
-    /* Create a dummy allocated buffer at the end of the pool.  This dummy
-       buffer is seen when a buffer at the end of the pool is released and
-       blocks  recombination  of  the last buffer with the dummy buffer at
-       the end.  The length in the dummy buffer  is  set  to  the  largest
-       negative  number  to  denote  the  end  of  the pool for diagnostic
-       routines (this specific value is  not  counted  on  by  the  actual
-       allocation and release functions). */
-
-    len -= sizeof(bhead_t);
-    b->bh.bb.bsize = (bufsize) len;
-    /* Set the owner of this buffer */
-    TCW_PTR( b->bh.bb.bthr, (kmp_info_t*)((kmp_uintptr_t)th | 1) ); // mark the buffer as allocated address
+  /* Since the block is initially occupied by a single free  buffer,
+     it  had  better  not  be  (much) larger than the largest buffer
+     whose size we can store in bhead.bb.bsize. */
+  KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
+
+  /* Clear  the  backpointer at  the start of the block to indicate that
+     there  is  no  free  block  prior  to  this   one.    That   blocks
+     recombination when the first block in memory is released. */
+  b->bh.bb.prevfree = 0;
+
+  /* Create a dummy allocated buffer at the end of the pool.  This dummy
+     buffer is seen when a buffer at the end of the pool is released and
+     blocks  recombination  of  the last buffer with the dummy buffer at
+     the end.  The length in the dummy buffer  is  set  to  the  largest
+     negative  number  to  denote  the  end  of  the pool for diagnostic
+     routines (this specific value is  not  counted  on  by  the  actual
+     allocation and release functions). */
+  len -= sizeof(bhead_t);
+  b->bh.bb.bsize = (bufsize)len;
+  /* Set the owner of this buffer */
+  TCW_PTR(b->bh.bb.bthr,
+          (kmp_info_t *)((kmp_uintptr_t)th |
+                         1)); // mark the buffer as allocated address
 
-    /* Chain the new block to the free list. */
-    __kmp_bget_insert_into_freelist( thr, b );
+  /* Chain the new block to the free list. */
+  __kmp_bget_insert_into_freelist(thr, b);
 
 #ifdef FreeWipe
-    (void) memset(((char *) b) + sizeof(bfhead_t), 0x55,
-             (size_t) (len - sizeof(bfhead_t)));
+  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
+               (size_t)(len - sizeof(bfhead_t)));
 #endif
-    bn = BH(((char *) b) + len);
-    bn->bb.prevfree = (bufsize) len;
-    /* Definition of ESent assumes two's complement! */
-    KMP_DEBUG_ASSERT( (~0) == -1 && (bn != 0) );
+  bn = BH(((char *)b) + len);
+  bn->bb.prevfree = (bufsize)len;
+  /* Definition of ESent assumes two's complement! */
+  KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
 
-    bn->bb.bsize = ESent;
+  bn->bb.bsize = ESent;
 }
 
-/* ------------------------------------------------------------------------ */
-
 /*  BFREED  --  Dump the free lists for this thread. */
-
-static void
-bfreed(  kmp_info_t *th )
-{
-    int bin = 0, count = 0;
-    int gtid = __kmp_gtid_from_thread( th );
-    thr_data_t *thr = get_thr_data( th );
+static void bfreed(kmp_info_t *th) {
+  int bin = 0, count = 0;
+  int gtid = __kmp_gtid_from_thread(th);
+  thr_data_t *thr = get_thr_data(th);
 
 #if BufStats
-    __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC " get=%" KMP_INT64_SPEC " rel=%" \
-           KMP_INT64_SPEC " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC " prel=%" KMP_INT64_SPEC \
-           " dget=%" KMP_INT64_SPEC " drel=%" KMP_INT64_SPEC "\n",
-           gtid, (kmp_uint64) thr->totalloc,
-           (kmp_int64) thr->numget,  (kmp_int64) thr->numrel,
-           (kmp_int64) thr->numpblk,
-           (kmp_int64) thr->numpget, (kmp_int64) thr->numprel,
-           (kmp_int64) thr->numdget, (kmp_int64) thr->numdrel );
+  __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
+                       " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
+                       " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
+                       " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
+                       " drel=%" KMP_INT64_SPEC "\n",
+                       gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
+                       (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
+                       (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
+                       (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
 #endif
 
-    for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
-        bfhead_t *b;
-
-        for (b = thr->freelist[ bin ].ql.flink; b != &thr->freelist[ bin ]; b = b->ql.flink) {
-            bufsize bs = b->bh.bb.bsize;
-
-            KMP_DEBUG_ASSERT( b->ql.blink->ql.flink == b );
-            KMP_DEBUG_ASSERT( b->ql.flink->ql.blink == b );
-            KMP_DEBUG_ASSERT( bs > 0 );
-
-            count += 1;
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *b;
 
-            __kmp_printf_no_lock("__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b, (long) bs );
+    for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
+         b = b->ql.flink) {
+      bufsize bs = b->bh.bb.bsize;
+
+      KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
+      KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
+      KMP_DEBUG_ASSERT(bs > 0);
+
+      count += 1;
+
+      __kmp_printf_no_lock(
+          "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
+          (long)bs);
 #ifdef FreeWipe
-            {
-                char *lerr = ((char *) b) + sizeof(bfhead_t);
-                if ((bs > sizeof(bfhead_t)) && ((*lerr != 0x55) || (memcmp(lerr, lerr + 1, (size_t) (bs - (sizeof(bfhead_t) + 1))) != 0))) {
-                    __kmp_printf_no_lock( "__kmp_printpool: T#%d     (Contents of above free block have been overstored.)\n", gtid );
-                }
-            }
-#endif
+      {
+        char *lerr = ((char *)b) + sizeof(bfhead_t);
+        if ((bs > sizeof(bfhead_t)) &&
+            ((*lerr != 0x55) ||
+             (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
+              0))) {
+          __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
+                               "free block have been overstored.)\n",
+                               gtid);
         }
+      }
+#endif
     }
+  }
 
-    if (count == 0)
-        __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid );
+  if (count == 0)
+    __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
 }
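The FreeWipe check above uses a compact idiom: a region is filled with a single byte value exactly when its first byte has that value and the region compares equal to itself shifted by one byte. A minimal standalone sketch of that idiom (the helper name is ours, not part of the runtime):

  #include <string.h>

  /* Nonzero if all n bytes at p equal the byte value v. Mirrors the FreeWipe
     test: check the first byte, then compare the region against itself
     shifted by one byte. */
  static int region_filled_with(const char *p, size_t n, char v) {
    if (n == 0)
      return 1;
    return p[0] == v && (n == 1 || memcmp(p, p + 1, n - 1) == 0);
  }

bfreed() applies the same test to the payload of each free block, which the allocator fills with 0x55 when FreeWipe is enabled, and reports any block whose fill pattern has been disturbed.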
 
-/* ------------------------------------------------------------------------ */
-
 #ifdef KMP_DEBUG
 
 #if BufStats
 
 /*  BSTATS  --  Return buffer allocation free space statistics.  */
-
-static void
-bstats(  kmp_info_t *th, bufsize *curalloc,  bufsize *totfree,  bufsize *maxfree, long *nget, long *nrel)
-{
-    int bin = 0;
-    thr_data_t *thr = get_thr_data( th );
-
-    *nget = thr->numget;
-    *nrel = thr->numrel;
-    *curalloc = (bufsize) thr->totalloc;
-    *totfree = 0;
-    *maxfree = -1;
-
-    for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
-        bfhead_t *b = thr->freelist[ bin ].ql.flink;
-
-        while (b != &thr->freelist[ bin ]) {
-            KMP_DEBUG_ASSERT(b->bh.bb.bsize > 0);
-            *totfree += b->bh.bb.bsize;
-            if (b->bh.bb.bsize > *maxfree) {
-                *maxfree = b->bh.bb.bsize;
-            }
-            b = b->ql.flink;              /* Link to next buffer */
-        }
+static void bstats(kmp_info_t *th, bufsize *curalloc, bufsize *totfree,
+                   bufsize *maxfree, long *nget, long *nrel) {
+  int bin = 0;
+  thr_data_t *thr = get_thr_data(th);
+
+  *nget = thr->numget;
+  *nrel = thr->numrel;
+  *curalloc = (bufsize)thr->totalloc;
+  *totfree = 0;
+  *maxfree = -1;
+
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *b = thr->freelist[bin].ql.flink;
+
+    while (b != &thr->freelist[bin]) {
+      KMP_DEBUG_ASSERT(b->bh.bb.bsize > 0);
+      *totfree += b->bh.bb.bsize;
+      if (b->bh.bb.bsize > *maxfree) {
+        *maxfree = b->bh.bb.bsize;
+      }
+      b = b->ql.flink; /* Link to next buffer */
     }
+  }
 }
 
 /*  BSTATSE  --  Return extended statistics  */
-
-static void
-bstatse(  kmp_info_t *th, bufsize *pool_incr, long *npool, long *npget, long *nprel, long *ndget, long *ndrel)
-{
-    thr_data_t *thr = get_thr_data( th );
-
-    *pool_incr = (thr->pool_len < 0) ? -thr->exp_incr : thr->exp_incr;
-    *npool = thr->numpblk;
-    *npget = thr->numpget;
-    *nprel = thr->numprel;
-    *ndget = thr->numdget;
-    *ndrel = thr->numdrel;
+static void bstatse(kmp_info_t *th, bufsize *pool_incr, long *npool,
+                    long *npget, long *nprel, long *ndget, long *ndrel) {
+  thr_data_t *thr = get_thr_data(th);
+
+  *pool_incr = (thr->pool_len < 0) ? -thr->exp_incr : thr->exp_incr;
+  *npool = thr->numpblk;
+  *npget = thr->numpget;
+  *nprel = thr->numprel;
+  *ndget = thr->numdget;
+  *ndrel = thr->numdrel;
 }
 
 #endif /* BufStats */
@@ -1150,59 +1067,56 @@ bstatse(  kmp_info_t *th, bufsize *pool_
 /*  BUFDUMP  --  Dump the data in a buffer.  This is called with the  user
                  data pointer, and backs up to the buffer header.  It will
                  dump either a free block or an allocated one.  */
-
-static void
-bufdump(  kmp_info_t *th, void *buf )
-{
-    bfhead_t *b;
-    unsigned char *bdump;
-    bufsize bdlen;
-
-    b = BFH(((char *) buf) - sizeof(bhead_t));
-    KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
-    if (b->bh.bb.bsize < 0) {
-        bdump = (unsigned char *) buf;
-        bdlen = (-b->bh.bb.bsize) - (bufsize) sizeof(bhead_t);
-    } else {
-        bdump = (unsigned char *) (((char *) b) + sizeof(bfhead_t));
-        bdlen = b->bh.bb.bsize - (bufsize) sizeof(bfhead_t);
-    }
-
-    while (bdlen > 0) {
-        int i, dupes = 0;
-        bufsize l = bdlen;
-        char bhex[50], bascii[20];
-
-        if (l > 16) {
-            l = 16;
-        }
-
-        for (i = 0; i < l; i++) {
-            (void) KMP_SNPRINTF(bhex + i * 3, sizeof(bhex) - i * 3, "%02X ", bdump[i]);
-            if (bdump[i] > 0x20 && bdump[i] < 0x7F)
-                bascii[ i ] = bdump[ i ];
-            else
-                bascii[ i ] = ' ';
-        }
-        bascii[i] = 0;
-        (void) __kmp_printf_no_lock("%-48s   %s\n", bhex, bascii);
-        bdump += l;
-        bdlen -= l;
-        while ((bdlen > 16) && (memcmp((char *) (bdump - 16),
-                                       (char *) bdump, 16) == 0)) {
-            dupes++;
-            bdump += 16;
-            bdlen -= 16;
-        }
-        if (dupes > 1) {
-            (void) __kmp_printf_no_lock(
-                "     (%d lines [%d bytes] identical to above line skipped)\n",
-                dupes, dupes * 16);
-        } else if (dupes == 1) {
-            bdump -= 16;
-            bdlen += 16;
-        }
+static void bufdump(kmp_info_t *th, void *buf) {
+  bfhead_t *b;
+  unsigned char *bdump;
+  bufsize bdlen;
+
+  b = BFH(((char *)buf) - sizeof(bhead_t));
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+  if (b->bh.bb.bsize < 0) {
+    bdump = (unsigned char *)buf;
+    bdlen = (-b->bh.bb.bsize) - (bufsize)sizeof(bhead_t);
+  } else {
+    bdump = (unsigned char *)(((char *)b) + sizeof(bfhead_t));
+    bdlen = b->bh.bb.bsize - (bufsize)sizeof(bfhead_t);
+  }
+
+  while (bdlen > 0) {
+    int i, dupes = 0;
+    bufsize l = bdlen;
+    char bhex[50], bascii[20];
+
+    if (l > 16) {
+      l = 16;
+    }
+
+    for (i = 0; i < l; i++) {
+      (void)KMP_SNPRINTF(bhex + i * 3, sizeof(bhex) - i * 3, "%02X ", bdump[i]);
+      if (bdump[i] > 0x20 && bdump[i] < 0x7F)
+        bascii[i] = bdump[i];
+      else
+        bascii[i] = ' ';
+    }
+    bascii[i] = 0;
+    (void)__kmp_printf_no_lock("%-48s   %s\n", bhex, bascii);
+    bdump += l;
+    bdlen -= l;
+    while ((bdlen > 16) &&
+           (memcmp((char *)(bdump - 16), (char *)bdump, 16) == 0)) {
+      dupes++;
+      bdump += 16;
+      bdlen -= 16;
+    }
+    if (dupes > 1) {
+      (void)__kmp_printf_no_lock(
+          "     (%d lines [%d bytes] identical to above line skipped)\n", dupes,
+          dupes * 16);
+    } else if (dupes == 1) {
+      bdump -= 16;
+      bdlen += 16;
     }
+  }
 }
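bufdump() above prints sixteen bytes per line as hex plus printable ASCII and collapses runs of identical lines. A self-contained sketch of the same dumping technique, using stdio instead of the runtime's locked printf (names are illustrative):

  #include <stdio.h>
  #include <string.h>

  /* Dump len bytes at buf, 16 per line, skipping runs of identical lines. */
  static void hexdump(const unsigned char *buf, size_t len) {
    while (len > 0) {
      char hex[50], ascii[20];
      size_t i, l = (len > 16) ? 16 : len;
      int dupes = 0;
      for (i = 0; i < l; i++) {
        snprintf(hex + i * 3, sizeof(hex) - i * 3, "%02X ", buf[i]);
        ascii[i] = (buf[i] > 0x20 && buf[i] < 0x7F) ? (char)buf[i] : ' ';
      }
      ascii[i] = 0;
      printf("%-48s   %s\n", hex, ascii);
      buf += l;
      len -= l;
      /* Collapse lines identical to the one just printed. */
      while (len > 16 && memcmp(buf - 16, buf, 16) == 0) {
        dupes++;
        buf += 16;
        len -= 16;
      }
      if (dupes > 1)
        printf("     (%d lines [%d bytes] identical to above line skipped)\n",
               dupes, dupes * 16);
      else if (dupes == 1) { /* a single duplicate is cheaper to just print */
        buf -= 16;
        len += 16;
      }
    }
  }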
 
 /*  BPOOLD  --  Dump a buffer pool.  The buffer headers are always listed.
@@ -1210,611 +1124,519 @@ bufdump(  kmp_info_t *th, void *buf )
                 are  dumped.   If  DUMPFREE  is  nonzero,  free blocks are
                 dumped as well.  If FreeWipe  checking  is  enabled,  free
                 blocks  which  have  been clobbered will always be dumped. */
+static void bpoold(kmp_info_t *th, void *buf, int dumpalloc, int dumpfree) {
+  bfhead_t *b = BFH((char *)buf - sizeof(bhead_t));
 
-static void
-bpoold(  kmp_info_t *th, void *buf, int dumpalloc, int dumpfree)
-{
-    bfhead_t *b = BFH( (char*)buf - sizeof(bhead_t));
-
-    while (b->bh.bb.bsize != ESent) {
-        bufsize bs = b->bh.bb.bsize;
-
-        if (bs < 0) {
-            bs = -bs;
-            (void) __kmp_printf_no_lock("Allocated buffer: size %6ld bytes.\n", (long) bs);
-            if (dumpalloc) {
-                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
-            }
-        } else {
-            const char *lerr = "";
+  while (b->bh.bb.bsize != ESent) {
+    bufsize bs = b->bh.bb.bsize;
 
-            KMP_DEBUG_ASSERT(bs > 0);
-            if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
-                lerr = "  (Bad free list links)";
-            }
-            (void) __kmp_printf_no_lock("Free block:       size %6ld bytes.%s\n",
-                (long) bs, lerr);
+    if (bs < 0) {
+      bs = -bs;
+      (void)__kmp_printf_no_lock("Allocated buffer: size %6ld bytes.\n",
+                                 (long)bs);
+      if (dumpalloc) {
+        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
+      }
+    } else {
+      const char *lerr = "";
+
+      KMP_DEBUG_ASSERT(bs > 0);
+      if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
+        lerr = "  (Bad free list links)";
+      }
+      (void)__kmp_printf_no_lock("Free block:       size %6ld bytes.%s\n",
+                                 (long)bs, lerr);
 #ifdef FreeWipe
-            lerr = ((char *) b) + sizeof(bfhead_t);
-            if ((bs > sizeof(bfhead_t)) && ((*lerr != 0x55) ||
-                (memcmp(lerr, lerr + 1,
-                  (size_t) (bs - (sizeof(bfhead_t) + 1))) != 0))) {
-                (void) __kmp_printf_no_lock(
-                    "(Contents of above free block have been overstored.)\n");
-                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
-            } else
-#endif
-            if (dumpfree) {
-                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
-            }
-        }
-        b = BFH(((char *) b) + bs);
+      lerr = ((char *)b) + sizeof(bfhead_t);
+      if ((bs > sizeof(bfhead_t)) &&
+          ((*lerr != 0x55) ||
+           (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
+            0))) {
+        (void)__kmp_printf_no_lock(
+            "(Contents of above free block have been overstored.)\n");
+        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
+      } else
+#endif
+          if (dumpfree) {
+        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
+      }
     }
+    b = BFH(((char *)b) + bs);
+  }
 }
 
 /*  BPOOLV  --  Validate a buffer pool. */
+static int bpoolv(kmp_info_t *th, void *buf) {
+  bfhead_t *b = BFH(buf);
+
+  while (b->bh.bb.bsize != ESent) {
+    bufsize bs = b->bh.bb.bsize;
 
-static int
-bpoolv(  kmp_info_t *th, void *buf )
-{
-    bfhead_t *b = BFH(buf);
-
-    while (b->bh.bb.bsize != ESent) {
-        bufsize bs = b->bh.bb.bsize;
-
-        if (bs < 0) {
-            bs = -bs;
-        } else {
+    if (bs < 0) {
+      bs = -bs;
+    } else {
 #ifdef FreeWipe
-            char *lerr = "";
+      char *lerr = "";
 #endif
 
-            KMP_DEBUG_ASSERT(bs > 0);
-            if (bs <= 0) {
-                return 0;
-            }
-            if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
-                (void) __kmp_printf_no_lock("Free block: size %6ld bytes.  (Bad free list links)\n",
-                     (long) bs);
-                KMP_DEBUG_ASSERT(0);
-                return 0;
-            }
+      KMP_DEBUG_ASSERT(bs > 0);
+      if (bs <= 0) {
+        return 0;
+      }
+      if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
+        (void)__kmp_printf_no_lock(
+            "Free block: size %6ld bytes.  (Bad free list links)\n", (long)bs);
+        KMP_DEBUG_ASSERT(0);
+        return 0;
+      }
 #ifdef FreeWipe
-            lerr = ((char *) b) + sizeof(bfhead_t);
-            if ((bs > sizeof(bfhead_t)) && ((*lerr != 0x55) ||
-                (memcmp(lerr, lerr + 1,
-                  (size_t) (bs - (sizeof(bfhead_t) + 1))) != 0))) {
-                (void) __kmp_printf_no_lock(
-                    "(Contents of above free block have been overstored.)\n");
-                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
-                KMP_DEBUG_ASSERT(0);
-                return 0;
-            }
+      lerr = ((char *)b) + sizeof(bfhead_t);
+      if ((bs > sizeof(bfhead_t)) &&
+          ((*lerr != 0x55) ||
+           (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
+            0))) {
+        (void)__kmp_printf_no_lock(
+            "(Contents of above free block have been overstored.)\n");
+        bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
+        KMP_DEBUG_ASSERT(0);
+        return 0;
+      }
 #endif /* FreeWipe */
-        }
-        b = BFH(((char *) b) + bs);
     }
-    return 1;
+    b = BFH(((char *)b) + bs);
+  }
+  return 1;
 }
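bpoold() and bpoolv() walk a pool as an implicit list: each header stores a size, a negative size marks an allocated block, and a size equal to the ESent sentinel ends the pool. A reduced sketch of that traversal (the type and sentinel value below are placeholders, not the runtime's):

  #include <stdio.h>

  typedef long bufsize_t;              /* stand-in for bget's bufsize */
  #define POOL_END ((bufsize_t)-12345) /* stand-in for ESent          */

  struct block_header { bufsize_t bsize; };

  /* Visit every block of a pool laid out back to back in memory. */
  static void walk_pool(struct block_header *b) {
    while (b->bsize != POOL_END) {
      bufsize_t bs = b->bsize;
      if (bs < 0) {                    /* negative size => allocated  */
        bs = -bs;
        printf("Allocated buffer: size %6ld bytes.\n", (long)bs);
      } else {
        printf("Free block:       size %6ld bytes.\n", (long)bs);
      }
      b = (struct block_header *)((char *)b + bs); /* next header */
    }
  }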
 
 #endif /* KMP_DEBUG */
 
-/* ------------------------------------------------------------------------ */
-
-void
-__kmp_initialize_bget( kmp_info_t *th )
-{
-    KMP_DEBUG_ASSERT( SizeQuant >= sizeof( void * ) && (th != 0) );
+void __kmp_initialize_bget(kmp_info_t *th) {
+  KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
 
-    set_thr_data( th );
+  set_thr_data(th);
 
-    bectl( th, (bget_compact_t) 0, (bget_acquire_t) malloc, (bget_release_t) free,
-           (bufsize) __kmp_malloc_pool_incr );
+  bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
+        (bufsize)__kmp_malloc_pool_incr);
 }
 
-void
-__kmp_finalize_bget( kmp_info_t *th )
-{
-    thr_data_t *thr;
-    bfhead_t *b;
+void __kmp_finalize_bget(kmp_info_t *th) {
+  thr_data_t *thr;
+  bfhead_t *b;
 
-    KMP_DEBUG_ASSERT( th != 0 );
+  KMP_DEBUG_ASSERT(th != 0);
 
 #if BufStats
-    thr = (thr_data_t *) th->th.th_local.bget_data;
-    KMP_DEBUG_ASSERT( thr != NULL );
-    b = thr->last_pool;
-
-    /*  If  a  block-release function is defined, and this free buffer
-        constitutes the entire block, release it.  Note that  pool_len
-        is  defined  in  such a way that the test will fail unless all
-        pool blocks are the same size.  */
-
-    /* Deallocate the last pool if one exists because we no longer do it in brel() */
-    if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
-        b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t)))
-    {
-        KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
-        KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.bsize == ESent);
-        KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.prevfree == b->bh.bb.bsize);
-
-        /*  Unlink the buffer from the free list  */
-        __kmp_bget_remove_from_freelist( b );
-
-        KE_TRACE( 10, ("%%%%%% FREE( %p )\n", (void *) b ) );
-
-        (*thr->relfcn)(b);
-        thr->numprel++;               /* Nr of expansion block releases */
-        thr->numpblk--;               /* Total number of blocks */
-        KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
-    }
+  thr = (thr_data_t *)th->th.th_local.bget_data;
+  KMP_DEBUG_ASSERT(thr != NULL);
+  b = thr->last_pool;
+
+  /*  If a block-release function is defined, and this free buffer constitutes
+      the entire block, release it. Note that pool_len is defined in such a way
+      that the test will fail unless all pool blocks are the same size.  */
+
+  // Deallocate the last pool if one exists because we no longer do it in brel()
+  if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
+      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
+    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
+    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
+                     b->bh.bb.bsize);
+
+    /*  Unlink the buffer from the free list  */
+    __kmp_bget_remove_from_freelist(b);
+
+    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
+
+    (*thr->relfcn)(b);
+    thr->numprel++; /* Nr of expansion block releases */
+    thr->numpblk--; /* Total number of blocks */
+    KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+  }
 #endif /* BufStats */
 
-    /* Deallocate bget_data */
-    if ( th->th.th_local.bget_data != NULL ) {
-        __kmp_free( th->th.th_local.bget_data );
-        th->th.th_local.bget_data = NULL;
-    }; // if
+  /* Deallocate bget_data */
+  if (th->th.th_local.bget_data != NULL) {
+    __kmp_free(th->th.th_local.bget_data);
+    th->th.th_local.bget_data = NULL;
+  }; // if
 }
 
-void
-kmpc_set_poolsize( size_t size )
-{
-    bectl( __kmp_get_thread(), (bget_compact_t) 0, (bget_acquire_t) malloc,
-           (bget_release_t) free, (bufsize) size );
+void kmpc_set_poolsize(size_t size) {
+  bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
+        (bget_release_t)free, (bufsize)size);
 }
 
-size_t
-kmpc_get_poolsize( void )
-{
-    thr_data_t *p;
+size_t kmpc_get_poolsize(void) {
+  thr_data_t *p;
 
-    p = get_thr_data( __kmp_get_thread() );
+  p = get_thr_data(__kmp_get_thread());
 
-    return p->exp_incr;
+  return p->exp_incr;
 }
 
-void
-kmpc_set_poolmode( int mode )
-{
-    thr_data_t *p;
+void kmpc_set_poolmode(int mode) {
+  thr_data_t *p;
 
-    if (mode == bget_mode_fifo || mode == bget_mode_lifo || mode == bget_mode_best) {
-        p = get_thr_data( __kmp_get_thread() );
-        p->mode = (bget_mode_t) mode;
-    }
+  if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
+      mode == bget_mode_best) {
+    p = get_thr_data(__kmp_get_thread());
+    p->mode = (bget_mode_t)mode;
+  }
 }
 
-int
-kmpc_get_poolmode( void )
-{
-    thr_data_t *p;
+int kmpc_get_poolmode(void) {
+  thr_data_t *p;
 
-    p = get_thr_data( __kmp_get_thread() );
+  p = get_thr_data(__kmp_get_thread());
 
-    return p->mode;
+  return p->mode;
 }
 
-void
-kmpc_get_poolstat( size_t *maxmem, size_t *allmem )
-{
-    kmp_info_t *th = __kmp_get_thread();
-    bufsize a, b;
+void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
+  kmp_info_t *th = __kmp_get_thread();
+  bufsize a, b;
 
-    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
 
-    bcheck( th, &a, &b );
+  bcheck(th, &a, &b);
 
-    *maxmem = a;
-    *allmem = b;
+  *maxmem = a;
+  *allmem = b;
 }
 
-void
-kmpc_poolprint( void )
-{
-    kmp_info_t *th = __kmp_get_thread();
+void kmpc_poolprint(void) {
+  kmp_info_t *th = __kmp_get_thread();
 
-    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
 
-    bfreed( th );
+  bfreed(th);
 }
 
 #endif // #if KMP_USE_BGET
 
-/* ------------------------------------------------------------------------ */
-
-void *
-kmpc_malloc( size_t size )
-{
-    void * ptr;
-    ptr = bget( __kmp_entry_thread(), (bufsize)(size + sizeof(ptr)) );
-    if( ptr != NULL ) {
-        // save allocated pointer just before one returned to user
-        *(void**)ptr = ptr;
-        ptr = (void**)ptr + 1;
-    }
-    return ptr;
-}
-
-#define IS_POWER_OF_TWO(n) (((n)&((n)-1))==0)
-
-void *
-kmpc_aligned_malloc( size_t size, size_t alignment )
-{
-    void * ptr;
-    void * ptr_allocated;
-    KMP_DEBUG_ASSERT( alignment < 32 * 1024 ); // Alignment should not be too big
-    if( !IS_POWER_OF_TWO(alignment) ) {
-        // AC: do we need to issue a warning here?
-        errno = EINVAL;
-        return NULL;
-    }
-    size = size + sizeof( void* ) + alignment;
-    ptr_allocated = bget( __kmp_entry_thread(), (bufsize)size );
-    if( ptr_allocated != NULL ) {
-        // save allocated pointer just before one returned to user
-        ptr = (void*)(((kmp_uintptr_t)ptr_allocated + sizeof( void* ) + alignment) & ~(alignment - 1));
-        *((void**)ptr - 1) = ptr_allocated;
-    } else {
-        ptr = NULL;
-    }
-    return ptr;
-}
-
-void *
-kmpc_calloc( size_t nelem, size_t elsize )
-{
-    void * ptr;
-    ptr = bgetz( __kmp_entry_thread(), (bufsize) (nelem * elsize + sizeof(ptr)) );
-    if( ptr != NULL ) {
-        // save allocated pointer just before one returned to user
-        *(void**)ptr = ptr;
-        ptr = (void**)ptr + 1;
-    }
-    return ptr;
-}
-
-void *
-kmpc_realloc( void * ptr, size_t size )
-{
-    void * result = NULL;
-    if ( ptr == NULL ) {
-        // If pointer is NULL, realloc behaves like malloc.
-        result = bget( __kmp_entry_thread(), (bufsize)(size + sizeof(ptr)) );
-        // save allocated pointer just before one returned to user
-        if( result != NULL ) {
-            *(void**)result = result;
-            result = (void**)result + 1;
-        }
-    } else if ( size == 0 ) {
-        // If size is 0, realloc behaves like free.
-        // The thread must be registered by the call to kmpc_malloc() or kmpc_calloc() before.
-        // So it should be safe to call __kmp_get_thread(), not __kmp_entry_thread().
-        KMP_ASSERT(*((void**)ptr - 1));
-        brel( __kmp_get_thread(), *((void**)ptr - 1) );
-    } else {
-        result = bgetr( __kmp_entry_thread(), *((void**)ptr - 1), (bufsize)(size + sizeof(ptr)) );
-        if( result != NULL ) {
-            *(void**)result = result;
-            result = (void**)result + 1;
-        }
-    }; // if
-    return result;
-}
-
-/* NOTE: the library must have already been initialized by a previous allocate */
-
-void
-kmpc_free( void * ptr )
-{
-    if ( ! __kmp_init_serial ) {
-        return;
-    }; // if
-    if ( ptr != NULL ) {
-        kmp_info_t *th = __kmp_get_thread();
-        __kmp_bget_dequeue( th );         /* Release any queued buffers */
-        // extract allocated pointer and free it
-        KMP_ASSERT(*((void**)ptr - 1));
-        brel( th, *((void**)ptr - 1) );
-    };
-}
-
-
-/* ------------------------------------------------------------------------ */
-
-void *
-___kmp_thread_malloc( kmp_info_t *th, size_t size KMP_SRC_LOC_DECL )
-{
-    void * ptr;
-    KE_TRACE( 30, (
-        "-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n",
-        th,
-        (int) size
-        KMP_SRC_LOC_PARM
-    ) );
-    ptr = bget( th, (bufsize) size );
-    KE_TRACE( 30, ( "<- __kmp_thread_malloc() returns %p\n", ptr ) );
-    return ptr;
-}
-
-void *
-___kmp_thread_calloc( kmp_info_t *th, size_t nelem, size_t elsize KMP_SRC_LOC_DECL )
-{
-    void * ptr;
-    KE_TRACE( 30, (
-        "-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n",
-        th,
-        (int) nelem,
-        (int) elsize
-        KMP_SRC_LOC_PARM
-    ) );
-    ptr = bgetz( th, (bufsize) (nelem * elsize) );
-    KE_TRACE( 30, ( "<- __kmp_thread_calloc() returns %p\n", ptr ) );
-    return ptr;
-}
-
-void *
-___kmp_thread_realloc( kmp_info_t *th, void *ptr, size_t size KMP_SRC_LOC_DECL )
-{
-    KE_TRACE( 30, (
-        "-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n",
-        th,
-        ptr,
-        (int) size
-        KMP_SRC_LOC_PARM
-    ) );
-    ptr = bgetr( th, ptr, (bufsize) size );
-    KE_TRACE( 30, ( "<- __kmp_thread_realloc() returns %p\n", ptr ) );
-    return ptr;
-}
-
-void
-___kmp_thread_free( kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL )
-{
-    KE_TRACE( 30, (
-        "-> __kmp_thread_free( %p, %p ) called from %s:%d\n",
-        th,
-        ptr
-        KMP_SRC_LOC_PARM
-    ) );
-    if ( ptr != NULL ) {
-        __kmp_bget_dequeue( th );         /* Release any queued buffers */
-        brel( th, ptr );
-    }
-    KE_TRACE( 30, ( "<- __kmp_thread_free()\n" ) );
-}
-
-/* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
-/*
-    If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes memory leaks, but it
-    may be useful for debugging memory corruptions, used freed pointers, etc.
-*/
+void *kmpc_malloc(size_t size) {
+  void *ptr;
+  ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
+  if (ptr != NULL) {
+    // save allocated pointer just before one returned to user
+    *(void **)ptr = ptr;
+    ptr = (void **)ptr + 1;
+  }
+  return ptr;
+}
+
+#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
+
+void *kmpc_aligned_malloc(size_t size, size_t alignment) {
+  void *ptr;
+  void *ptr_allocated;
+  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
+  if (!IS_POWER_OF_TWO(alignment)) {
+    // AC: do we need to issue a warning here?
+    errno = EINVAL;
+    return NULL;
+  }
+  size = size + sizeof(void *) + alignment;
+  ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
+  if (ptr_allocated != NULL) {
+    // save allocated pointer just before one returned to user
+    ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
+                   ~(alignment - 1));
+    *((void **)ptr - 1) = ptr_allocated;
+  } else {
+    ptr = NULL;
+  }
+  return ptr;
+}
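kmpc_malloc() and kmpc_aligned_malloc() above share one trick: over-allocate, hand the caller a pointer inside the block, and stash the pointer that was actually allocated in the word just before it so the free path can recover it. A standalone sketch of the aligned variant on top of plain malloc() (function names are illustrative):

  #include <errno.h>
  #include <stdint.h>
  #include <stdlib.h>

  /* Allocate size bytes aligned to 'alignment' (a nonzero power of two),
     keeping the raw malloc() pointer just before the returned pointer. */
  static void *aligned_alloc_stashed(size_t size, size_t alignment) {
    void *raw, *user;
    if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
      errno = EINVAL;            /* alignment must be a power of two */
      return NULL;
    }
    raw = malloc(size + sizeof(void *) + alignment);
    if (raw == NULL)
      return NULL;
    user = (void *)(((uintptr_t)raw + sizeof(void *) + alignment) &
                    ~(uintptr_t)(alignment - 1));
    ((void **)user)[-1] = raw;   /* stash the raw pointer for free() */
    return user;
  }

  static void aligned_free_stashed(void *user) {
    if (user != NULL)
      free(((void **)user)[-1]); /* recover the raw pointer */
  }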
+
+void *kmpc_calloc(size_t nelem, size_t elsize) {
+  void *ptr;
+  ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
+  if (ptr != NULL) {
+    // save allocated pointer just before one returned to user
+    *(void **)ptr = ptr;
+    ptr = (void **)ptr + 1;
+  }
+  return ptr;
+}
+
+void *kmpc_realloc(void *ptr, size_t size) {
+  void *result = NULL;
+  if (ptr == NULL) {
+    // If pointer is NULL, realloc behaves like malloc.
+    result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
+    // save allocated pointer just before one returned to user
+    if (result != NULL) {
+      *(void **)result = result;
+      result = (void **)result + 1;
+    }
+  } else if (size == 0) {
+    // If size is 0, realloc behaves like free.
+    // The thread must have been registered by an earlier call to
+    // kmpc_malloc() or kmpc_calloc(), so it should be safe to call
+    // __kmp_get_thread() rather than __kmp_entry_thread().
+    KMP_ASSERT(*((void **)ptr - 1));
+    brel(__kmp_get_thread(), *((void **)ptr - 1));
+  } else {
+    result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
+                   (bufsize)(size + sizeof(ptr)));
+    if (result != NULL) {
+      *(void **)result = result;
+      result = (void **)result + 1;
+    }
+  }; // if
+  return result;
+}
+
+// NOTE: the library must have already been initialized by a previous allocate
+void kmpc_free(void *ptr) {
+  if (!__kmp_init_serial) {
+    return;
+  }; // if
+  if (ptr != NULL) {
+    kmp_info_t *th = __kmp_get_thread();
+    __kmp_bget_dequeue(th); /* Release any queued buffers */
+    // extract allocated pointer and free it
+    KMP_ASSERT(*((void **)ptr - 1));
+    brel(th, *((void **)ptr - 1));
+  };
+}
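A hedged usage sketch of the entry points above; the prototypes are declared locally for illustration, assuming the program is linked against this runtime:

  #include <stddef.h>
  #include <string.h>

  extern void *kmpc_malloc(size_t size);
  extern void *kmpc_realloc(void *ptr, size_t size);
  extern void kmpc_free(void *ptr);

  static void demo(void) {
    char *p = (char *)kmpc_malloc(64);        /* registers this thread  */
    if (p != NULL) {
      memset(p, 1, 64);
      char *q = (char *)kmpc_realloc(p, 128); /* NULL ptr acts like malloc */
      if (q != NULL)
        p = q;
      kmpc_free(p);                /* realloc with size 0 acts like free */
    }
  }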
+
+void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
+  void *ptr;
+  KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
+                (int)size KMP_SRC_LOC_PARM));
+  ptr = bget(th, (bufsize)size);
+  KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
+  return ptr;
+}
+
+void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
+                           size_t elsize KMP_SRC_LOC_DECL) {
+  void *ptr;
+  KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
+                (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
+  ptr = bgetz(th, (bufsize)(nelem * elsize));
+  KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
+  return ptr;
+}
+
+void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
+                            size_t size KMP_SRC_LOC_DECL) {
+  KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
+                ptr, (int)size KMP_SRC_LOC_PARM));
+  ptr = bgetr(th, ptr, (bufsize)size);
+  KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
+  return ptr;
+}
+
+void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
+  KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
+                ptr KMP_SRC_LOC_PARM));
+  if (ptr != NULL) {
+    __kmp_bget_dequeue(th); /* Release any queued buffers */
+    brel(th, ptr);
+  }
+  KE_TRACE(30, ("<- __kmp_thread_free()\n"));
+}
+
+/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
+   memory leaks, but it may be useful for debugging memory corruptions, used
+   freed pointers, etc. */
 /* #define LEAK_MEMORY */
-
-struct kmp_mem_descr {      // Memory block descriptor.
-    void * ptr_allocated;   // Pointer returned by malloc(), subject for free().
-    size_t size_allocated;  // Size of allocated memory block.
-    void * ptr_aligned;     // Pointer to aligned memory, to be used by client code.
-    size_t size_aligned;    // Size of aligned memory block.
+struct kmp_mem_descr { // Memory block descriptor.
+  void *ptr_allocated; // Pointer returned by malloc(), subject for free().
+  size_t size_allocated; // Size of allocated memory block.
+  void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
+  size_t size_aligned; // Size of aligned memory block.
 };
 typedef struct kmp_mem_descr kmp_mem_descr_t;
 
-/*
-    Allocate memory on requested boundary, fill allocated memory with 0x00.
-    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation error.
-    Must use __kmp_free when freeing memory allocated by this routine!
- */
-static
-void *
-___kmp_allocate_align( size_t size, size_t alignment KMP_SRC_LOC_DECL )
-{
-    /*
-            __kmp_allocate() allocates (by call to malloc()) bigger memory block than requested to
-        return properly aligned pointer. Original pointer returned by malloc() and size of allocated
-        block is saved in descriptor just before the aligned pointer. This information used by
-        __kmp_free() -- it has to pass to free() original pointer, not aligned one.
-
-            +---------+------------+-----------------------------------+---------+
-            | padding | descriptor |           aligned block           | padding |
-            +---------+------------+-----------------------------------+---------+
-            ^                      ^
-            |                      |
-            |                      +- Aligned pointer returned to caller
-            +- Pointer returned by malloc()
-
-        Aligned block is filled with zeros, paddings are filled with 0xEF.
-    */
-
-    kmp_mem_descr_t  descr;
-    kmp_uintptr_t    addr_allocated;        // Address returned by malloc().
-    kmp_uintptr_t    addr_aligned;          // Aligned address to return to caller.
-    kmp_uintptr_t    addr_descr;            // Address of memory block descriptor.
-
-    KE_TRACE( 25, (
-        "-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
-        (int) size,
-        (int) alignment
-        KMP_SRC_LOC_PARM
-    ) );
-
-    KMP_DEBUG_ASSERT( alignment < 32 * 1024 ); // Alignment should not be too
-    KMP_DEBUG_ASSERT( sizeof( void * ) <= sizeof( kmp_uintptr_t ) );
-        // Make sure kmp_uintptr_t is enough to store addresses.
-
-    descr.size_aligned = size;
-    descr.size_allocated = descr.size_aligned + sizeof( kmp_mem_descr_t ) + alignment;
+/* Allocate memory on requested boundary, fill allocated memory with 0x00.
+   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
+   error. Must use __kmp_free when freeing memory allocated by this routine! */
+static void *___kmp_allocate_align(size_t size,
+                                   size_t alignment KMP_SRC_LOC_DECL) {
+  /* __kmp_allocate() allocates (by call to malloc()) bigger memory block than
+     requested to return properly aligned pointer. Original pointer returned
+     by malloc() and size of allocated block is saved in descriptor just
+     before the aligned pointer. This information used by __kmp_free() -- it
+     has to pass to free() original pointer, not aligned one.
+
+          +---------+------------+-----------------------------------+---------+
+          | padding | descriptor |           aligned block           | padding |
+          +---------+------------+-----------------------------------+---------+
+          ^                      ^
+          |                      |
+          |                      +- Aligned pointer returned to caller
+          +- Pointer returned by malloc()
+
+      Aligned block is filled with zeros, paddings are filled with 0xEF. */
+
+  kmp_mem_descr_t descr;
+  kmp_uintptr_t addr_allocated; // Address returned by malloc().
+  kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
+  kmp_uintptr_t addr_descr; // Address of memory block descriptor.
+
+  KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
+                (int)size, (int)alignment KMP_SRC_LOC_PARM));
+
+  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
+  KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
+  // Make sure kmp_uintptr_t is enough to store addresses.
+
+  descr.size_aligned = size;
+  descr.size_allocated =
+      descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
 
 #if KMP_DEBUG
-    descr.ptr_allocated = _malloc_src_loc( descr.size_allocated, _file_, _line_ );
+  descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
 #else
-    descr.ptr_allocated = malloc_src_loc( descr.size_allocated KMP_SRC_LOC_PARM );
+  descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
 #endif
-    KE_TRACE( 10, (
-        "   malloc( %d ) returned %p\n",
-        (int) descr.size_allocated,
-        descr.ptr_allocated
-    ) );
-    if ( descr.ptr_allocated == NULL ) {
-        KMP_FATAL( OutOfHeapMemory );
-    };
-
-    addr_allocated = (kmp_uintptr_t) descr.ptr_allocated;
-    addr_aligned =
-        ( addr_allocated + sizeof( kmp_mem_descr_t ) + alignment )
-        & ~ ( alignment - 1 );
-    addr_descr = addr_aligned - sizeof( kmp_mem_descr_t );
-
-    descr.ptr_aligned = (void *) addr_aligned;
-
-    KE_TRACE( 26, (
-        "   ___kmp_allocate_align: "
-            "ptr_allocated=%p, size_allocated=%d, "
-            "ptr_aligned=%p, size_aligned=%d\n",
-        descr.ptr_allocated,
-        (int) descr.size_allocated,
-        descr.ptr_aligned,
-        (int) descr.size_aligned
-    ) );
-
-    KMP_DEBUG_ASSERT( addr_allocated <= addr_descr );
-    KMP_DEBUG_ASSERT( addr_descr + sizeof( kmp_mem_descr_t ) == addr_aligned );
-    KMP_DEBUG_ASSERT( addr_aligned + descr.size_aligned <= addr_allocated + descr.size_allocated );
-    KMP_DEBUG_ASSERT( addr_aligned % alignment == 0 );
+  KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
+                descr.ptr_allocated));
+  if (descr.ptr_allocated == NULL) {
+    KMP_FATAL(OutOfHeapMemory);
+  };
+
+  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
+  addr_aligned =
+      (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
+  addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
+
+  descr.ptr_aligned = (void *)addr_aligned;
+
+  KE_TRACE(26, ("   ___kmp_allocate_align: "
+                "ptr_allocated=%p, size_allocated=%d, "
+                "ptr_aligned=%p, size_aligned=%d\n",
+                descr.ptr_allocated, (int)descr.size_allocated,
+                descr.ptr_aligned, (int)descr.size_aligned));
+
+  KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
+  KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
+  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
+                   addr_allocated + descr.size_allocated);
+  KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
 #ifdef KMP_DEBUG
-    memset( descr.ptr_allocated, 0xEF, descr.size_allocated );
-        // Fill allocated memory block with 0xEF.
+  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
+  // Fill the allocated memory block with 0xEF.
 #endif
-    memset( descr.ptr_aligned, 0x00, descr.size_aligned );
-        // Fill the aligned memory block (which is intended for using by caller) with 0x00. Do not
-        // put this filling under KMP_DEBUG condition! Many callers expect zeroed memory. (Padding
-        // bytes remain filled with 0xEF in debugging library.)
-    * ( (kmp_mem_descr_t *) addr_descr ) = descr;
+  memset(descr.ptr_aligned, 0x00, descr.size_aligned);
+  // Fill the aligned memory block (which is intended for use by the caller)
+  // with 0x00. Do not put this fill under a KMP_DEBUG condition! Many callers
+  // expect zeroed memory. (Padding bytes remain filled with 0xEF in the
+  // debugging library.)
+  *((kmp_mem_descr_t *)addr_descr) = descr;
 
-    KMP_MB();
+  KMP_MB();
 
-    KE_TRACE( 25, ( "<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned ) );
-    return descr.ptr_aligned;
+  KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
+  return descr.ptr_aligned;
 } // func ___kmp_allocate_align
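For concreteness, suppose (purely for illustration) that the descriptor occupies 32 bytes, the requested alignment is 64, and malloc() returns 0x1008. Then

  addr_aligned = (0x1008 + 32 + 64) & ~(64 - 1) = 0x1040
  addr_descr   = 0x1040 - 32                    = 0x1020

so the descriptor lies at 0x1020, inside the raw allocation, and the block handed back to the caller starts at the 64-byte-aligned address 0x1040; the assertions above check exactly these containment and alignment properties.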
 
-
-/*
-    Allocate memory on cache line boundary, fill allocated memory with 0x00.
-    Do not call this func directly! Use __kmp_allocate macro instead.
-    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation error.
-    Must use __kmp_free when freeing memory allocated by this routine!
- */
-void *
-___kmp_allocate( size_t size KMP_SRC_LOC_DECL )
-{
-    void * ptr;
-    KE_TRACE( 25, ( "-> __kmp_allocate( %d ) called from %s:%d\n", (int) size KMP_SRC_LOC_PARM ) );
-    ptr = ___kmp_allocate_align( size, __kmp_align_alloc KMP_SRC_LOC_PARM );
-    KE_TRACE( 25, ( "<- __kmp_allocate() returns %p\n", ptr ) );
-    return ptr;
+/* Allocate memory on cache line boundary, fill allocated memory with 0x00.
+   Do not call this func directly! Use __kmp_allocate macro instead.
+   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
+   error. Must use __kmp_free when freeing memory allocated by this routine! */
+void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
+  void *ptr;
+  KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
+                (int)size KMP_SRC_LOC_PARM));
+  ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
+  KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
+  return ptr;
 } // func ___kmp_allocate
 
-#if (BUILD_MEMORY==FIRST_TOUCH)
-void *
-__kmp_ft_page_allocate(size_t size)
-{
+#if (BUILD_MEMORY == FIRST_TOUCH)
+void *__kmp_ft_page_allocate(size_t size) {
   void *adr, *aadr;
 
   const int page_size = KMP_GET_PAGE_SIZE();
 
-  adr = (void *) __kmp_thread_malloc( __kmp_get_thread(),
+  adr = (void *)__kmp_thread_malloc(__kmp_get_thread(),
                                     size + page_size + KMP_PTR_SKIP);
-  if ( adr == 0 )
-    KMP_FATAL( OutOfHeapMemory );
+  if (adr == 0)
+    KMP_FATAL(OutOfHeapMemory);
 
   /* check to see if adr is on a page boundary. */
-  if ( ( (kmp_uintptr_t) adr & (page_size - 1)) == 0)
+  if (((kmp_uintptr_t)adr & (page_size - 1)) == 0)
     /* nothing to do if adr is already on a page boundary. */
     aadr = adr;
   else
     /* else set aadr to the first page boundary in the allocated memory. */
-    aadr = (void *) ( ( (kmp_uintptr_t) adr + page_size) & ~(page_size - 1) );
+    aadr = (void *)(((kmp_uintptr_t)adr + page_size) & ~(page_size - 1));
 
   /* the first touch by the owner thread. */
-  *((void**)aadr) = adr;
+  *((void **)aadr) = adr;
 
   /* skip the memory space used for storing adr above. */
-  return (void*)((char*)aadr + KMP_PTR_SKIP);
+  return (void *)((char *)aadr + KMP_PTR_SKIP);
 }
 #endif
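__kmp_ft_page_allocate() relies on the usual first-touch convention: a page is physically placed on the NUMA node of the thread that first writes to it, which is why the owner thread writes into the block (the store of adr at aadr) before the pointer is handed out. A hedged sketch of the same idea on top of plain malloc(), assuming a first-touch placement policy and an illustrative page size:

  #include <stdlib.h>

  #define PAGE_SIZE 4096 /* illustrative; the runtime queries the real value */

  /* Allocate and touch every page from the calling thread so that, under a
     first-touch policy, the pages land on this thread's NUMA node. */
  static void *alloc_first_touch(size_t size) {
    char *p = (char *)malloc(size);
    size_t off;
    if (p == NULL)
      return NULL;
    for (off = 0; off < size; off += PAGE_SIZE)
      p[off] = 0;          /* the write is the "first touch" */
    return p;
  }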
 
-/*
-    Allocate memory on page boundary, fill allocated memory with 0x00.
-    Does not call this func directly! Use __kmp_page_allocate macro instead.
-    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation error.
-    Must use __kmp_free when freeing memory allocated by this routine!
- */
-void *
-___kmp_page_allocate( size_t size KMP_SRC_LOC_DECL )
-{
-    int    page_size = 8 * 1024;
-    void * ptr;
-
-    KE_TRACE( 25, (
-        "-> __kmp_page_allocate( %d ) called from %s:%d\n",
-        (int) size
-        KMP_SRC_LOC_PARM
-    ) );
-    ptr = ___kmp_allocate_align( size, page_size KMP_SRC_LOC_PARM );
-    KE_TRACE( 25, ( "<- __kmp_page_allocate( %d ) returns %p\n", (int) size, ptr ) );
-    return ptr;
+/* Allocate memory on page boundary, fill allocated memory with 0x00.
+   Do not call this func directly! Use __kmp_page_allocate macro instead.
+   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
+   error. Must use __kmp_free when freeing memory allocated by this routine! */
+void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
+  int page_size = 8 * 1024;
+  void *ptr;
+
+  KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
+                (int)size KMP_SRC_LOC_PARM));
+  ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
+  KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
+  return ptr;
 } // ___kmp_page_allocate
 
-/*
-    Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
-    In debug mode, fill the memory block with 0xEF before call to free().
-*/
-void
-___kmp_free( void * ptr KMP_SRC_LOC_DECL )
-{
-    kmp_mem_descr_t descr;
-    kmp_uintptr_t   addr_allocated;        // Address returned by malloc().
-    kmp_uintptr_t   addr_aligned;          // Aligned address passed by caller.
-
-    KE_TRACE( 25, ( "-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM ) );
-    KMP_ASSERT( ptr != NULL );
-
-    descr = * ( kmp_mem_descr_t *) ( (kmp_uintptr_t) ptr - sizeof( kmp_mem_descr_t ) );
-
-    KE_TRACE( 26, ( "   __kmp_free:     "
-                    "ptr_allocated=%p, size_allocated=%d, "
-                    "ptr_aligned=%p, size_aligned=%d\n",
-                    descr.ptr_allocated, (int) descr.size_allocated,
-                    descr.ptr_aligned, (int) descr.size_aligned ));
-
-    addr_allocated = (kmp_uintptr_t) descr.ptr_allocated;
-    addr_aligned   = (kmp_uintptr_t) descr.ptr_aligned;
-
-    KMP_DEBUG_ASSERT( addr_aligned % CACHE_LINE == 0 );
-    KMP_DEBUG_ASSERT( descr.ptr_aligned == ptr );
-    KMP_DEBUG_ASSERT( addr_allocated + sizeof( kmp_mem_descr_t ) <= addr_aligned );
-    KMP_DEBUG_ASSERT( descr.size_aligned < descr.size_allocated );
-    KMP_DEBUG_ASSERT( addr_aligned + descr.size_aligned <= addr_allocated + descr.size_allocated );
-
-    #ifdef KMP_DEBUG
-        memset( descr.ptr_allocated, 0xEF, descr.size_allocated );
-            // Fill memory block with 0xEF, it helps catch using freed memory.
-    #endif
-
-    #ifndef LEAK_MEMORY
-        KE_TRACE( 10, ( "   free( %p )\n", descr.ptr_allocated ) );
-    # ifdef KMP_DEBUG
-        _free_src_loc( descr.ptr_allocated, _file_, _line_ );
-    # else
-        free_src_loc( descr.ptr_allocated KMP_SRC_LOC_PARM );
-    # endif
-    #endif
-    KMP_MB();
-    KE_TRACE( 25, ( "<- __kmp_free() returns\n" ) );
-} // func ___kmp_free
+/* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
+   In debug mode, fill the memory block with 0xEF before call to free(). */
+void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
+  kmp_mem_descr_t descr;
+  kmp_uintptr_t addr_allocated; // Address returned by malloc().
+  kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
+
+  KE_TRACE(25,
+           ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
+  KMP_ASSERT(ptr != NULL);
+
+  descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
+
+  KE_TRACE(26, ("   __kmp_free:     "
+                "ptr_allocated=%p, size_allocated=%d, "
+                "ptr_aligned=%p, size_aligned=%d\n",
+                descr.ptr_allocated, (int)descr.size_allocated,
+                descr.ptr_aligned, (int)descr.size_aligned));
+
+  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
+  addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
+
+  KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
+  KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
+  KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
+  KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
+  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
+                   addr_allocated + descr.size_allocated);
 
-/* ------------------------------------------------------------------------ */
-/* ------------------------------------------------------------------------ */
+#ifdef KMP_DEBUG
+  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
+  // Fill the memory block with 0xEF; this helps catch use of freed memory.
+#endif
+
+#ifndef LEAK_MEMORY
+  KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
+#ifdef KMP_DEBUG
+  _free_src_loc(descr.ptr_allocated, _file_, _line_);
+#else
+  free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
+#endif
+#endif
+  KMP_MB();
+  KE_TRACE(25, ("<- __kmp_free() returns\n"));
+} // func ___kmp_free
 
 #if USE_FAST_MEMORY == 3
 // Allocate fast memory by first scanning the thread's free lists
@@ -1825,254 +1647,257 @@ ___kmp_free( void * ptr KMP_SRC_LOC_DECL
 #define KMP_FREE_LIST_LIMIT 16
 
 // Always use 128 bytes for determining buckets for caching memory blocks
-#define DCACHE_LINE  128
+#define DCACHE_LINE 128
 
-void *
-___kmp_fast_allocate( kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL )
-{
-    void            * ptr;
-    int               num_lines;
-    int               idx;
-    int               index;
-    void            * alloc_ptr;
-    size_t            alloc_size;
-    kmp_mem_descr_t * descr;
-
-    KE_TRACE( 25, ( "-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
-      __kmp_gtid_from_thread(this_thr), (int) size KMP_SRC_LOC_PARM ) );
-
-    num_lines = ( size + DCACHE_LINE - 1 ) / DCACHE_LINE;
-    idx = num_lines - 1;
-    KMP_DEBUG_ASSERT( idx >= 0 );
-    if ( idx < 2 ) {
-        index = 0;       // idx is [ 0, 1 ], use first free list
-        num_lines = 2;   // 1, 2 cache lines or less than cache line
-    } else if ( ( idx >>= 2 ) == 0 ) {
-        index = 1;       // idx is [ 2, 3 ], use second free list
-        num_lines = 4;   // 3, 4 cache lines
-    } else if ( ( idx >>= 2 ) == 0 ) {
-        index = 2;       // idx is [ 4, 15 ], use third free list
-        num_lines = 16;  // 5, 6, ..., 16 cache lines
-    } else if ( ( idx >>= 2 ) == 0 ) {
-        index = 3;       // idx is [ 16, 63 ], use fourth free list
-        num_lines = 64;  // 17, 18, ..., 64 cache lines
-    } else {
-        goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
-    }
+void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
+  void *ptr;
+  int num_lines;
+  int idx;
+  int index;
+  void *alloc_ptr;
+  size_t alloc_size;
+  kmp_mem_descr_t *descr;
+
+  KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
+                __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
+
+  num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
+  idx = num_lines - 1;
+  KMP_DEBUG_ASSERT(idx >= 0);
+  if (idx < 2) {
+    index = 0; // idx is [ 0, 1 ], use first free list
+    num_lines = 2; // 1 or 2 cache lines, or less than one cache line
+  } else if ((idx >>= 2) == 0) {
+    index = 1; // idx is [ 2, 3 ], use second free list
+    num_lines = 4; // 3, 4 cache lines
+  } else if ((idx >>= 2) == 0) {
+    index = 2; // idx is [ 4, 15 ], use third free list
+    num_lines = 16; // 5, 6, ..., 16 cache lines
+  } else if ((idx >>= 2) == 0) {
+    index = 3; // idx is [ 16, 63 ], use fourth free list
+    num_lines = 64; // 17, 18, ..., 64 cache lines
+  } else {
+    goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
+  }
+
+  ptr = this_thr->th.th_free_lists[index].th_free_list_self;
+  if (ptr != NULL) {
+    // pop the head of no-sync free list
+    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
+    KMP_DEBUG_ASSERT(
+        this_thr ==
+        ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
+            ->ptr_aligned);
+    goto end;
+  };
+  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
+  if (ptr != NULL) {
+    // The no-sync free list is empty; use the sync free list (filled in by
+    // other threads only). Pop the head of the sync list, push NULL instead.
+    while (!KMP_COMPARE_AND_STORE_PTR(
+        &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, NULL)) {
+      KMP_CPU_PAUSE();
+      ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
+    }
+    // push the rest of the chain onto the no-sync free list (it can be NULL
+    // if this was the only block)
+    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
+    KMP_DEBUG_ASSERT(
+        this_thr ==
+        ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
+            ->ptr_aligned);
+    goto end;
+  }
 
-    ptr = this_thr->th.th_free_lists[index].th_free_list_self;
-    if ( ptr != NULL ) {
-        // pop the head of no-sync free list
-        this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
-        KMP_DEBUG_ASSERT( this_thr ==
-            ((kmp_mem_descr_t *)( (kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t) ))->ptr_aligned );
-        goto end;
-    };
-    ptr = TCR_SYNC_PTR( this_thr->th.th_free_lists[index].th_free_list_sync );
-    if ( ptr != NULL ) {
-        // no-sync free list is empty, use sync free list (filled in by other threads only)
-        // pop the head of the sync free list, push NULL instead
-        while ( ! KMP_COMPARE_AND_STORE_PTR(
-            &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, NULL ) )
-        {
-            KMP_CPU_PAUSE();
-            ptr = TCR_SYNC_PTR( this_thr->th.th_free_lists[index].th_free_list_sync );
-        }
-        // push the rest of chain into no-sync free list (can be NULL if there was the only block)
-        this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
-        KMP_DEBUG_ASSERT( this_thr ==
-            ((kmp_mem_descr_t *)( (kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t) ))->ptr_aligned );
-        goto end;
-    }
-
-    alloc_call:
-    // haven't found block in the free lists, thus allocate it
-    size = num_lines * DCACHE_LINE;
-
-    alloc_size = size + sizeof( kmp_mem_descr_t ) + DCACHE_LINE;
-    KE_TRACE( 25, ( "__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with alloc_size %d\n",
-                   __kmp_gtid_from_thread( this_thr ), alloc_size ) );
-    alloc_ptr = bget( this_thr, (bufsize) alloc_size );
-
-    // align ptr to DCACHE_LINE
-    ptr = (void *)(( ((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) + DCACHE_LINE ) & ~( DCACHE_LINE - 1 ));
-    descr = (kmp_mem_descr_t *)( ((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t) );
-
-    descr->ptr_allocated = alloc_ptr;        // remember allocated pointer
-    // we don't need size_allocated
-    descr->ptr_aligned   = (void *)this_thr; // remember allocating thread
-                                             // (it is already saved in bget buffer,
-                                             // but we may want to use another allocator in future)
-    descr->size_aligned  = size;
-
-    end:
-    KE_TRACE( 25, ( "<- __kmp_fast_allocate( T#%d ) returns %p\n",
-                    __kmp_gtid_from_thread( this_thr ), ptr ) );
-    return ptr;
+alloc_call:
+  // haven't found block in the free lists, thus allocate it
+  size = num_lines * DCACHE_LINE;
+
+  alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
+  KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
+                "alloc_size %d\n",
+                __kmp_gtid_from_thread(this_thr), alloc_size));
+  alloc_ptr = bget(this_thr, (bufsize)alloc_size);
+
+  // align ptr to DCACHE_LINE
+  ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
+                  DCACHE_LINE) &
+                 ~(DCACHE_LINE - 1));
+  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
+
+  descr->ptr_allocated = alloc_ptr; // remember allocated pointer
+  // we don't need size_allocated
+  descr->ptr_aligned = (void *)this_thr; // remember allocating thread
+  // (it is already saved in the bget buffer, but we may want to use another
+  // allocator in the future)
+  descr->size_aligned = size;
+
+end:
+  KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
+                __kmp_gtid_from_thread(this_thr), ptr));
+  return ptr;
 } // func __kmp_fast_allocate
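__kmp_fast_allocate() rounds a request up to 2, 4, 16, or 64 cache lines and serves it from one of four per-thread free lists; anything larger goes to bget. Each bucket also has a second, "sync" list that other threads push onto, and the owner empties it with a single compare-and-swap. A standalone sketch of the bucket mapping and of that whole-list pop, using C11 atomics in place of the runtime's KMP_COMPARE_AND_STORE_PTR (names are illustrative):

  #include <stdatomic.h>
  #include <stddef.h>

  #define CACHE_LINE_BYTES 128            /* matches DCACHE_LINE above */

  /* Map a request size to a bucket index 0..3 and the rounded number of
     cache lines, following the same ladder as __kmp_fast_allocate();
     returns -1 for requests above 64 cache lines. */
  static int size_to_bucket(size_t size, int *lines) {
    int idx = (int)((size + CACHE_LINE_BYTES - 1) / CACHE_LINE_BYTES) - 1;
    if (idx < 2) {
      *lines = 2;  return 0;              /* up to 2 cache lines  */
    } else if ((idx >>= 2) == 0) {
      *lines = 4;  return 1;              /* 3 or 4 cache lines   */
    } else if ((idx >>= 2) == 0) {
      *lines = 16; return 2;              /* 5..16 cache lines    */
    } else if ((idx >>= 2) == 0) {
      *lines = 64; return 3;              /* 17..64 cache lines   */
    }
    return -1;                            /* too big: fall back   */
  }

  /* Blocks freed by other threads sit on a per-bucket "sync" list; the
     owning thread grabs the whole chain with one CAS, lock-free. */
  typedef struct free_block { struct free_block *next; } free_block_t;

  static free_block_t *take_sync_list(_Atomic(free_block_t *) *sync_head) {
    free_block_t *head = atomic_load(sync_head);
    while (head != NULL &&
           !atomic_compare_exchange_weak(sync_head, &head, NULL)) {
      /* on failure the CAS reloads head; just retry */
    }
    return head;
  }

The push side in ___kmp_fast_free() is the mirror image: the new chain's tail is pointed at the current head before the CAS installs the new head, so other threads never observe a broken list.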
 
 // Free fast memory and place it on the thread's free list if it is of
 // the correct size.
-void
-___kmp_fast_free( kmp_info_t *this_thr, void * ptr KMP_SRC_LOC_DECL )
-{
-    kmp_mem_descr_t * descr;
-    kmp_info_t      * alloc_thr;
-    size_t            size;
-    size_t            idx;
-    int               index;
-
-    KE_TRACE( 25, ( "-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
-      __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM ) );
-    KMP_ASSERT( ptr != NULL );
-
-    descr = (kmp_mem_descr_t *)( ((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t) );
-
-    KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
-                  (int) descr->size_aligned ) );
-
-    size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
-
-    idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
-    if ( idx == size ) {
-        index = 0;       // 2 cache lines
-    } else if ( ( idx <<= 1 ) == size ) {
-        index = 1;       // 4 cache lines
-    } else if ( ( idx <<= 2 ) == size ) {
-        index = 2;       // 16 cache lines
-    } else if ( ( idx <<= 2 ) == size ) {
-        index = 3;       // 64 cache lines
+void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
+  kmp_mem_descr_t *descr;
+  kmp_info_t *alloc_thr;
+  size_t size;
+  size_t idx;
+  int index;
+
+  KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
+                __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
+  KMP_ASSERT(ptr != NULL);
+
+  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
+
+  KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
+                (int)descr->size_aligned));
+
+  size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
+
+  idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
+  if (idx == size) {
+    index = 0; // 2 cache lines
+  } else if ((idx <<= 1) == size) {
+    index = 1; // 4 cache lines
+  } else if ((idx <<= 2) == size) {
+    index = 2; // 16 cache lines
+  } else if ((idx <<= 2) == size) {
+    index = 3; // 64 cache lines
+  } else {
+    KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
+    goto free_call; // 65 or more cache lines ( > 8KB )
+  }
+
+  alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
+  if (alloc_thr == this_thr) {
+    // push block to self no-sync free list, linking previous head (LIFO)
+    *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
+    this_thr->th.th_free_lists[index].th_free_list_self = ptr;
+  } else {
+    void *head = this_thr->th.th_free_lists[index].th_free_list_other;
+    if (head == NULL) {
+      // Create new free list
+      this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+      *((void **)ptr) = NULL; // mark the tail of the list
+      descr->size_allocated = (size_t)1; // head of the list keeps its length
     } else {
-        KMP_DEBUG_ASSERT( size > DCACHE_LINE * 64 );
-        goto free_call;  // 65 or more cache lines ( > 8KB )
-    }
-
-    alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
-    if ( alloc_thr == this_thr ) {
-        // push block to self no-sync free list, linking previous head (LIFO)
-        *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
-        this_thr->th.th_free_lists[index].th_free_list_self = ptr;
-    } else {
-        void * head = this_thr->th.th_free_lists[index].th_free_list_other;
-        if ( head == NULL ) {
-            // Create new free list
-            this_thr->th.th_free_lists[index].th_free_list_other = ptr;
-            *((void **)ptr) = NULL;             // mark the tail of the list
-            descr->size_allocated = (size_t)1;  // head of the list keeps its length
-        } else {
-            // need to check existed "other" list's owner thread and size of queue
-            kmp_mem_descr_t * dsc  = (kmp_mem_descr_t *)( (char*)head - sizeof(kmp_mem_descr_t) );
-            kmp_info_t      * q_th = (kmp_info_t *)(dsc->ptr_aligned); // allocating thread, same for all queue nodes
-            size_t            q_sz = dsc->size_allocated + 1;          // new size in case we add current task
-            if ( q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT ) {
-                // we can add current task to "other" list, no sync needed
-                *((void **)ptr) = head;
-                descr->size_allocated = q_sz;
-                this_thr->th.th_free_lists[index].th_free_list_other = ptr;
-            } else {
-                // either queue blocks owner is changing or size limit exceeded
-                // return old queue to allocating thread (q_th) synchroneously,
-                // and start new list for alloc_thr's tasks
-                void * old_ptr;
-                void * tail = head;
-                void * next = *((void **)head);
-                while ( next != NULL ) {
-                    KMP_DEBUG_ASSERT(
-                        // queue size should decrease by 1 each step through the list
-                        ((kmp_mem_descr_t*)((char*)next - sizeof(kmp_mem_descr_t)))->size_allocated + 1 ==
-                        ((kmp_mem_descr_t*)((char*)tail - sizeof(kmp_mem_descr_t)))->size_allocated );
-                    tail = next;   // remember tail node
-                    next = *((void **)next);
-                }
-                KMP_DEBUG_ASSERT( q_th != NULL );
-                // push block to owner's sync free list
-                old_ptr = TCR_PTR( q_th->th.th_free_lists[index].th_free_list_sync );
-                /* the next pointer must be set before setting free_list to ptr to avoid
-                   exposing a broken list to other threads, even for an instant. */
-                *((void **)tail) = old_ptr;
-
-                while ( ! KMP_COMPARE_AND_STORE_PTR(
-                    &q_th->th.th_free_lists[index].th_free_list_sync,
-                    old_ptr,
-                    head ) )
-                {
-                    KMP_CPU_PAUSE();
-                    old_ptr = TCR_PTR( q_th->th.th_free_lists[index].th_free_list_sync );
-                    *((void **)tail) = old_ptr;
-                }
-
-                // start new list of not-selt tasks
-                this_thr->th.th_free_lists[index].th_free_list_other = ptr;
-                *((void **)ptr) = NULL;
-                descr->size_allocated = (size_t)1;  // head of queue keeps its length
-            }
-        }
-    }
-    goto end;
+      // need to check the existing "other" list's owner thread and queue size
+      kmp_mem_descr_t *dsc =
+          (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
+      // allocating thread, same for all queue nodes
+      kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
+      size_t q_sz =
+          dsc->size_allocated + 1; // new size in case we add current task
+      if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
+        // we can add current task to "other" list, no sync needed
+        *((void **)ptr) = head;
+        descr->size_allocated = q_sz;
+        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+      } else {
+        // either the queue's owner is changing or the size limit is exceeded;
+        // return the old queue to the allocating thread (q_th) synchronously,
+        // and start a new list for alloc_thr's tasks
+        void *old_ptr;
+        void *tail = head;
+        void *next = *((void **)head);
+        while (next != NULL) {
+          KMP_DEBUG_ASSERT(
+              // queue size should decrease by 1 each step through the list
+              ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
+                      ->size_allocated +
+                  1 ==
+              ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
+                  ->size_allocated);
+          tail = next; // remember tail node
+          next = *((void **)next);
+        }
+        KMP_DEBUG_ASSERT(q_th != NULL);
+        // push block to owner's sync free list
+        old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
+        /* the next pointer must be set before setting free_list to ptr to avoid
+           exposing a broken list to other threads, even for an instant. */
+        *((void **)tail) = old_ptr;
 
-    free_call:
-    KE_TRACE(25, ( "__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
-                   __kmp_gtid_from_thread( this_thr), size ) );
-    __kmp_bget_dequeue( this_thr );         /* Release any queued buffers */
-    brel( this_thr, descr->ptr_allocated );
+        while (!KMP_COMPARE_AND_STORE_PTR(
+            &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
+          KMP_CPU_PAUSE();
+          old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
+          *((void **)tail) = old_ptr;
+        }
+
+        // start a new list of not-self tasks
+        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+        *((void **)ptr) = NULL;
+        descr->size_allocated = (size_t)1; // head of queue keeps its length
+      }
+    }
+  }
+  goto end;
+
+free_call:
+  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
+                __kmp_gtid_from_thread(this_thr), size));
+  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
+  brel(this_thr, descr->ptr_allocated);
 
-    end:
-    KE_TRACE( 25, ( "<- __kmp_fast_free() returns\n" ) );
+end:
+  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
 
 } // func __kmp_fast_free
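For context, the cross-thread branch above performs a lock-free LIFO push onto the
owning thread's th_free_list_sync: the tail's next pointer is rewritten before every
compare-and-swap attempt, so a concurrent reader never observes a half-linked chain.
A minimal standalone sketch of that pattern, using C++11 std::atomic in place of the
runtime's TCR_PTR/KMP_COMPARE_AND_STORE_PTR macros (the names lifo_push_chain and
Node are illustrative, not part of the runtime), might look like:

  #include <atomic>

  struct Node { Node *next; };

  // Push a pre-linked chain [head .. tail] onto a shared LIFO list.
  // As in __kmp_fast_free, tail->next is set to the observed list head
  // before each CAS, so the chain is never published half-linked.
  void lifo_push_chain(std::atomic<Node *> &list, Node *head, Node *tail) {
    Node *old_head = list.load(std::memory_order_relaxed);
    do {
      tail->next = old_head; // link before attempting to publish
    } while (!list.compare_exchange_weak(old_head, head,
                                         std::memory_order_release,
                                         std::memory_order_relaxed));
  }

compare_exchange_weak reloads old_head on failure, so the loop re-links the tail and
retries, mirroring the KMP_CPU_PAUSE retry loop in the code above.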
 
-
 // Initialize the thread free lists related to fast memory
 // Only do this when a thread is initially created.
-void
-__kmp_initialize_fast_memory( kmp_info_t *this_thr )
-{
-    KE_TRACE(10, ( "__kmp_initialize_fast_memory: Called from th %p\n", this_thr ) );
+void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
+  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
 
-    memset ( this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof( kmp_free_list_t ) );
+  memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
 }
 
 // Free the memory in the thread free lists related to fast memory
 // Only do this when a thread is being reaped (destroyed).
-void
-__kmp_free_fast_memory( kmp_info_t *th )
-{
-    // Suppose we use BGET underlying allocator, walk through its structures...
-    int          bin;
-    thr_data_t * thr = get_thr_data( th );
-    void      ** lst = NULL;
-
-    KE_TRACE(5, ( "__kmp_free_fast_memory: Called T#%d\n",
-                   __kmp_gtid_from_thread( th ) ) );
-
-    __kmp_bget_dequeue( th );         // Release any queued buffers
-
-    // Dig through free lists and extract all allocated blocks
-    for ( bin = 0; bin < MAX_BGET_BINS; ++bin ) {
-        bfhead_t * b = thr->freelist[ bin ].ql.flink;
-        while ( b != &thr->freelist[ bin ] ) {
-            if ( (kmp_uintptr_t)b->bh.bb.bthr & 1 ) {   // if the buffer is an allocated address?
-                *((void**)b) = lst;   // link the list (override bthr, but keep flink yet)
-                lst = (void**)b;      // push b into lst
-            }
-            b = b->ql.flink;          // get next buffer
-        }
-    }
-    while ( lst != NULL ) {
-        void * next = *lst;
-        KE_TRACE(10, ( "__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
-                      lst, next, th, __kmp_gtid_from_thread( th ) ) );
-        (*thr->relfcn)(lst);
-        #if BufStats
-            // count blocks to prevent problems in __kmp_finalize_bget()
-            thr->numprel++;       /* Nr of expansion block releases */
-            thr->numpblk--;       /* Total number of blocks */
-        #endif
-        lst = (void**)next;
-    }
+void __kmp_free_fast_memory(kmp_info_t *th) {
+  // Assuming BGET is the underlying allocator, walk through its structures...
+  int bin;
+  thr_data_t *thr = get_thr_data(th);
+  void **lst = NULL;
+
+  KE_TRACE(
+      5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
+
+  __kmp_bget_dequeue(th); // Release any queued buffers
+
+  // Dig through free lists and extract all allocated blocks
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *b = thr->freelist[bin].ql.flink;
+    while (b != &thr->freelist[bin]) {
+      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // buffer is an allocated address
+        // link the list (overwrite bthr, but keep flink for now)
+        *((void **)b) = lst;
+        lst = (void **)b; // push b into lst
+      }
+      b = b->ql.flink; // get next buffer
+    }
+  }
+  while (lst != NULL) {
+    void *next = *lst;
+    KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
+                  lst, next, th, __kmp_gtid_from_thread(th)));
+    (*thr->relfcn)(lst);
+#if BufStats
+    // count blocks to prevent problems in __kmp_finalize_bget()
+    thr->numprel++; /* Nr of expansion block releases */
+    thr->numpblk--; /* Total number of blocks */
+#endif
+    lst = (void **)next;
+  }
 
-    KE_TRACE(5, ( "__kmp_free_fast_memory: Freed T#%d\n",
-                  __kmp_gtid_from_thread( th ) ) );
+  KE_TRACE(
+      5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
 }
 
 #endif // USE_FAST_MEMORY
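The reclamation loop in __kmp_free_fast_memory above distinguishes allocated
expansion blocks from free ones by testing the low bit of the bthr field in each
buffer header. A small self-contained sketch of that pointer-tagging idiom (the
helper names below are illustrative, not part of the runtime) could be:

  #include <cstdint>

  // Buffer headers are at least pointer-aligned, so bit 0 of the owning-thread
  // pointer is free to serve as an "allocated" flag, as the bthr check does.
  static inline void *tag_allocated(void *owner) {
    return (void *)((uintptr_t)owner | (uintptr_t)1);
  }
  static inline bool is_allocated(void *bthr) {
    return ((uintptr_t)bthr & 1) != 0;
  }
  static inline void *owner_of(void *bthr) {
    return (void *)((uintptr_t)bthr & ~(uintptr_t)1);
  }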
