[llvm] 53913a6 - Optimize path::remove_dots

Reid Kleckner via llvm-commits llvm-commits at lists.llvm.org
Sun May 3 08:20:06 PDT 2020


Author: Reid Kleckner
Date: 2020-05-03T07:58:05-07:00
New Revision: 53913a65b408ade2956061b4c0aaed6bba907403

URL: https://github.com/llvm/llvm-project/commit/53913a65b408ade2956061b4c0aaed6bba907403
DIFF: https://github.com/llvm/llvm-project/commit/53913a65b408ade2956061b4c0aaed6bba907403.diff

LOG: Optimize path::remove_dots

LLD calls this on every source file string in every object file when
writing PDBs, so it is somewhat hot.

Avoid rewriting paths that do not contain path traversal components
(./..). Use find_first_not_of(separators) directly instead of using the
path iterators. The path component iterators appear to be slow, and
directly searching for slashes makes it easier to find double separators
that need to be canonicalized.

I discovered that the VFS relies on remote_dots to not canonicalize
early slashes (/foo or C:/foo) on Windows, so I had to leave that
behavior behind with unit tests for it. This is undesirable, but I claim
that my change is NFC.

Added: 
    

Modified: 
    llvm/lib/Support/Path.cpp
    llvm/lib/Support/Windows/Path.inc
    llvm/unittests/Support/Path.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 6f065b9c5a3c..f5ab2504a83f 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -684,43 +684,66 @@ StringRef remove_leading_dotslash(StringRef Path, Style style) {
   return Path;
 }
 
-static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot,
-                                    Style style) {
+// Remove path traversal components ("." and "..") when possible, and
+// canonicalize slashes.
+bool remove_dots(SmallVectorImpl<char> &the_path, bool remove_dot_dot,
+                 Style style) {
+  style = real_style(style);
+  StringRef remaining(the_path.data(), the_path.size());
+  bool needs_change = false;
   SmallVector<StringRef, 16> components;
 
-  // Skip the root path, then look for traversal in the components.
-  StringRef rel = path::relative_path(path, style);
-  for (StringRef C :
-       llvm::make_range(path::begin(rel, style), path::end(rel))) {
-    if (C == ".")
-      continue;
-    // Leading ".." will remain in the path unless it's at the root.
-    if (remove_dot_dot && C == "..") {
+  // Consume the root path, if present.
+  StringRef root = path::root_path(remaining, style);
+  bool absolute = !root.empty();
+  if (absolute)
+    remaining = remaining.drop_front(root.size());
+
+  // Loop over path components manually. This makes it easier to detect
+  // non-preferred slashes and double separators that must be canonicalized.
+  while (!remaining.empty()) {
+    size_t next_slash = remaining.find_first_of(separators(style));
+    if (next_slash == StringRef::npos)
+      next_slash = remaining.size();
+    StringRef component = remaining.take_front(next_slash);
+    remaining = remaining.drop_front(next_slash);
+
+    // Eat the slash, and check if it is the preferred separator.
+    if (!remaining.empty()) {
+      needs_change |= remaining.front() != preferred_separator(style);
+      remaining = remaining.drop_front();
+    }
+
+    // Check for path traversal components or double separators.
+    if (component.empty() || component == ".") {
+      needs_change = true;
+    } else if (remove_dot_dot && component == "..") {
+      needs_change = true;
+      // Do not allow ".." to remove the root component. If this is the
+      // beginning of a relative path, keep the ".." component.
       if (!components.empty() && components.back() != "..") {
         components.pop_back();
-        continue;
+      } else if (!absolute) {
+        components.push_back(component);
       }
-      if (path::is_absolute(path, style))
-        continue;
+    } else {
+      components.push_back(component);
     }
-    components.push_back(C);
   }
 
-  SmallString<256> buffer = path::root_path(path, style);
-  for (StringRef C : components)
-    path::append(buffer, style, C);
-  return buffer;
-}
-
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot,
-                 Style style) {
-  StringRef p(path.data(), path.size());
-
-  SmallString<256> result = remove_dots(p, remove_dot_dot, style);
-  if (result == path)
+  // Avoid rewriting the path unless we have to.
+  if (!needs_change)
     return false;
 
-  path.swap(result);
+  SmallString<256> buffer = root;
+  if (!components.empty()) {
+    buffer += components[0];
+    for (StringRef C : makeArrayRef(components).drop_front()) {
+      buffer += preferred_separator(style);
+      buffer += C;
+    }
+  }
+  the_path.swap(buffer);
   return true;
 }
 

diff  --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index 2e1488479095..ec62e656ddf0 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -101,18 +101,13 @@ std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16,
   }
 
   // Remove '.' and '..' because long paths treat these as real path components.
+  llvm::sys::path::native(Path8Str, path::Style::windows);
   llvm::sys::path::remove_dots(Path8Str, true);
 
   const StringRef RootName = llvm::sys::path::root_name(Path8Str);
   assert(!RootName.empty() &&
          "Root name cannot be empty for an absolute path!");
 
-  // llvm::sys::path::remove_dots, used above, can leave a '/' after the root
-  // name and long paths must use '\' as the separator.
-  const size_t RootNameSize = RootName.size();
-  if (RootNameSize < Path8Str.size() && Path8Str[RootNameSize] == '/')
-    Path8Str[RootNameSize] = '\\';
-
   SmallString<2 * MAX_PATH> FullPath(LongPathPrefix);
   if (RootName[1] != ':') { // Check if UNC.
     FullPath.append("UNC\\");

diff  --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp
index 3030fb2ebb2e..8c530487c154 100644
--- a/llvm/unittests/Support/Path.cpp
+++ b/llvm/unittests/Support/Path.cpp
@@ -1253,6 +1253,22 @@ TEST(Support, RemoveDots) {
             remove_dots("..\\a\\b\\..\\c", true, path::Style::windows));
   EXPECT_EQ("..\\..\\a\\c",
             remove_dots("..\\..\\a\\b\\..\\c", true, path::Style::windows));
+  EXPECT_EQ("C:\\a\\c", remove_dots("C:\\foo\\bar//..\\..\\a\\c", true,
+                                    path::Style::windows));
+
+  // FIXME: These leading forward slashes are emergent behavior. VFS depends on
+  // this behavior now.
+  EXPECT_EQ("C:/bar",
+            remove_dots("C:/foo/../bar", true, path::Style::windows));
+  EXPECT_EQ("C:/foo\\bar",
+            remove_dots("C:/foo/bar", true, path::Style::windows));
+  EXPECT_EQ("C:/foo\\bar",
+            remove_dots("C:/foo\\bar", true, path::Style::windows));
+  EXPECT_EQ("/", remove_dots("/", true, path::Style::windows));
+  EXPECT_EQ("C:/", remove_dots("C:/", true, path::Style::windows));
+
+  // A double separator is rewritten.
+  EXPECT_EQ("C:/foo\\bar", remove_dots("C:/foo//bar", true, path::Style::windows));
 
   SmallString<64> Path1(".\\.\\c");
   EXPECT_TRUE(path::remove_dots(Path1, true, path::Style::windows));
@@ -1271,6 +1287,7 @@ TEST(Support, RemoveDots) {
   EXPECT_EQ("/a/c", remove_dots("/../../a/c", true, path::Style::posix));
   EXPECT_EQ("/a/c",
             remove_dots("/../a/b//../././/c", true, path::Style::posix));
+  EXPECT_EQ("/", remove_dots("/", true, path::Style::posix));
 
   SmallString<64> Path2("././c");
   EXPECT_TRUE(path::remove_dots(Path2, true, path::Style::posix));


        


More information about the llvm-commits mailing list