[clang] d42de86 - reland: [clang][ScanDeps] Canonicalize -D and -U flags (#82568)

via cfe-commits cfe-commits at lists.llvm.org
Fri Feb 23 17:44:36 PST 2024


Author: Michael Spencer
Date: 2024-02-23T17:44:32-08:00
New Revision: d42de86eb37b08b3007a67650b3ca73b9ae174b1

URL: https://github.com/llvm/llvm-project/commit/d42de86eb37b08b3007a67650b3ca73b9ae174b1
DIFF: https://github.com/llvm/llvm-project/commit/d42de86eb37b08b3007a67650b3ca73b9ae174b1.diff

LOG: reland: [clang][ScanDeps] Canonicalize -D and -U flags (#82568)

Canonicalize `-D` and `-U` flags by sorting them and only keeping the
last instance of a given name.

This optimization will only fire if all `-D` and `-U` flags start with a
simple identifier that we can guarantee a simple analysis of can
determine if two flags refer to the same identifier or not. See the
comment on `getSimpleMacroName()` for details of what the issues are.

Previous version of this had issues with sed differences between macOS,
Linux, and Windows. This test doesn't check paths, so just don't run
sed.
Other tests should use `sed -E 's:\\\\?:/:g'` to get portable behavior.

Windows has different command line parsing behavior than Linux for
compilation databases, so the test has been adjusted to ignore that
difference.

Added: 
    clang/test/ClangScanDeps/optimize-canonicalize-macros.m

Modified: 
    clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
    clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    clang/tools/clang-scan-deps/ClangScanDeps.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
index 4f9867262a275c..557f0e547ab4a8 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
@@ -60,7 +60,10 @@ enum class ScanningOptimizations {
   /// Remove unused -ivfsoverlay arguments.
   VFS = 4,
 
-  DSS_LAST_BITMASK_ENUM(VFS),
+  /// Canonicalize -D and -U options.
+  Macros = 8,
+
+  DSS_LAST_BITMASK_ENUM(Macros),
   Default = All
 };
 

diff  --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 3cf3ad8a4e4907..7477b930188b4f 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -179,6 +179,78 @@ static void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) {
   DiagOpts.IgnoreWarnings = true;
 }
 
+// Clang implements -D and -U by splatting text into a predefines buffer. This
+// allows constructs such as `-DFඞ=3 "-D F\u{0D9E} 4 3 2”` to be accepted and
+// define the same macro, or adding C++ style comments before the macro name.
+//
+// This function checks that the first non-space characters in the macro
+// obviously form an identifier that can be uniqued on without lexing. Failing
+// to do this could lead to changing the final definition of a macro.
+//
+// We could set up a preprocessor and actually lex the name, but that's very
+// heavyweight for a situation that will almost never happen in practice.
+static std::optional<StringRef> getSimpleMacroName(StringRef Macro) {
+  StringRef Name = Macro.split("=").first.ltrim(" \t");
+  std::size_t I = 0;
+
+  auto FinishName = [&]() -> std::optional<StringRef> {
+    StringRef SimpleName = Name.slice(0, I);
+    if (SimpleName.empty())
+      return std::nullopt;
+    return SimpleName;
+  };
+
+  for (; I != Name.size(); ++I) {
+    switch (Name[I]) {
+    case '(': // Start of macro parameter list
+    case ' ': // End of macro name
+    case '\t':
+      return FinishName();
+    case '_':
+      continue;
+    default:
+      if (llvm::isAlnum(Name[I]))
+        continue;
+      return std::nullopt;
+    }
+  }
+  return FinishName();
+}
+
+static void canonicalizeDefines(PreprocessorOptions &PPOpts) {
+  using MacroOpt = std::pair<StringRef, std::size_t>;
+  std::vector<MacroOpt> SimpleNames;
+  SimpleNames.reserve(PPOpts.Macros.size());
+  std::size_t Index = 0;
+  for (const auto &M : PPOpts.Macros) {
+    auto SName = getSimpleMacroName(M.first);
+    // Skip optimizing if we can't guarantee we can preserve relative order.
+    if (!SName)
+      return;
+    SimpleNames.emplace_back(*SName, Index);
+    ++Index;
+  }
+
+  llvm::stable_sort(SimpleNames, [](const MacroOpt &A, const MacroOpt &B) {
+    return A.first < B.first;
+  });
+  // Keep the last instance of each macro name by going in reverse
+  auto NewEnd = std::unique(
+      SimpleNames.rbegin(), SimpleNames.rend(),
+      [](const MacroOpt &A, const MacroOpt &B) { return A.first == B.first; });
+  SimpleNames.erase(SimpleNames.begin(), NewEnd.base());
+
+  // Apply permutation.
+  decltype(PPOpts.Macros) NewMacros;
+  NewMacros.reserve(SimpleNames.size());
+  for (std::size_t I = 0, E = SimpleNames.size(); I != E; ++I) {
+    std::size_t OriginalIndex = SimpleNames[I].second;
+    // We still emit undefines here as they may be undefining a predefined macro
+    NewMacros.push_back(std::move(PPOpts.Macros[OriginalIndex]));
+  }
+  std::swap(PPOpts.Macros, NewMacros);
+}
+
 /// A clang tool that runs the preprocessor in a mode that's optimized for
 /// dependency scanning for the given compiler invocation.
 class DependencyScanningAction : public tooling::ToolAction {
@@ -203,6 +275,8 @@ class DependencyScanningAction : public tooling::ToolAction {
     CompilerInvocation OriginalInvocation(*Invocation);
     // Restore the value of DisableFree, which may be modified by Tooling.
     OriginalInvocation.getFrontendOpts().DisableFree = DisableFree;
+    if (any(OptimizeArgs & ScanningOptimizations::Macros))
+      canonicalizeDefines(OriginalInvocation.getPreprocessorOpts());
 
     if (Scanned) {
       // Scanning runs once for the first -cc1 invocation in a chain of driver

diff  --git a/clang/test/ClangScanDeps/optimize-canonicalize-macros.m b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m
new file mode 100644
index 00000000000000..b1b351ba56afc2
--- /dev/null
+++ b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m
@@ -0,0 +1,87 @@
+// This test verifies that command lines with equivalent -D and -U arguments
+// are canonicalized to the same module variant.
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands.json.in > %t/build/compile-commands.json
+// RUN: clang-scan-deps -compilation-database %t/build/compile-commands.json \
+// RUN:   -j 1 -format experimental-full -optimize-args=canonicalize-macros > %t/deps.db
+// RUN: cat %t/deps.db | FileCheck %s -DPREFIX=%/t
+
+// Verify that there are only two variants and that the expected merges have
+// happened.
+
+// CHECK:      {
+// CHECK-NEXT:   "modules": [
+// CHECK-NEXT:     {
+// CHECK-NEXT:       "clang-module-deps": [],
+// CHECK-NEXT:       "clang-modulemap-file":
+// CHECK-NEXT:       "command-line": [
+// CHECK-NOT:          "J=1"
+// CHECK-NOT:          "J"
+// CHECK-NOT:          "K"
+// CHECK:            ],
+// CHECK-NEXT:       "context-hash": "{{.*}}",
+// CHECK-NEXT:       "file-deps": [
+// CHECK:            ],
+// CHECK-NEXT:       "name": "A"
+// CHECK-NEXT:     },
+// CHECK-NEXT:     {
+// CHECK-NEXT:       "clang-module-deps": [],
+// CHECK-NEXT:       "clang-modulemap-file":
+// CHECK-NEXT:       "command-line": [
+// CHECK:              "Fඞ"
+// CHECK:              0D9E
+// CHECK:              "K"
+// CHECK:              "K"
+// CHECK:            ],
+// CHECK-NEXT:       "context-hash": "{{.*}}",
+// CHECK-NEXT:       "file-deps": [
+// CHECK:            ],
+// CHECK-NEXT:       "name": "A"
+// CHECK-NEXT:     }
+// CHECK-NEXT:   ],
+// CHECK-NEXT:   "translation-units": [
+// CHECK:        ]
+// CHECK:      }
+
+
+//--- build/compile-commands.json.in
+
+[
+{
+  "directory": "DIR",
+  "command": "clang -c DIR/tu0.m -DJ=1 -UJ -DJ=2 -DI -DK(x)=x -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps",
+  "file": "DIR/tu0.m"
+},
+{
+  "directory": "DIR",
+  "command": "clang -c DIR/tu1.m -DK -DK(x)=x -DI -D \"J=2\" -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps",
+  "file": "DIR/tu1.m"
+},
+{
+  "directory": "DIR",
+  "command": "clang -c DIR/tu2.m -I modules/A -DFඞ \"-DF\\\\u{0D9E}\" -DK -DK -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps",
+  "file": "DIR/tu2.m"
+}
+]
+
+//--- modules/A/module.modulemap
+
+module A {
+  umbrella header "A.h"
+}
+
+//--- modules/A/A.h
+
+//--- tu0.m
+
+#include <A.h>
+
+//--- tu1.m
+
+#include <A.h>
+
+//--- tu2.m
+
+#include <A.h>

diff  --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 0458a4b3ecec39..9811d2a8753359 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -157,6 +157,7 @@ static void ParseArgs(int argc, char **argv) {
             .Case("header-search", ScanningOptimizations::HeaderSearch)
             .Case("system-warnings", ScanningOptimizations::SystemWarnings)
             .Case("vfs", ScanningOptimizations::VFS)
+            .Case("canonicalize-macros", ScanningOptimizations::Macros)
             .Case("all", ScanningOptimizations::All)
             .Default(std::nullopt);
     if (!Optimization) {


        


More information about the cfe-commits mailing list