[clang-tools-extra] [clangd] Improve filtering logic for undesired proto symbols (PR #110091)

kadir çetinkaya via cfe-commits cfe-commits at lists.llvm.org
Thu Sep 26 07:41:48 PDT 2024


https://github.com/kadircet updated https://github.com/llvm/llvm-project/pull/110091

>From 08e2a37c8b028efcacd5ee8a269aed837ba16698 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet at google.com>
Date: Thu, 26 Sep 2024 10:55:49 +0200
Subject: [PATCH] [clangd] Improve filtering logic for undesired proto symbols

This used to filter any names with `_` in them, apart from
enum-constants. Resulting in discrepancies in behavior when we had
fields that have `_` in the name, or for accessors like `set_`, `has_`.

The logic seems to be trying to filter mangled names for nested entries,
so adjusted logic to only do so for top-level decls, while still
preserving some public top-level helpers.

Heuristics are still leaning towards false-negatives, e.g. if a
top-level entity has `_` in its name (`message Foo_Bar {}`), it'll be
filtered, or an enum that prefixes its type name to constants
(`enum Foo { Foo_OK }`).
---
 .../clangd/index/SymbolCollector.cpp          | 68 +++++++++++++++----
 .../clangd/unittests/SymbolCollectorTests.cpp | 64 ++++++++++++++---
 2 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index a76894cf0855f3..60739032eab003 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -41,6 +41,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -75,18 +76,61 @@ bool isPrivateProtoDecl(const NamedDecl &ND) {
   if (ND.getIdentifier() == nullptr)
     return false;
   auto Name = ND.getIdentifier()->getName();
-  if (!Name.contains('_'))
-    return false;
-  // Nested proto entities (e.g. Message::Nested) have top-level decls
-  // that shouldn't be used (Message_Nested). Ignore them completely.
-  // The nested entities are dangling type aliases, we may want to reconsider
-  // including them in the future.
-  // For enum constants, SOME_ENUM_CONSTANT is not private and should be
-  // indexed. Outer_INNER is private. This heuristic relies on naming style, it
-  // will include OUTER_INNER and exclude some_enum_constant.
-  // FIXME: the heuristic relies on naming style (i.e. no underscore in
-  // user-defined names) and can be improved.
-  return (ND.getKind() != Decl::EnumConstant) || llvm::any_of(Name, islower);
+  // There are some internal helpers like _internal_set_foo();
+  if (Name.contains("_internal_"))
+    return true;
+
+  // https://protobuf.dev/reference/cpp/cpp-generated/#nested-types
+  // Nested entities (messages/enums) has two names, one at the top-level scope,
+  // with a mangled name created by prepending all the outer types. These names
+  // are almost never preferred by the developers, so exclude them from index.
+  // e.g.
+  // message Foo {
+  //  message Bar {}
+  //  enum E { A }
+  // }
+  // yields:
+  // class Foo_Bar {};
+  // enum Foo_E { Foo_E_A };
+  // class Foo {
+  //  using Bar = Foo_Bar;
+  //  static constexpr Foo_E A = Foo_E_A;
+  // };
+
+  // We get rid of Foo_Bar and Foo_E by discarding any top-level entries with
+  // `_` in the name. This relies on original message/enum not having `_` in the
+  // name. Hence might go wrong in certain cases.
+  if (ND.getDeclContext()->isNamespace()) {
+    // Strip off some known public suffix helpers for enums, rest of the helpers
+    // are generated inside record decls so we don't care.
+    // https://protobuf.dev/reference/cpp/cpp-generated/#enum
+    Name.consume_back("_descriptor");
+    Name.consume_back("_IsValid");
+    Name.consume_back("_Name");
+    Name.consume_back("_Parse");
+    Name.consume_back("_MIN");
+    Name.consume_back("_MAX");
+    Name.consume_back("_ARRAYSIZE");
+    return Name.contains('_');
+  }
+
+  // EnumConstantDecls need some special attention, despite being nested in a
+  // TagDecl, they might still have mangled names. We filter those by checking
+  // if it has parent's name as a prefix.
+  // This might go wrong if a nested entity has a name that starts with parent's
+  // name, e.g: enum Foo { Foo_X }.
+  if (llvm::isa<EnumConstantDecl>(&ND)) {
+    auto *DC = llvm::cast<EnumDecl>(ND.getDeclContext());
+    if (!DC || !DC->getIdentifier())
+      return false;
+    auto CtxName = DC->getIdentifier()->getName();
+    return !CtxName.empty() && Name.consume_front(CtxName) &&
+           Name.consume_front("_");
+  }
+
+  // Now we're only left with fields/methods without an `_internal_` in the
+  // name, they're intended for public use.
+  return false;
 }
 
 // We only collect #include paths for symbols that are suitable for global code
diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
index 0666be95b6b9ee..e8088cb37fa51c 100644
--- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
@@ -201,19 +201,63 @@ TEST_F(ShouldCollectSymbolTest, NoPrivateProtoSymbol) {
   build(
       R"(// Generated by the protocol buffer compiler.  DO NOT EDIT!
          namespace nx {
-           class Top_Level {};
-           class TopLevel {};
-           enum Kind {
-             KIND_OK,
-             Kind_Not_Ok,
+           enum Outer_Enum : int {
+             Outer_Enum_KIND1,
+             Outer_Enum_Kind_2,
            };
+           bool Outer_Enum_IsValid(int);
+
+           class Outer_Inner {};
+           class Outer {
+             using Inner = Outer_Inner;
+             using Enum = Outer_Enum;
+             static constexpr Enum KIND1 = Outer_Enum_KIND1;
+             static constexpr Enum Kind_2 = Outer_Enum_Kind_2;
+             static bool Enum_IsValid(int);
+             int &x();
+             void set_x();
+             void _internal_set_x();
+
+             int &Outer_y();
+           };
+           enum Foo {
+             FOO_VAL1,
+             Foo_VAL2,
+           };
+           bool Foo_IsValid(int);
          })");
-  EXPECT_TRUE(shouldCollect("nx::TopLevel"));
-  EXPECT_TRUE(shouldCollect("nx::Kind::KIND_OK"));
-  EXPECT_TRUE(shouldCollect("nx::Kind"));
 
-  EXPECT_FALSE(shouldCollect("nx::Top_Level"));
-  EXPECT_FALSE(shouldCollect("nx::Kind::Kind_Not_Ok"));
+  // Make sure all the mangled names for Outer::Enum is discarded.
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum"));
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum_KIND1"));
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum_Kind_2"));
+  EXPECT_FALSE(shouldCollect("nx::Outer_Enum_IsValid"));
+  // But nested aliases are preserved.
+  EXPECT_TRUE(shouldCollect("nx::Outer::Enum"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::KIND1"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Kind_2"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Enum_IsValid"));
+
+  // Check for Outer::Inner.
+  EXPECT_FALSE(shouldCollect("nx::Outer_Inner"));
+  EXPECT_TRUE(shouldCollect("nx::Outer"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Inner"));
+
+  // Make sure field related information is preserved, unless it's explicitly
+  // marked with `_internal_`.
+  EXPECT_TRUE(shouldCollect("nx::Outer::x"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::set_x"));
+  EXPECT_FALSE(shouldCollect("nx::Outer::_internal_set_x"));
+  EXPECT_TRUE(shouldCollect("nx::Outer::Outer_y"));
+
+  // Handling of a top-level enum
+  EXPECT_TRUE(shouldCollect("nx::Foo::FOO_VAL1"));
+  EXPECT_TRUE(shouldCollect("nx::FOO_VAL1"));
+  EXPECT_TRUE(shouldCollect("nx::Foo_IsValid"));
+  // Our heuristic goes wrong here, if the user has a nested name that starts
+  // with parent's name.
+  EXPECT_FALSE(shouldCollect("nx::Foo::Foo_VAL2"));
+  EXPECT_FALSE(shouldCollect("nx::Foo_VAL2"));
 }
 
 TEST_F(ShouldCollectSymbolTest, DoubleCheckProtoHeaderComment) {



More information about the cfe-commits mailing list