[clang-tools-extra] 307b1fd - [clang-tidy] Always open files using UTF-8 encoding

Mon Aug 2 02:37:38 PDT 2021

Author: Andy Yankovsky
Date: 2021-08-02T11:36:04+02:00
New Revision: 307b1fddd4d84b559b154ff7744ae68bf9c6f503

URL: https://github.com/llvm/llvm-project/commit/307b1fddd4d84b559b154ff7744ae68bf9c6f503
DIFF: https://github.com/llvm/llvm-project/commit/307b1fddd4d84b559b154ff7744ae68bf9c6f503.diff

LOG: [clang-tidy] Always open files using UTF-8 encoding

The encoding used for opening files depends on the OS and might be different
from UTF-8 (e.g. on Windows it can be CP-1252). The documentation files use
UTF-8 and might be incompatible with other encodings. For example, right now
`clang-tools-extra/docs/clang-tidy/checks/abseil-no-internal-dependencies.rst`
has non-ASCII quotes and running `add_new_check.py` fails on Windows, because
it tries to read the file with incompatible encoding.

Use `io.open` for compatibility with both Python 2 and Python 3.

Reviewed By: kbobyrev

Differential Revision: https://reviews.llvm.org/D106792

Added: 
    

Modified: 
    clang-tools-extra/clang-tidy/add_new_check.py
    clang-tools-extra/clang-tidy/rename_check.py

Removed: 
    


################################################################################
diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 14fcfe8d49ff..9239ca5953cd 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -11,16 +11,21 @@
 from __future__ import print_function
 
 import argparse
+import io
 import os
 import re
 import sys
 
-
 # Adapts the module's CMakelist file. Returns 'True' if it could add a new
 # entry and 'False' if the entry already existed.
 def adapt_cmake(module_path, check_name_camel):
   filename = os.path.join(module_path, 'CMakeLists.txt')
-  with open(filename, 'r') as f:
+
+  # The documentation files are encoded using UTF-8, however on Windows the
+  # default encoding might be different (e.g. CP-1252). To make sure UTF-8 is
+  # always used, use `io.open(filename, mode, encoding='utf8')` for reading and
+  # writing files here and elsewhere.
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   cpp_file = check_name_camel + '.cpp'
@@ -31,7 +36,7 @@ def adapt_cmake(module_path, check_name_camel):
       return False
 
   print('Updating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     cpp_found = False
     file_added = False
     for line in lines:
@@ -51,7 +56,7 @@ def write_header(module_path, module, namespace, check_name, check_name_camel):
   check_name_dashes = module + '-' + check_name
   filename = os.path.join(module_path, check_name_camel) + '.h'
   print('Creating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     header_guard = ('LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_' + module.upper() + '_'
                     + check_name_camel.upper() + '_H')
     f.write('//===--- ')
@@ -104,7 +109,7 @@ class %(check_name)s : public ClangTidyCheck {
 def write_implementation(module_path, module, namespace, check_name_camel):
   filename = os.path.join(module_path, check_name_camel) + '.cpp'
   print('Creating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     f.write('//===--- ')
     f.write(os.path.basename(filename))
     f.write(' - clang-tidy ')
@@ -158,11 +163,11 @@ def adapt_module(module_path, module, check_name, check_name_camel):
       lambda p: p.lower() == module.lower() + 'tidymodule.cpp',
       os.listdir(module_path)))[0]
   filename = os.path.join(module_path, modulecpp)
-  with open(filename, 'r') as f:
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   print('Updating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     header_added = False
     header_found = False
     check_added = False
@@ -217,7 +222,7 @@ def add_release_notes(module_path, module, check_name):
   check_name_dashes = module + '-' + check_name
   filename = os.path.normpath(os.path.join(module_path,
                                            '../../docs/ReleaseNotes.rst'))
-  with open(filename, 'r') as f:
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   lineMatcher = re.compile('New checks')
@@ -225,7 +230,7 @@ def add_release_notes(module_path, module, check_name):
   checkMatcher = re.compile('- New :doc:`(.*)')
 
   print('Updating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     note_added = False
     header_found = False
     add_note_here = False
@@ -271,7 +276,7 @@ def write_test(module_path, module, check_name, test_extension):
   filename = os.path.normpath(os.path.join(module_path, '../../test/clang-tidy/checkers',
                                            check_name_dashes + '.' + test_extension))
   print('Creating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     f.write("""// RUN: %%check_clang_tidy %%s %(check_name_dashes)s %%t
 
 // FIXME: Add something that triggers the check here.
@@ -307,7 +312,7 @@ def update_checks_list(clang_tidy_path):
   docs_dir = os.path.join(clang_tidy_path, '../docs/clang-tidy/checks')
   filename = os.path.normpath(os.path.join(docs_dir, 'list.rst'))
   # Read the content of the current list.rst file
-  with open(filename, 'r') as f:
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
   # Get all existing docs
   doc_files = list(filter(lambda s: s.endswith('.rst') and s != 'list.rst',
@@ -323,7 +328,7 @@ def has_auto_fix(check_name):
     if not os.path.isfile(checkerCode):
       return ""
 
-    with open(checkerCode) as f:
+    with io.open(checkerCode, encoding='utf8') as f:
       code = f.read()
       if 'FixItHint' in code or "ReplacementText" in code or "fixit" in code:
         # Some simple heuristics to figure out if a checker has an autofix or not.
@@ -333,7 +338,7 @@ def has_auto_fix(check_name):
   def process_doc(doc_file):
     check_name = doc_file.replace('.rst', '')
 
-    with open(os.path.join(docs_dir, doc_file), 'r') as doc:
+    with io.open(os.path.join(docs_dir, doc_file), 'r', encoding='utf8') as doc:
       content = doc.read()
       match = re.search('.*:orphan:.*', content)
 
@@ -376,7 +381,7 @@ def format_link_alias(doc_file):
   checks_alias = map(format_link_alias, doc_files)
 
   print('Updating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     for line in lines:
       f.write(line)
       if line.strip() == ".. csv-table::":
@@ -397,7 +402,7 @@ def write_docs(module_path, module, check_name):
   filename = os.path.normpath(os.path.join(
       module_path, '../../docs/clang-tidy/checks/', check_name_dashes + '.rst'))
   print('Creating %s...' % filename)
-  with open(filename, 'w') as f:
+  with io.open(filename, 'w', encoding='utf8') as f:
     f.write(""".. title:: clang-tidy - %(check_name_dashes)s
 
 %(check_name_dashes)s

diff --git a/clang-tools-extra/clang-tidy/rename_check.py b/clang-tools-extra/clang-tidy/rename_check.py
index 2410041fd5d2..0c48634ac62b 100755
--- a/clang-tools-extra/clang-tidy/rename_check.py
+++ b/clang-tools-extra/clang-tidy/rename_check.py
@@ -10,20 +10,25 @@
 
 import argparse
 import glob
+import io
 import os
 import re
 
-
 def replaceInFileRegex(fileName, sFrom, sTo):
   if sFrom == sTo:
     return
+
+  # The documentation files are encoded using UTF-8, however on Windows the
+  # default encoding might be different (e.g. CP-1252). To make sure UTF-8 is
+  # always used, use `io.open(filename, mode, encoding='utf8')` for reading and
+  # writing files here and elsewhere.
   txt = None
-  with open(fileName, "r") as f:
+  with io.open(fileName, 'r', encoding='utf8') as f:
     txt = f.read()
 
   txt = re.sub(sFrom, sTo, txt)
   print("Replacing '%s' -> '%s' in '%s'..." % (sFrom, sTo, fileName))
-  with open(fileName, "w") as f:
+  with io.open(fileName, 'w', encoding='utf8') as f:
     f.write(txt)
 
 
@@ -31,7 +36,7 @@ def replaceInFile(fileName, sFrom, sTo):
   if sFrom == sTo:
     return
   txt = None
-  with open(fileName, "r") as f:
+  with io.open(fileName, 'r', encoding='utf8') as f:
     txt = f.read()
 
   if sFrom not in txt:
@@ -39,7 +44,7 @@ def replaceInFile(fileName, sFrom, sTo):
 
   txt = txt.replace(sFrom, sTo)
   print("Replacing '%s' -> '%s' in '%s'..." % (sFrom, sTo, fileName))
-  with open(fileName, "w") as f:
+  with io.open(fileName, 'w', encoding='utf8') as f:
     f.write(txt)
 
 
@@ -70,7 +75,7 @@ def fileRename(fileName, sFrom, sTo):
 
 def deleteMatchingLines(fileName, pattern):
   lines = None
-  with open(fileName, "r") as f:
+  with io.open(fileName, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   not_matching_lines = [l for l in lines if not re.search(pattern, l)]
@@ -79,7 +84,7 @@ def deleteMatchingLines(fileName, pattern):
 
   print("Removing lines matching '%s' in '%s'..." % (pattern, fileName))
   print('  ' + '  '.join([l for l in lines if re.search(pattern, l)]))
-  with open(fileName, "w") as f:
+  with io.open(fileName, 'w', encoding='utf8') as f:
     f.writelines(not_matching_lines)
 
   return True
@@ -101,7 +106,7 @@ def getListOfFiles(clang_tidy_path):
 # entry and 'False' if the entry already existed.
 def adapt_cmake(module_path, check_name_camel):
   filename = os.path.join(module_path, 'CMakeLists.txt')
-  with open(filename, 'r') as f:
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   cpp_file = check_name_camel + '.cpp'
@@ -112,7 +117,7 @@ def adapt_cmake(module_path, check_name_camel):
       return False
 
   print('Updating %s...' % filename)
-  with open(filename, 'wb') as f:
+  with io.open(filename, 'wb', encoding='utf8') as f:
     cpp_found = False
     file_added = False
     for line in lines:
@@ -130,11 +135,11 @@ def adapt_cmake(module_path, check_name_camel):
 def adapt_module(module_path, module, check_name, check_name_camel):
   modulecpp = next(filter(lambda p: p.lower() == module.lower() + 'tidymodule.cpp', os.listdir(module_path)))
   filename = os.path.join(module_path, modulecpp)
-  with open(filename, 'r') as f:
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   print('Updating %s...' % filename)
-  with open(filename, 'wb') as f:
+  with io.open(filename, 'wb', encoding='utf8') as f:
     header_added = False
     header_found = False
     check_added = False
@@ -169,7 +174,7 @@ def adapt_module(module_path, module, check_name, check_name_camel):
 def add_release_notes(clang_tidy_path, old_check_name, new_check_name):
   filename = os.path.normpath(os.path.join(clang_tidy_path,
                                            '../docs/ReleaseNotes.rst'))
-  with open(filename, 'r') as f:
+  with io.open(filename, 'r', encoding='utf8') as f:
     lines = f.readlines()
 
   lineMatcher = re.compile('Renamed checks')
@@ -177,7 +182,7 @@ def add_release_notes(clang_tidy_path, old_check_name, new_check_name):
   checkMatcher = re.compile('- The \'(.*)')
 
   print('Updating %s...' % filename)
-  with open(filename, 'wb') as f:
+  with io.open(filename, 'wb', encoding='utf8') as f:
     note_added = False
     header_found = False
     add_note_here = False