[clang] [llvm] [SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 (PR #98652)

Abhina Sree via cfe-commits cfe-commits at lists.llvm.org
Wed Sep 11 05:42:33 PDT 2024


https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/98652

>From e7be53314994b9a051ba2ff99dfd029937ebcc07 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 12 Jul 2024 11:17:24 -0400
Subject: [PATCH 1/3] update autoconversion functionality to fix error: source
 file is not valid UTF-8

---
 clang/include/clang/Basic/FileEntry.h   |  9 ++++++
 clang/lib/Basic/SourceManager.cpp       | 25 ++++++++++++++++
 llvm/include/llvm/Support/AutoConvert.h |  7 +++++
 llvm/lib/Support/AutoConvert.cpp        | 40 ++++++++++++++++++++++++-
 llvm/lib/Support/MemoryBuffer.cpp       | 16 ++++++++--
 llvm/lib/Support/VirtualFileSystem.cpp  |  2 +-
 6 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h
index 68d4bf60930037..1fe6c3617582ce 100644
--- a/clang/include/clang/Basic/FileEntry.h
+++ b/clang/include/clang/Basic/FileEntry.h
@@ -70,6 +70,11 @@ class FileEntryRef {
   const FileEntry &getFileEntry() const {
     return *getBaseMapEntry().second->V.get<FileEntry *>();
   }
+#ifdef __MVS__
+  FileEntry &getFileEntry() {
+    return *getBaseMapEntry().second->V.get<FileEntry *>();
+  }
+#endif
   DirectoryEntryRef getDir() const { return ME->second->Dir; }
 
   inline off_t getSize() const;
@@ -323,6 +328,10 @@ class FileEntry {
 
   StringRef tryGetRealPathName() const { return RealPathName; }
   off_t getSize() const { return Size; }
+#ifdef __MVS__
+  // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
+  void setSize(off_t NewSize) { Size = NewSize; }
+#endif
   unsigned getUID() const { return UID; }
   const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; }
   time_t getModificationTime() const { return ModTime; }
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index d6ec26af80aadd..44b56a352dfc4e 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/AutoConvert.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
@@ -166,8 +167,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
   // Unless this is a named pipe (in which case we can handle a mismatch),
   // check that the file's size is the same as in the file entry (which may
   // have come from a stat cache).
+#ifndef __MVS__
   if (!ContentsEntry->isNamedPipe() &&
       Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
+#else
+  // The buffer will always be larger than the file size on z/OS in the presence
+  // of characters outside the base character set.
+  if (!ContentsEntry->isNamedPipe() &&
+      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
+#endif
     if (Diag.isDiagnosticInFlight())
       Diag.SetDelayedDiagnostic(diag::err_file_modified,
                                 ContentsEntry->getName());
@@ -617,6 +625,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
     return FileID::get(LoadedID);
   }
   unsigned FileSize = File.getSize();
+#ifdef __MVS__
+  llvm::ErrorOr<bool> NeedConversion =
+      llvm::needConversion(Filename.str().c_str());
+  if (NeedConversion && *NeedConversion) {
+    // Buffer size may increase due to potential z/OS EBCDIC to UTF-8
+    // conversion.
+    if (std::optional<llvm::MemoryBufferRef> Buffer =
+            File.getBufferOrNone(Diag, getFileManager())) {
+      unsigned BufSize = Buffer->getBufferSize();
+      if (BufSize > FileSize) {
+        if (File.ContentsEntry.has_value())
+          File.ContentsEntry->getFileEntry().setSize(BufSize);
+        FileSize = BufSize;
+      }
+    }
+  }
+#endif
   if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
         NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
     Diag.Report(IncludePos, diag::err_sloc_space_too_large);
diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 6f45c4683f7775..a9b3e0357589bc 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -17,6 +17,7 @@
 #ifdef __MVS__
 #include <_Ccsid.h>
 #ifdef __cplusplus
+#include "llvm/Support/ErrorOr.h"
 #include <system_error>
 #endif // __cplusplus
 
@@ -52,6 +53,12 @@ std::error_code restorezOSStdHandleAutoConversion(int FD);
 /// \brief Set the tag information for a file descriptor.
 std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
 
+// Get the the tag ccsid for a file name or a file descriptor.
+ErrorOr<__ccsid_t> getFileTag(const char *FileName, const int FD = -1);
+
+// Query the file tag to determine if it needs conversion to UTF-8 codepage.
+ErrorOr<bool> needConversion(const char *FileName, const int FD = -1);
+
 } // namespace llvm
 #endif // __cplusplus
 
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index 66570735f8fc88..4b0463ee5c92ac 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -20,6 +20,8 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+using namespace llvm;
+
 static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1};
 
 int disablezOSAutoConversion(int FD) {
@@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
   return std::error_code();
 }
 
-#endif // __MVS__
+ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) {
+  // If we have a file descriptor, use it to find out file tagging. Otherwise we
+  // need to use stat() with the file path.
+  if (FD != -1) {
+    struct f_cnvrt Query = {
+        QUERYCVT, // cvtcmd
+        0,        // pccsid
+        0,        // fccsid
+    };
+    if (fcntl(FD, F_CONTROL_CVT, &Query) == -1)
+      return std::error_code(errno, std::generic_category());
+    return Query.fccsid;
+  }
+  struct stat Attr;
+  if (stat(FileName, &Attr) == -1)
+    return std::error_code(errno, std::generic_category());
+  return Attr.st_tag.ft_ccsid;
+}
+
+ErrorOr<bool> llvm::needConversion(const char *FileName, const int FD) {
+  ErrorOr<__ccsid_t> Ccsid = getFileTag(FileName, FD);
+  if (std::error_code EC = Ccsid.getError())
+    return EC;
+  // We don't need conversion for UTF-8 tagged files or binary files.
+  // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve
+  // problems related to UTF-8 tagged source files.
+  switch (*Ccsid) {
+  case CCSID_UTF_8:
+  case CCSID_ISO8859_1:
+  case FT_BINARY:
+    return false;
+  default:
+    return true;
+  }
+}
+
+#endif // __MVS__
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index aea81964ba9fde..be351129879ea4 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -362,6 +362,11 @@ static bool shouldUseMmap(sys::fs::file_t FD,
                           bool RequiresNullTerminator,
                           int PageSize,
                           bool IsVolatile) {
+#if defined(__MVS__)
+  // z/OS Enhanced ASCII auto convert does not support mmap.
+  return false;
+#endif
+
   // mmap may leave the buffer without null terminator if the file size changed
   // by the time the last page is mapped in, so avoid it if the file size is
   // likely to change.
@@ -504,9 +509,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
   }
 
 #ifdef __MVS__
-  // Set codepage auto-conversion for z/OS.
-  if (auto EC = llvm::enablezOSAutoConversion(FD))
+  ErrorOr<bool> NeedConversion = needConversion(Filename.str().c_str(), FD);
+  if (std::error_code EC = NeedConversion.getError())
     return EC;
+  // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
+  // cannot trust the file size and we create the memory buffer by copying
+  // off the stream.
+  // Note: This only works with the assumption of reading a full file (i.e.,
+  // Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
+  if (Offset == 0 && MapSize == FileSize && *NeedConversion)
+    return getMemoryBufferForStream(FD, Filename);
 #endif
 
   auto Buf =
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 928c0b5a24ed65..64585ec5b3f5f5 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -326,7 +326,7 @@ ErrorOr<std::unique_ptr<File>>
 RealFileSystem::openFileForRead(const Twine &Name) {
   SmallString<256> RealName, Storage;
   Expected<file_t> FDOrErr = sys::fs::openNativeFileForRead(
-      adjustPath(Name, Storage), sys::fs::OF_None, &RealName);
+      adjustPath(Name, Storage), sys::fs::OF_Text, &RealName);
   if (!FDOrErr)
     return errorToErrorCode(FDOrErr.takeError());
   return std::unique_ptr<File>(

>From ea68964c6be407b2efb9831ccbc3258b412489ea Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 9 Sep 2024 13:09:16 -0400
Subject: [PATCH 2/3] rename functions

---
 llvm/include/llvm/Support/AutoConvert.h | 4 ++--
 llvm/lib/Support/AutoConvert.cpp        | 6 +++---
 llvm/lib/Support/MemoryBuffer.cpp       | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index a9b3e0357589bc..f6ace802dab2e3 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -54,10 +54,10 @@ std::error_code restorezOSStdHandleAutoConversion(int FD);
 std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
 
 // Get the the tag ccsid for a file name or a file descriptor.
-ErrorOr<__ccsid_t> getFileTag(const char *FileName, const int FD = -1);
+ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
 
 // Query the file tag to determine if it needs conversion to UTF-8 codepage.
-ErrorOr<bool> needConversion(const char *FileName, const int FD = -1);
+ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
 
 } // namespace llvm
 #endif // __cplusplus
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index 4b0463ee5c92ac..f7918548df1d0d 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -118,7 +118,7 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
   return std::error_code();
 }
 
-ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) {
+ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
   // If we have a file descriptor, use it to find out file tagging. Otherwise we
   // need to use stat() with the file path.
   if (FD != -1) {
@@ -137,8 +137,8 @@ ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) {
   return Attr.st_tag.ft_ccsid;
 }
 
-ErrorOr<bool> llvm::needConversion(const char *FileName, const int FD) {
-  ErrorOr<__ccsid_t> Ccsid = getFileTag(FileName, FD);
+ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) {
+  ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD);
   if (std::error_code EC = Ccsid.getError())
     return EC;
   // We don't need conversion for UTF-8 tagged files or binary files.
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index be351129879ea4..25a15b70829050 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -509,7 +509,7 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
   }
 
 #ifdef __MVS__
-  ErrorOr<bool> NeedConversion = needConversion(Filename.str().c_str(), FD);
+  ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
   if (std::error_code EC = NeedConversion.getError())
     return EC;
   // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we

>From 358fd3fee19d3bde61b5437c01a7ccc85c3ab5bd Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 9 Sep 2024 15:22:52 -0400
Subject: [PATCH 3/3] revert text assumption

---
 clang/lib/Basic/SourceManager.cpp      | 2 +-
 llvm/lib/Support/VirtualFileSystem.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 44b56a352dfc4e..aa8553d3b16604 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -627,7 +627,7 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
   unsigned FileSize = File.getSize();
 #ifdef __MVS__
   llvm::ErrorOr<bool> NeedConversion =
-      llvm::needConversion(Filename.str().c_str());
+      llvm::needzOSConversion(Filename.str().c_str());
   if (NeedConversion && *NeedConversion) {
     // Buffer size may increase due to potential z/OS EBCDIC to UTF-8
     // conversion.
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 64585ec5b3f5f5..928c0b5a24ed65 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -326,7 +326,7 @@ ErrorOr<std::unique_ptr<File>>
 RealFileSystem::openFileForRead(const Twine &Name) {
   SmallString<256> RealName, Storage;
   Expected<file_t> FDOrErr = sys::fs::openNativeFileForRead(
-      adjustPath(Name, Storage), sys::fs::OF_Text, &RealName);
+      adjustPath(Name, Storage), sys::fs::OF_None, &RealName);
   if (!FDOrErr)
     return errorToErrorCode(FDOrErr.takeError());
   return std::unique_ptr<File>(



More information about the cfe-commits mailing list