[clang] [llvm] [SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 (PR #98652)
Abhina Sree via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 6 09:20:34 PST 2024
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/98652
>From 4583cacec6daa1e980d5149be063120b34731f4c Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 12 Jul 2024 11:17:24 -0400
Subject: [PATCH 1/2] update autoconversion functionality to fix error: source
file is not valid UTF-8
---
clang/include/clang/Basic/FileEntry.h | 9 ++++++
clang/lib/Basic/SourceManager.cpp | 26 +++++++++++++++-
llvm/include/llvm/Support/AutoConvert.h | 11 +++++--
llvm/lib/Support/AutoConvert.cpp | 40 ++++++++++++++++++++++++-
llvm/lib/Support/MemoryBuffer.cpp | 16 ++++++++--
5 files changed, 96 insertions(+), 6 deletions(-)
diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h
index 68d4bf60930037..1fe6c3617582ce 100644
--- a/clang/include/clang/Basic/FileEntry.h
+++ b/clang/include/clang/Basic/FileEntry.h
@@ -70,6 +70,11 @@ class FileEntryRef {
const FileEntry &getFileEntry() const {
return *getBaseMapEntry().second->V.get<FileEntry *>();
}
+#ifdef __MVS__
+ FileEntry &getFileEntry() {
+ return *getBaseMapEntry().second->V.get<FileEntry *>();
+ }
+#endif
DirectoryEntryRef getDir() const { return ME->second->Dir; }
inline off_t getSize() const;
@@ -323,6 +328,10 @@ class FileEntry {
StringRef tryGetRealPathName() const { return RealPathName; }
off_t getSize() const { return Size; }
+#ifdef __MVS__
+ // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
+ void setSize(off_t NewSize) { Size = NewSize; }
+#endif
unsigned getUID() const { return UID; }
const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; }
time_t getModificationTime() const { return ModTime; }
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 6e588ce63d813f..21cf40fc0cb9e2 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/AutoConvert.h"
#include "llvm/Support/Capacity.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
@@ -156,10 +157,16 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
// Unless this is a named pipe (in which case we can handle a mismatch),
// check that the file's size is the same as in the file entry (which may
// have come from a stat cache).
+#ifndef __MVS__
if (!ContentsEntry->isNamedPipe() &&
Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
+#else
+ // The buffer will always be larger than the file size on z/OS in the presence
+ // of characters outside the base character set.
+ if (!ContentsEntry->isNamedPipe() &&
+ Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
+#endif
Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
-
return std::nullopt;
}
@@ -602,6 +609,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
return FileID::get(LoadedID);
}
unsigned FileSize = File.getSize();
+#ifdef __MVS__
+ llvm::ErrorOr<bool> NeedConversion =
+ llvm::needzOSConversion(Filename.str().c_str());
+ if (NeedConversion && *NeedConversion) {
+ // Buffer size may increase due to potential z/OS EBCDIC to UTF-8
+ // conversion.
+ if (std::optional<llvm::MemoryBufferRef> Buffer =
+ File.getBufferOrNone(Diag, getFileManager())) {
+ unsigned BufSize = Buffer->getBufferSize();
+ if (BufSize > FileSize) {
+ if (File.ContentsEntry.has_value())
+ File.ContentsEntry->getFileEntry().setSize(BufSize);
+ FileSize = BufSize;
+ }
+ }
+ }
+#endif
if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
Diag.Report(IncludePos, diag::err_sloc_space_too_large);
diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 65ac576ae5676b..5d6d9394ef1d81 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -17,6 +17,7 @@
#ifdef __MVS__
#include <_Ccsid.h>
#ifdef __cplusplus
+#include "llvm/Support/ErrorOr.h"
#include <system_error>
#endif /* __cplusplus */
@@ -54,8 +55,14 @@ std::error_code restorezOSStdHandleAutoConversion(int FD);
/** \brief Set the tag information for a file descriptor. */
std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
-} /* namespace llvm */
-#endif /* __cplusplus */
+// Get the the tag ccsid for a file name or a file descriptor.
+ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
+
+// Query the file tag to determine if it needs conversion to UTF-8 codepage.
+ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
+
+} // namespace llvm
+#endif // __cplusplus
#endif /* __MVS__ */
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index 66570735f8fc88..f7918548df1d0d 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -20,6 +20,8 @@
#include <sys/stat.h>
#include <unistd.h>
+using namespace llvm;
+
static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1};
int disablezOSAutoConversion(int FD) {
@@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
return std::error_code();
}
-#endif // __MVS__
+ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
+ // If we have a file descriptor, use it to find out file tagging. Otherwise we
+ // need to use stat() with the file path.
+ if (FD != -1) {
+ struct f_cnvrt Query = {
+ QUERYCVT, // cvtcmd
+ 0, // pccsid
+ 0, // fccsid
+ };
+ if (fcntl(FD, F_CONTROL_CVT, &Query) == -1)
+ return std::error_code(errno, std::generic_category());
+ return Query.fccsid;
+ }
+ struct stat Attr;
+ if (stat(FileName, &Attr) == -1)
+ return std::error_code(errno, std::generic_category());
+ return Attr.st_tag.ft_ccsid;
+}
+
+ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) {
+ ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD);
+ if (std::error_code EC = Ccsid.getError())
+ return EC;
+ // We don't need conversion for UTF-8 tagged files or binary files.
+ // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve
+ // problems related to UTF-8 tagged source files.
+ switch (*Ccsid) {
+ case CCSID_UTF_8:
+ case CCSID_ISO8859_1:
+ case FT_BINARY:
+ return false;
+ default:
+ return true;
+ }
+}
+
+#endif //__MVS__
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index 7ea68ee4cafd76..e2044bcc4e4f08 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -361,6 +361,11 @@ static bool shouldUseMmap(sys::fs::file_t FD,
bool RequiresNullTerminator,
int PageSize,
bool IsVolatile) {
+#if defined(__MVS__)
+ // zOS Enhanced ASCII auto convert does not support mmap.
+ return false;
+#endif
+
// mmap may leave the buffer without null terminator if the file size changed
// by the time the last page is mapped in, so avoid it if the file size is
// likely to change.
@@ -503,9 +508,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
}
#ifdef __MVS__
- // Set codepage auto-conversion for z/OS.
- if (auto EC = llvm::enablezOSAutoConversion(FD))
+ ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
+ if (std::error_code EC = NeedConversion.getError())
return EC;
+ // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
+ // cannot trust the file size and we create the memory buffer by copying
+ // off the stream.
+ // Note: This only works with the assumption of reading a full file (i.e,
+ // Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
+ if (Offset == 0 && MapSize == FileSize && *NeedConversion)
+ return getMemoryBufferForStream(FD, Filename);
#endif
auto Buf =
>From bb32463301001162494abd8d7b41325a43a0e2d5 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 6 Dec 2024 12:10:29 -0500
Subject: [PATCH 2/2] remove #ifdefs
---
clang/include/clang/Basic/FileEntry.h | 10 +++++-----
clang/lib/Basic/SourceManager.cpp | 26 +++++++++++++++-----------
2 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h
index 1fe6c3617582ce..7f24eb33084a19 100644
--- a/clang/include/clang/Basic/FileEntry.h
+++ b/clang/include/clang/Basic/FileEntry.h
@@ -70,11 +70,13 @@ class FileEntryRef {
const FileEntry &getFileEntry() const {
return *getBaseMapEntry().second->V.get<FileEntry *>();
}
-#ifdef __MVS__
- FileEntry &getFileEntry() {
+
+ // This is a non const version of getFileEntry() which is used if the buffer
+ // size needs to be increased due to potential z/OS EBCDIC -> UTF-8 conversion
+ FileEntry &getFileEntryToUpdate() {
return *getBaseMapEntry().second->V.get<FileEntry *>();
}
-#endif
+
DirectoryEntryRef getDir() const { return ME->second->Dir; }
inline off_t getSize() const;
@@ -328,10 +330,8 @@ class FileEntry {
StringRef tryGetRealPathName() const { return RealPathName; }
off_t getSize() const { return Size; }
-#ifdef __MVS__
// Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
void setSize(off_t NewSize) { Size = NewSize; }
-#endif
unsigned getUID() const { return UID; }
const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; }
time_t getModificationTime() const { return ModTime; }
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 21cf40fc0cb9e2..4ca2cc3f65f17c 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -157,16 +157,13 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
// Unless this is a named pipe (in which case we can handle a mismatch),
// check that the file's size is the same as in the file entry (which may
// have come from a stat cache).
-#ifndef __MVS__
- if (!ContentsEntry->isNamedPipe() &&
- Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
-#else
// The buffer will always be larger than the file size on z/OS in the presence
// of characters outside the base character set.
+ assert(Buffer->getBufferSize() <= (size_t)ContentsEntry->getSize());
if (!ContentsEntry->isNamedPipe() &&
Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
-#endif
Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
+
return std::nullopt;
}
@@ -590,6 +587,15 @@ SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
FileCharacter);
}
+/// Helper function to determine if an input file requires conversion
+llvm::ErrorOr<bool> needConversion(StringRef Filename) {
+#ifdef __MVS__
+ return llvm::needzOSConversion(Filename.str().c_str());
+#else
+ return false;
+#endif
+}
+
/// createFileID - Create a new FileID for the specified ContentCache and
/// include position. This works regardless of whether the ContentCache
/// corresponds to a file or some other input source.
@@ -609,10 +615,9 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
return FileID::get(LoadedID);
}
unsigned FileSize = File.getSize();
-#ifdef __MVS__
- llvm::ErrorOr<bool> NeedConversion =
- llvm::needzOSConversion(Filename.str().c_str());
- if (NeedConversion && *NeedConversion) {
+ llvm::ErrorOr<bool> NeedConversion = needConversion(Filename);
+ assert(NeedConversion && "Filename was not found");
+ if (*NeedConversion) {
// Buffer size may increase due to potential z/OS EBCDIC to UTF-8
// conversion.
if (std::optional<llvm::MemoryBufferRef> Buffer =
@@ -620,12 +625,11 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
unsigned BufSize = Buffer->getBufferSize();
if (BufSize > FileSize) {
if (File.ContentsEntry.has_value())
- File.ContentsEntry->getFileEntry().setSize(BufSize);
+ File.ContentsEntry->getFileEntryToUpdate().setSize(BufSize);
FileSize = BufSize;
}
}
}
-#endif
if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
Diag.Report(IncludePos, diag::err_sloc_space_too_large);
More information about the llvm-commits
mailing list