[PATCH] Don't duplicate entire file contents in memory

Alp Toker alp at nuanti.com
Tue Jun 24 22:29:02 PDT 2014


The attached clang patches:

  * Eliminate copying of file contents into memory in several places.
  * Avoid redundant file IO to re-stat and re-read file contents at each 
pass.
  * Enable sharing of parse buffers with the backend.

The changes will also enable future removal of old hacks like 
RetainRemappedFileBuffers/OwnsRemappedFileBuffers and SourceManager's 
DoNotFreeFlag.

A patch adding minor supporting changes on the LLVM side has been sent 
to llvm-commits.

Background:

The key feature of this approach is that reference counting is only 
handled by source managers and otherwise transparent to anything else 
that works with MemoryBuffer pointers.

That means traditional new/delete MemoryBuffer usage isn't affected and 
we maintain essentially the same MemoryBuffer lifetime guarantees as 
before, just adding the efficient transfer semantics that was sorely 
missing and saving memory allocations.

Alp.

-- 
http://www.nuanti.com
the browser experts

-------------- next part --------------
>From ea1f573a1e061cdd2ec7e7b4d99805049f20363f Mon Sep 17 00:00:00 2001
From: Alp Toker <alp at nuanti.com>
Date: Wed, 25 Jun 2014 03:37:51 +0300
Subject: [PATCH 1/3] SourceManager: use reference counting for MemoryBuffers

This change will make it possible to share ownership of mmap and malloc buffers
between different source managers instead of duplicating entire files into
memory.
---
 include/clang/Basic/SourceManager.h     | 49 +++++++++------------
 include/clang/Lex/PreprocessorOptions.h |  1 +
 lib/Basic/SourceManager.cpp             | 77 ++++++++++++++++-----------------
 3 files changed, 60 insertions(+), 67 deletions(-)

diff --git a/include/clang/Basic/SourceManager.h b/include/clang/Basic/SourceManager.h
index 585d138..777b25f 100644
--- a/include/clang/Basic/SourceManager.h
+++ b/include/clang/Basic/SourceManager.h
@@ -42,7 +42,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
-#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
@@ -104,7 +103,8 @@ namespace SrcMgr {
     ///
     /// This is owned by the ContentCache object.  The bits indicate
     /// whether the buffer is invalid.
-    mutable llvm::PointerIntPair<const llvm::MemoryBuffer *, 2> Buffer;
+    mutable IntrusiveRefCntPtr<const llvm::MemoryBuffer> Buffer;
+    mutable unsigned BufferFlags;
 
   public:
     /// \brief Reference to the file entry representing this ContentCache.
@@ -142,32 +142,31 @@ namespace SrcMgr {
     /// \brief True if this content cache was initially created for a source
     /// file considered as a system one.
     unsigned IsSystemFile : 1;
-    
+
     ContentCache(const FileEntry *Ent = nullptr)
-      : Buffer(nullptr, false), OrigEntry(Ent), ContentsEntry(Ent),
-        SourceLineCache(nullptr), NumLines(0), BufferOverridden(false),
-        IsSystemFile(false) {
+        : BufferFlags(0), OrigEntry(Ent), ContentsEntry(Ent),
+          SourceLineCache(nullptr), NumLines(0), BufferOverridden(false),
+          IsSystemFile(false) {
       (void)NonceAligner; // Silence warnings about unused member.
     }
-    
+
     ContentCache(const FileEntry *Ent, const FileEntry *contentEnt)
-      : Buffer(nullptr, false), OrigEntry(Ent), ContentsEntry(contentEnt),
-        SourceLineCache(nullptr), NumLines(0), BufferOverridden(false),
-        IsSystemFile(false) {}
-    
+        : BufferFlags(0), OrigEntry(Ent), ContentsEntry(contentEnt),
+          SourceLineCache(nullptr), NumLines(0), BufferOverridden(false),
+          IsSystemFile(false) {}
+
     ~ContentCache();
     
     /// The copy ctor does not allow copies where source object has either
     /// a non-NULL Buffer or SourceLineCache.  Ownership of allocated memory
     /// is not transferred, so this is a logical error.
     ContentCache(const ContentCache &RHS)
-      : Buffer(nullptr, false), SourceLineCache(nullptr),
-        BufferOverridden(false), IsSystemFile(false) {
+        : BufferFlags(0), SourceLineCache(nullptr), BufferOverridden(false),
+          IsSystemFile(false) {
       OrigEntry = RHS.OrigEntry;
       ContentsEntry = RHS.ContentsEntry;
 
-      assert(RHS.Buffer.getPointer() == nullptr &&
-             RHS.SourceLineCache == nullptr &&
+      assert(!RHS.Buffer && RHS.SourceLineCache == nullptr &&
              "Passed ContentCache object cannot own a buffer.");
 
       NumLines = RHS.NumLines;
@@ -206,30 +205,24 @@ namespace SrcMgr {
     llvm::MemoryBuffer::BufferKind getMemoryBufferKind() const;
 
     void setBuffer(const llvm::MemoryBuffer *B) {
-      assert(!Buffer.getPointer() && "MemoryBuffer already set.");
-      Buffer.setPointer(B);
-      Buffer.setInt(false);
+      assert(!Buffer && "MemoryBuffer already set.");
+      Buffer = B;
+      BufferFlags = 0;
     }
 
     /// \brief Get the underlying buffer, returning NULL if the buffer is not
     /// yet available.
-    const llvm::MemoryBuffer *getRawBuffer() const {
-      return Buffer.getPointer();
-    }
+    const llvm::MemoryBuffer *getRawBuffer() const { return Buffer.getPtr(); }
 
     /// \brief Replace the existing buffer (which will be deleted)
     /// with the given buffer.
     void replaceBuffer(const llvm::MemoryBuffer *B, bool DoNotFree = false);
 
     /// \brief Determine whether the buffer itself is invalid.
-    bool isBufferInvalid() const {
-      return Buffer.getInt() & InvalidFlag;
-    }
+    bool isBufferInvalid() const { return BufferFlags & InvalidFlag; }
 
     /// \brief Determine whether the buffer should be freed.
-    bool shouldFreeBuffer() const {
-      return (Buffer.getInt() & DoNotFreeFlag) == 0;
-    }
+    bool shouldFreeBuffer() const { return (BufferFlags & DoNotFreeFlag) == 0; }
 
   private:
     // Disable assignments.
@@ -687,7 +680,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
   InBeforeInTUCacheEntry &getInBeforeInTUCache(FileID LFID, FileID RFID) const;
 
   // Cache for the "fake" buffer used for error-recovery purposes.
-  mutable llvm::MemoryBuffer *FakeBufferForRecovery;
+  mutable IntrusiveRefCntPtr<llvm::MemoryBuffer> FakeBufferForRecovery;
 
   mutable SrcMgr::ContentCache *FakeContentCacheForRecovery;
 
diff --git a/include/clang/Lex/PreprocessorOptions.h b/include/clang/Lex/PreprocessorOptions.h
index eba2a13..18a4186 100644
--- a/include/clang/Lex/PreprocessorOptions.h
+++ b/include/clang/Lex/PreprocessorOptions.h
@@ -111,6 +111,7 @@ public:
   /// This flag defaults to false; it can be set true only through direct
   /// manipulation of the compiler invocation object, in cases where the 
   /// compiler invocation and its buffers will be reused.
+  // FIXME: This scheme is obsoleted by refcounting. Remove it!
   bool RetainRemappedFileBuffers;
   
   /// \brief The Objective-C++ ARC standard library that we should support,
diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp
index d2d5562..2b6f9dc 100644
--- a/lib/Basic/SourceManager.cpp
+++ b/lib/Basic/SourceManager.cpp
@@ -37,27 +37,27 @@ using llvm::MemoryBuffer;
 //===----------------------------------------------------------------------===//
 
 ContentCache::~ContentCache() {
-  if (shouldFreeBuffer())
-    delete Buffer.getPointer();
+  // FIXME: This scheme is no longer necessary with refcounting.
+  if (!shouldFreeBuffer())
+    Buffer.resetWithoutRelease();
 }
 
 /// getSizeBytesMapped - Returns the number of bytes actually mapped for this
 /// ContentCache. This can be 0 if the MemBuffer was not actually expanded.
 unsigned ContentCache::getSizeBytesMapped() const {
-  return Buffer.getPointer() ? Buffer.getPointer()->getBufferSize() : 0;
+  return Buffer ? Buffer->getBufferSize() : 0;
 }
 
 /// Returns the kind of memory used to back the memory buffer for
 /// this content cache.  This is used for performance analysis.
 llvm::MemoryBuffer::BufferKind ContentCache::getMemoryBufferKind() const {
-  assert(Buffer.getPointer());
+  assert(Buffer);
 
   // Should be unreachable, but keep for sanity.
-  if (!Buffer.getPointer())
+  if (!Buffer)
     return llvm::MemoryBuffer::MemoryBuffer_Malloc;
-  
-  const llvm::MemoryBuffer *buf = Buffer.getPointer();
-  return buf->getBufferKind();
+
+  return Buffer->getBufferKind();
 }
 
 /// getSize - Returns the size of the content encapsulated by this ContentCache.
@@ -65,22 +65,23 @@ llvm::MemoryBuffer::BufferKind ContentCache::getMemoryBufferKind() const {
 ///  scratch buffer.  If the ContentCache encapsulates a source file, that
 ///  file is not lazily brought in from disk to satisfy this query.
 unsigned ContentCache::getSize() const {
-  return Buffer.getPointer() ? (unsigned) Buffer.getPointer()->getBufferSize()
-                             : (unsigned) ContentsEntry->getSize();
+  return Buffer ? (unsigned)Buffer->getBufferSize()
+                : (unsigned)ContentsEntry->getSize();
 }
 
 void ContentCache::replaceBuffer(const llvm::MemoryBuffer *B,
                                  bool DoNotFree) {
-  if (B && B == Buffer.getPointer()) {
+  if (B && B == Buffer.getPtr()) {
     assert(0 && "Replacing with the same buffer");
-    Buffer.setInt(DoNotFree? DoNotFreeFlag : 0);
+    BufferFlags = DoNotFree ? DoNotFreeFlag : 0;
     return;
   }
-  
-  if (shouldFreeBuffer())
-    delete Buffer.getPointer();
-  Buffer.setPointer(B);
-  Buffer.setInt(DoNotFree? DoNotFreeFlag : 0);
+
+  if (!shouldFreeBuffer())
+    Buffer.resetWithoutRelease();
+
+  Buffer = B;
+  BufferFlags = DoNotFree ? DoNotFreeFlag : 0;
 }
 
 const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
@@ -89,18 +90,17 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
                                                   bool *Invalid) const {
   // Lazily create the Buffer for ContentCaches that wrap files.  If we already
   // computed it, just return what we have.
-  if (Buffer.getPointer() || !ContentsEntry) {
+  if (Buffer || !ContentsEntry) {
     if (Invalid)
       *Invalid = isBufferInvalid();
-    
-    return Buffer.getPointer();
+
+    return Buffer.getPtr();
   }    
 
   std::string ErrorStr;
   bool isVolatile = SM.userFilesAreVolatile() && !IsSystemFile;
-  Buffer.setPointer(SM.getFileManager().getBufferForFile(ContentsEntry,
-                                                         &ErrorStr,
-                                                         isVolatile));
+  Buffer = SM.getFileManager().getBufferForFile(ContentsEntry, &ErrorStr,
+                                                isVolatile);
 
   // If we were unable to open the file, then we are in an inconsistent
   // situation where the content cache referenced a file which no longer
@@ -112,11 +112,11 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
   // currently handle returning a null entry here. Ideally we should detect
   // that we are in an inconsistent situation and error out as quickly as
   // possible.
-  if (!Buffer.getPointer()) {
+  if (!Buffer) {
     const StringRef FillStr("<<<MISSING SOURCE FILE>>>\n");
-    Buffer.setPointer(MemoryBuffer::getNewMemBuffer(ContentsEntry->getSize(), 
-                                                    "<invalid>"));
-    char *Ptr = const_cast<char*>(Buffer.getPointer()->getBufferStart());
+    Buffer =
+        MemoryBuffer::getNewMemBuffer(ContentsEntry->getSize(), "<invalid>");
+    char *Ptr = const_cast<char *>(Buffer->getBufferStart());
     for (unsigned i = 0, e = ContentsEntry->getSize(); i != e; ++i)
       Ptr[i] = FillStr[i % FillStr.size()];
 
@@ -127,10 +127,10 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
       Diag.Report(Loc, diag::err_cannot_open_file)
         << ContentsEntry->getName() << ErrorStr;
 
-    Buffer.setInt(Buffer.getInt() | InvalidFlag);
-    
+    BufferFlags |= InvalidFlag;
+
     if (Invalid) *Invalid = true;
-    return Buffer.getPointer();
+    return Buffer.getPtr();
   }
   
   // Check that the file's size is the same as in the file entry (which may
@@ -143,15 +143,15 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
       Diag.Report(Loc, diag::err_file_modified)
         << ContentsEntry->getName();
 
-    Buffer.setInt(Buffer.getInt() | InvalidFlag);
+    BufferFlags |= InvalidFlag;
     if (Invalid) *Invalid = true;
-    return Buffer.getPointer();
+    return Buffer.getPtr();
   }
 
   // If the buffer is valid, check to see if it has a UTF Byte Order Mark
   // (BOM).  We only support UTF-8 with and without a BOM right now.  See
   // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
-  StringRef BufStr = Buffer.getPointer()->getBuffer();
+  StringRef BufStr = Buffer->getBuffer();
   const char *InvalidBOM = llvm::StringSwitch<const char *>(BufStr)
     .StartsWith("\xFE\xFF", "UTF-16 (BE)")
     .StartsWith("\xFF\xFE", "UTF-16 (LE)")
@@ -168,13 +168,13 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
   if (InvalidBOM) {
     Diag.Report(Loc, diag::err_unsupported_bom)
       << InvalidBOM << ContentsEntry->getName();
-    Buffer.setInt(Buffer.getInt() | InvalidFlag);
+    BufferFlags |= InvalidFlag;
   }
   
   if (Invalid)
     *Invalid = isBufferInvalid();
-  
-  return Buffer.getPointer();
+
+  return Buffer.getPtr();
 }
 
 unsigned LineTableInfo::getLineTableFilenameID(StringRef Name) {
@@ -401,7 +401,6 @@ SourceManager::~SourceManager() {
     }
   }
   
-  delete FakeBufferForRecovery;
   delete FakeContentCacheForRecovery;
 
   llvm::DeleteContainerSeconds(MacroArgsCacheMap);
@@ -509,8 +508,8 @@ const llvm::MemoryBuffer *SourceManager::getFakeBufferForRecovery() const {
   if (!FakeBufferForRecovery)
     FakeBufferForRecovery
       = llvm::MemoryBuffer::getMemBuffer("<<<INVALID BUFFER>>");
-  
-  return FakeBufferForRecovery;
+
+  return FakeBufferForRecovery.getPtr();
 }
 
 /// \brief As part of recovering from missing or changed content, produce a
-- 
1.9.1

-------------- next part --------------
>From 0d4b5b0112b7774e1739f022e23b96f3140380d2 Mon Sep 17 00:00:00 2001
From: Alp Toker <alp at nuanti.com>
Date: Wed, 25 Jun 2014 02:51:20 +0300
Subject: [PATCH 2/3] Don't allocate memory for file contents

Eliminate redundant getMemBufferCopy() calls where all we really need is a
reference to the original mapping.

This is made possible by the newly added refcounting support in SourceManager.
---
 lib/AST/ASTImporter.cpp            | 21 ++++++++-------------
 lib/CodeGen/CodeGenAction.cpp      | 12 +++---------
 tools/clang-format/ClangFormat.cpp | 12 +++++++-----
 3 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp
index b180326..6f6e418 100644
--- a/lib/AST/ASTImporter.cpp
+++ b/lib/AST/ASTImporter.cpp
@@ -4909,24 +4909,19 @@ FileID ASTImporter::Import(FileID FromID) {
   FileID ToID;
   const SrcMgr::ContentCache *Cache = FromSLoc.getFile().getContentCache();
   if (Cache->OrigEntry) {
-    // FIXME: We probably want to use getVirtualFile(), so we don't hit the
-    // disk again
-    // FIXME: We definitely want to re-use the existing MemoryBuffer, rather
-    // than mmap the files several times.
-    const FileEntry *Entry = ToFileManager.getFile(Cache->OrigEntry->getName());
+    const FileEntry *Entry = ToFileManager.getVirtualFile(
+        Cache->OrigEntry->getName(), Cache->OrigEntry->getSize(),
+        Cache->OrigEntry->getModificationTime());
+    if (!Cache->isBufferInvalid())
+      ToSM.overrideFileContents(Entry, Cache->getRawBuffer());
     ToID = ToSM.createFileID(Entry, ToIncludeLoc, 
                              FromSLoc.getFile().getFileCharacteristic());
   } else {
-    // FIXME: We want to re-use the existing MemoryBuffer!
-    const llvm::MemoryBuffer *
-        FromBuf = Cache->getBuffer(FromContext.getDiagnostics(), FromSM);
-    llvm::MemoryBuffer *ToBuf
-      = llvm::MemoryBuffer::getMemBufferCopy(FromBuf->getBuffer(),
-                                             FromBuf->getBufferIdentifier());
-    ToID = ToSM.createFileID(ToBuf, FromSLoc.getFile().getFileCharacteristic());
+    const llvm::MemoryBuffer *MB =
+        Cache->getBuffer(FromContext.getDiagnostics(), FromSM);
+    ToID = ToSM.createFileID(MB, FromSLoc.getFile().getFileCharacteristic());
   }
   
-  
   ImportedFileIDs[FromID] = ToID;
   return ToID;
 }
diff --git a/lib/CodeGen/CodeGenAction.cpp b/lib/CodeGen/CodeGenAction.cpp
index f162c30..433be66 100644
--- a/lib/CodeGen/CodeGenAction.cpp
+++ b/lib/CodeGen/CodeGenAction.cpp
@@ -261,19 +261,13 @@ static FullSourceLoc ConvertBackendLocation(const llvm::SMDiagnostic &D,
   // a copy to the Clang source manager.
   const llvm::SourceMgr &LSM = *D.getSourceMgr();
 
-  // We need to copy the underlying LLVM memory buffer because llvm::SourceMgr
-  // already owns its one and clang::SourceManager wants to own its one.
+  // TODO: Maintain a file ID map instead of creating new ones each time.
   const MemoryBuffer *LBuf =
   LSM.getMemoryBuffer(LSM.FindBufferContainingLoc(D.getLoc()));
-
-  // Create the copy and transfer ownership to clang::SourceManager.
-  llvm::MemoryBuffer *CBuf =
-  llvm::MemoryBuffer::getMemBufferCopy(LBuf->getBuffer(),
-                                       LBuf->getBufferIdentifier());
-  FileID FID = CSM.createFileID(CBuf);
+  FileID FID = CSM.createFileID(LBuf);
 
   // Translate the offset into the file.
-  unsigned Offset = D.getLoc().getPointer()  - LBuf->getBufferStart();
+  unsigned Offset = D.getLoc().getPointer() - LBuf->getBufferStart();
   SourceLocation NewLoc =
   CSM.getLocForStartOfFile(FID).getLocWithOffset(Offset);
   return FullSourceLoc(NewLoc, CSM);
diff --git a/tools/clang-format/ClangFormat.cpp b/tools/clang-format/ClangFormat.cpp
index e023949..e7dde70 100644
--- a/tools/clang-format/ClangFormat.cpp
+++ b/tools/clang-format/ClangFormat.cpp
@@ -108,7 +108,7 @@ static FileID createInMemoryFile(StringRef FileName, const MemoryBuffer *Source,
   const FileEntry *Entry = Files.getVirtualFile(FileName == "-" ? "<stdin>" :
                                                     FileName,
                                                 Source->getBufferSize(), 0);
-  Sources.overrideFileContents(Entry, Source, true);
+  Sources.overrideFileContents(Entry, Source);
   return Sources.createFileID(Entry, SourceLocation(), SrcMgr::C_User);
 }
 
@@ -209,16 +209,18 @@ static bool format(StringRef FileName) {
       IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
       new DiagnosticOptions);
   SourceManager Sources(Diagnostics, Files);
-  std::unique_ptr<MemoryBuffer> Code;
-  if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(FileName, Code)) {
+  std::unique_ptr<MemoryBuffer> CodeResult;
+  if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(FileName, CodeResult)) {
     llvm::errs() << ec.message() << "\n";
     return true;
   }
+  IntrusiveRefCntPtr<MemoryBuffer> Code = CodeResult.release();
+
   if (Code->getBufferSize() == 0)
     return false; // Empty files are formatted correctly.
-  FileID ID = createInMemoryFile(FileName, Code.get(), Sources, Files);
+  FileID ID = createInMemoryFile(FileName, Code.getPtr(), Sources, Files);
   std::vector<CharSourceRange> Ranges;
-  if (fillRanges(Sources, ID, Code.get(), Ranges))
+  if (fillRanges(Sources, ID, Code.getPtr(), Ranges))
     return true;
 
   FormatStyle FormatStyle = getStyle(
-- 
1.9.1



More information about the cfe-commits mailing list