LLVM 23.0.0git
OnDiskGraphDB.cpp
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
11/// independent of a particular hashing algorithm. It only needs to be
12/// configured for the hash size and controls the schema of the storage.
13///
14/// OnDiskGraphDB defines:
15///
16/// - How the data is stored inside database, either as a standalone file, or
17/// allocated inside a datapool.
18/// - How references to other objects inside the same database is stored. They
19/// are stored as internal references, instead of full hash value to save
20/// space.
21/// - How to chain databases together and import objects from upstream
22/// databases.
23///
24/// Here's a top-level description of the current layout:
25///
26/// - db/index.<version>: a file for the "index" table, named by \a
27/// IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
28/// that are accessed atomically, describing the object kind and where/how
29/// it's stored (including an optional file offset). See \a TrieRecord for
30/// more details.
31/// - db/data.<version>: a file for the "data" table, named by \a
32/// DataPoolTableName and managed by \a DataStore. New objects within
33/// TrieRecord::MaxEmbeddedSize are inserted here as \a
34/// TrieRecord::StorageKind::DataPool.
35/// - db/obj.<offset>.<version>: a file storing an object outside the main
36/// "data" table, named by its offset into the "index" table, with the
37/// format of \a TrieRecord::StorageKind::Standalone.
38/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
39/// main "data" table, named by its offset into the "index" table, with
40/// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
41/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
42/// outside the main "data" table, named by its offset into the "index" table,
43/// with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
44//
45//===----------------------------------------------------------------------===//
46
48#include "OnDiskCommon.h"
49#include "llvm/ADT/DenseMap.h"
50#include "llvm/ADT/ScopeExit.h"
57#include "llvm/Support/Errc.h"
58#include "llvm/Support/Error.h"
63#include "llvm/Support/Path.h"
65#include <atomic>
66#include <mutex>
67#include <optional>
68#include <variant>
69
70#define DEBUG_TYPE "on-disk-cas"
71
72using namespace llvm;
73using namespace llvm::cas;
74using namespace llvm::cas::ondisk;
75
// Table identifiers for the on-disk index trie and the data pool; versioning
// is carried by the file names (see the file header), not the table names.
static constexpr StringLiteral IndexTableName = "llvm.cas.index";
static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";

// File-name prefixes for the main database files: "index.<version>" and
// "data.<version>".
static constexpr StringLiteral IndexFilePrefix = "index.";
static constexpr StringLiteral DataPoolFilePrefix = "data.";

// File-name prefixes for standalone objects, one per standalone
// TrieRecord::StorageKind: "obj.<offset>.<version>",
// "leaf.<offset>.<version>", and "leaf+0.<offset>.<version>".
static constexpr StringLiteral FilePrefixObject = "obj.";
static constexpr StringLiteral FilePrefixLeaf = "leaf.";
static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
85
87 if (!ID)
88 return ID.takeError();
89
91 "corrupt object '" + toHex(*ID) + "'");
92}
93
94namespace {
95
96/// Trie record data: 8 bytes, atomic<uint64_t>
97/// - 1-byte: StorageKind
98/// - 7-bytes: DataStoreOffset (offset into referenced file)
99class TrieRecord {
100public:
101 enum class StorageKind : uint8_t {
102 /// Unknown object.
103 Unknown = 0,
104
105 /// data.vX: main pool, full DataStore record.
106 DataPool = 1,
107
108 /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
109 Standalone = 10,
110
111 /// leaf.<TrieRecordOffset>.vX: standalone, just the data. File contents
112 /// exactly the data content and file size matches the data size. No refs.
113 StandaloneLeaf = 11,
114
115 /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
116 /// extra null character ('\0'). File size is 1 bigger than the data size.
117 /// No refs.
118 StandaloneLeaf0 = 12,
119 };
120
121 static StringRef getStandaloneFilePrefix(StorageKind SK) {
122 switch (SK) {
123 default:
124 llvm_unreachable("Expected standalone storage kind");
125 case TrieRecord::StorageKind::Standalone:
126 return FilePrefixObject;
127 case TrieRecord::StorageKind::StandaloneLeaf:
128 return FilePrefixLeaf;
129 case TrieRecord::StorageKind::StandaloneLeaf0:
130 return FilePrefixLeaf0;
131 }
132 }
133
134 enum Limits : int64_t {
135 /// Saves files bigger than 64KB standalone instead of embedding them.
136 MaxEmbeddedSize = 64LL * 1024LL - 1,
137 };
138
139 struct Data {
140 StorageKind SK = StorageKind::Unknown;
141 FileOffset Offset;
142 };
143
144 /// Pack StorageKind and Offset from Data into 8 byte TrieRecord.
145 static uint64_t pack(Data D) {
146 assert(D.Offset.get() < (int64_t)(1ULL << 56));
147 uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
148 assert(D.SK != StorageKind::Unknown || Packed == 0);
149#ifndef NDEBUG
150 Data RoundTrip = unpack(Packed);
151 assert(D.SK == RoundTrip.SK);
152 assert(D.Offset.get() == RoundTrip.Offset.get());
153#endif
154 return Packed;
155 }
156
157 // Unpack TrieRecord into Data.
158 static Data unpack(uint64_t Packed) {
159 Data D;
160 if (!Packed)
161 return D;
162 D.SK = (StorageKind)(Packed >> 56);
163 D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
164 return D;
165 }
166
167 TrieRecord() : Storage(0) {}
168
169 Data load() const { return unpack(Storage); }
170 bool compare_exchange_strong(Data &Existing, Data New);
171
172private:
173 std::atomic<uint64_t> Storage;
174};
175
176/// DataStore record data: 4B + size? + refs? + data + 0
177/// - 4-bytes: Header
178/// - {0,4,8}-bytes: DataSize (may be packed in Header)
179/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
180/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
181/// - <data>
182/// - 1-byte: 0-term
/// DataStore record data: 4B + size? + refs? + data + 0
/// - 4-bytes: Header
/// - {0,4,8}-bytes: DataSize (may be packed in Header)
/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
/// - <data>
/// - 1-byte: 0-term
struct DataRecordHandle {
  /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
  /// convenience to avoid computing padding later.
  enum class NumRefsFlags : uint8_t {
    Uses0B = 0U,
    Uses1B = 1U,
    Uses2B = 2U,
    Uses4B = 3U,
    Uses8B = 4U,
    Max = Uses8B,
  };

  /// DataSize storage: 8B, 4B, 2B, or 1B.
  enum class DataSizeFlags {
    Uses1B = 0U,
    Uses2B = 1U,
    Uses4B = 2U,
    Uses8B = 3U,
    Max = Uses8B,
  };

  /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
  enum class RefKindFlags {
    InternalRef = 0U,
    InternalRef4B = 1U,
    Max = InternalRef4B,
  };

  /// Bit positions and widths of the three fields inside LayoutFlags.
  enum Counts : int {
    NumRefsShift = 0,
    NumRefsBits = 3,
    DataSizeShift = NumRefsShift + NumRefsBits,
    DataSizeBits = 2,
    RefKindShift = DataSizeShift + DataSizeBits,
    RefKindBits = 1,
  };
  static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) ==
                    0,
                "Not enough bits");
  static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) ==
                    0,
                "Not enough bits");
  static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) ==
                    0,
                "Not enough bits");

  /// Layout of the DataRecordHandle and how to decode it.
  struct LayoutFlags {
    NumRefsFlags NumRefs;
    DataSizeFlags DataSize;
    RefKindFlags RefKind;

    /// Combine the three flag fields into a single byte-sized value.
    static uint64_t pack(LayoutFlags LF) {
      unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
                        ((unsigned)LF.DataSize << DataSizeShift) |
                        ((unsigned)LF.RefKind << RefKindShift);
#ifndef NDEBUG
      LayoutFlags RoundTrip = unpack(Packed);
      assert(LF.NumRefs == RoundTrip.NumRefs);
      assert(LF.DataSize == RoundTrip.DataSize);
      assert(LF.RefKind == RoundTrip.RefKind);
#endif
      return Packed;
    }
    /// Decode the flag fields from a packed value (inverse of \a pack()).
    static LayoutFlags unpack(uint64_t Storage) {
      assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
      LayoutFlags LF;
      LF.NumRefs =
          (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
      LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
                                    ((1U << DataSizeBits) - 1));
      LF.RefKind =
          (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
      return LF;
    }
  };

  /// Header layout:
  /// - 1-byte: LayoutFlags
  /// - 1-byte: 1B size field
  /// - {0,2}-bytes: 2B size field
  struct Header {
    using PackTy = uint32_t;
    PackTy Packed;

    // LayoutFlags occupy the most significant byte of Packed.
    static constexpr unsigned LayoutFlagsShift =
        (sizeof(PackTy) - 1) * CHAR_BIT;
  };

  /// The refs and data to store in a record.
  struct Input {
    InternalRefArrayRef Refs;
    ArrayRef<char> Data;
  };

  LayoutFlags getLayoutFlags() const {
    return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift);
  }

  uint64_t getDataSize() const;
  void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const;
  uint32_t getNumRefs() const;
  void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const;
  int64_t getRefsRelOffset() const;
  int64_t getDataRelOffset() const;

  /// Total record size; the trailing "+ 1" accounts for the 0-terminator.
  static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
    return DataRelOffset + DataSize + 1;
  }
  uint64_t getTotalSize() const {
    return getDataRelOffset() + getDataSize() + 1;
  }

  /// Describe the layout of data stored and how to decode from
  /// DataRecordHandle.
  struct Layout {
    explicit Layout(const Input &I);

    LayoutFlags Flags;
    uint64_t DataSize = 0;
    uint32_t NumRefs = 0;
    int64_t RefsRelOffset = 0;
    int64_t DataRelOffset = 0;
    uint64_t getTotalSize() const {
      return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
    }
  };

  /// View of the reference array, in whichever width (4B or 8B) the layout
  /// flags indicate.
  InternalRefArrayRef getRefs() const {
    assert(H && "Expected valid handle");
    auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
    size_t Size = getNumRefs();
    if (!Size)
      return InternalRefArrayRef();
    if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
      return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
    return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
  }

  /// View of the data payload (excludes the trailing 0-terminator).
  ArrayRef<char> getData() const {
    assert(H && "Expected valid handle");
    return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
                    getDataSize());
  }

  static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc,
                                 const Input &I);
  static Expected<DataRecordHandle>
  createWithError(function_ref<Expected<char *>(size_t Size)> Alloc,
                  const Input &I);
  static DataRecordHandle construct(char *Mem, const Input &I);

  /// Wrap an existing in-memory record without copying.
  static DataRecordHandle get(const char *Mem) {
    return DataRecordHandle(
        *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
  }
  static Expected<DataRecordHandle>
  getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset);

  explicit operator bool() const { return H; }
  const Header &getHeader() const { return *H; }

  DataRecordHandle() = default;
  explicit DataRecordHandle(const Header &H) : H(&H) {}

private:
  static DataRecordHandle constructImpl(char *Mem, const Input &I,
                                        const Layout &L);
  const Header *H = nullptr;
};
352
353/// Proxy for any on-disk object or raw data.
354struct OnDiskContent {
355 std::optional<DataRecordHandle> Record;
356 std::optional<ArrayRef<char>> Bytes;
357
358 ArrayRef<char> getData() const {
359 if (Bytes)
360 return *Bytes;
361 assert(Record && "Expected record or bytes");
362 return Record->getData();
363 }
364};
365
366/// Data loaded inside the memory from standalone file.
367class StandaloneDataInMemory {
368public:
369 OnDiskContent getContent() const;
370
371 OnDiskGraphDB::FileBackedData
372 getInternalFileBackedObjectData(StringRef RootPath) const;
373
374 StandaloneDataInMemory(std::unique_ptr<sys::fs::mapped_file_region> Region,
375 TrieRecord::StorageKind SK, FileOffset IndexOffset)
376 : Region(std::move(Region)), SK(SK), IndexOffset(IndexOffset) {
377#ifndef NDEBUG
378 bool IsStandalone = false;
379 switch (SK) {
380 case TrieRecord::StorageKind::Standalone:
381 case TrieRecord::StorageKind::StandaloneLeaf:
382 case TrieRecord::StorageKind::StandaloneLeaf0:
383 IsStandalone = true;
384 break;
385 default:
386 break;
387 }
388 assert(IsStandalone);
389#endif
390 }
391
392private:
393 std::unique_ptr<sys::fs::mapped_file_region> Region;
394 TrieRecord::StorageKind SK;
395 FileOffset IndexOffset;
396};
397
398/// Container to lookup loaded standalone objects.
399template <size_t NumShards> class StandaloneDataMap {
400 static_assert(isPowerOf2_64(NumShards), "Expected power of 2");
401
402public:
403 uintptr_t insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
404 std::unique_ptr<sys::fs::mapped_file_region> Region,
405 FileOffset IndexOffset);
406
407 const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
408 bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
409
410private:
411 struct Shard {
412 /// Needs to store a std::unique_ptr for a stable address identity.
413 DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map;
414 mutable std::mutex Mutex;
415 };
416 Shard &getShard(ArrayRef<uint8_t> Hash) {
417 return const_cast<Shard &>(
418 const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
419 }
420 const Shard &getShard(ArrayRef<uint8_t> Hash) const {
421 static_assert(NumShards <= 256, "Expected only 8 bits of shard");
422 return Shards[Hash[0] % NumShards];
423 }
424
425 Shard Shards[NumShards];
426};
427
428using StandaloneDataMapTy = StandaloneDataMap<16>;
429
430/// A vector of internal node references.
/// A vector of internal node references.
///
/// Refs are kept in the compact 4-byte form until one arrives that cannot
/// shrink; at that point every stored ref is widened to the full form and
/// all later refs stay full-width.
class InternalRefVector {
public:
  void push_back(InternalRef Ref) {
    if (NeedsFull)
      return FullRefs.push_back(Ref);
    if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
      return SmallRefs.push_back(*Small);
    // First non-shrinkable ref: migrate all existing refs to full width.
    NeedsFull = true;
    assert(FullRefs.empty());
    FullRefs.reserve(SmallRefs.size() + 1);
    for (InternalRef4B Small : SmallRefs)
      FullRefs.push_back(Small);
    FullRefs.push_back(Ref);
    SmallRefs.clear();
  }

  // Exactly one of SmallRefs/FullRefs holds elements at any time.
  operator InternalRefArrayRef() const {
    assert(SmallRefs.empty() || FullRefs.empty());
    return NeedsFull ? InternalRefArrayRef(FullRefs)
                     : InternalRefArrayRef(SmallRefs);
  }

private:
  bool NeedsFull = false;
};
458
459} // namespace
460
461Expected<DataRecordHandle> DataRecordHandle::createWithError(
462 function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
463 Layout L(I);
464 if (Expected<char *> Mem = Alloc(L.getTotalSize()))
465 return constructImpl(*Mem, I, L);
466 else
467 return Mem.takeError();
468}
469
471 // Store the file offset as it is.
472 assert(!(Offset.get() & 0x1));
473 return ObjectHandle(Offset.get());
474}
475
477 // Store the pointer from memory with lowest bit set.
478 assert(!(Ptr & 0x1));
479 return ObjectHandle(Ptr | 1);
480}
481
482/// Proxy for an on-disk index record.
488
489template <size_t N>
490uintptr_t StandaloneDataMap<N>::insert(
491 ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
492 std::unique_ptr<sys::fs::mapped_file_region> Region,
493 FileOffset IndexOffset) {
494 auto &S = getShard(Hash);
495 std::lock_guard<std::mutex> Lock(S.Mutex);
496 auto &V = S.Map[Hash.data()];
497 if (!V)
498 V = std::make_unique<StandaloneDataInMemory>(std::move(Region), SK,
499 IndexOffset);
500 return reinterpret_cast<uintptr_t>(V.get());
501}
502
503template <size_t N>
504const StandaloneDataInMemory *
505StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
506 auto &S = getShard(Hash);
507 std::lock_guard<std::mutex> Lock(S.Mutex);
508 auto I = S.Map.find(Hash.data());
509 if (I == S.Map.end())
510 return nullptr;
511 return &*I->second;
512}
513
514namespace {
515
516/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
517/// expensive to register/unregister at this rate.
518///
519/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
520/// files and has a signal handler registerd that removes them all.
/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
/// expensive to register/unregister at this rate.
///
/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
/// files and has a signal handler registered that removes them all.
class TempFile {
  // True once the file's fate has been decided (kept, discarded, or
  // moved-from), so cleanup does not run twice.
  bool Done = false;
  TempFile(StringRef Name, int FD, OnDiskCASLogger *Logger)
      : TmpName(std::string(Name)), FD(FD), Logger(Logger) {}

public:
  /// This creates a temporary file with createUniqueFile.
  static Expected<TempFile> create(const Twine &Model, OnDiskCASLogger *Logger);
  TempFile(TempFile &&Other) { *this = std::move(Other); }
  TempFile &operator=(TempFile &&Other) {
    TmpName = std::move(Other.TmpName);
    FD = Other.FD;
    Logger = Other.Logger;
    // Disarm the source so its destructor neither closes nor removes.
    Other.Done = true;
    Other.FD = -1;
    return *this;
  }

  // Name of the temporary file.
  std::string TmpName;

  // The open file descriptor.
  int FD = -1;

  // Optional logger for temp-file lifecycle events; may be null.
  OnDiskCASLogger *Logger = nullptr;

  // Keep this with the given name.
  Error keep(const Twine &Name);
  Error discard();

  // Best-effort cleanup if neither keep() nor discard() was called.
  ~TempFile() { consumeError(discard()); }
};
554
555class MappedTempFile {
556public:
557 char *data() const { return Map.data(); }
558 size_t size() const { return Map.size(); }
559
560 Error discard() {
561 assert(Map && "Map already destroyed");
562 Map.unmap();
563 return Temp.discard();
564 }
565
566 Error keep(const Twine &Name) {
567 assert(Map && "Map already destroyed");
568 Map.unmap();
569 return Temp.keep(Name);
570 }
571
572 MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
573 : Temp(std::move(Temp)), Map(std::move(Map)) {}
574
575private:
576 TempFile Temp;
577 sys::fs::mapped_file_region Map;
578};
579} // namespace
580
582 Done = true;
583 if (FD != -1) {
585 if (std::error_code EC = sys::fs::closeFile(File))
586 return errorCodeToError(EC);
587 }
588 FD = -1;
589
590 // Always try to close and remove.
591 std::error_code RemoveEC;
592 if (!TmpName.empty()) {
593 std::error_code EC = sys::fs::remove(TmpName);
594 if (Logger)
595 Logger->logTempFileRemove(TmpName, EC);
596 if (EC)
597 return errorCodeToError(EC);
598 }
599 TmpName = "";
600
601 return Error::success();
602}
603
605 assert(!Done);
606 Done = true;
607 // Always try to close and rename.
608 std::error_code RenameEC = sys::fs::rename(TmpName, Name);
609
610 if (Logger)
611 Logger->logTempFileKeep(TmpName, Name.str(), RenameEC);
612
613 if (!RenameEC)
614 TmpName = "";
615
617 if (std::error_code EC = sys::fs::closeFile(File))
618 return errorCodeToError(EC);
619 FD = -1;
620
621 return errorCodeToError(RenameEC);
622}
623
626 int FD;
627 SmallString<128> ResultPath;
628 if (std::error_code EC = sys::fs::createUniqueFile(Model, FD, ResultPath))
629 return errorCodeToError(EC);
630
631 if (Logger)
632 Logger->logTempFileCreate(ResultPath);
633
634 TempFile Ret(ResultPath, FD, Logger);
635 return std::move(Ret);
636}
637
638bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
639 uint64_t ExistingPacked = pack(Existing);
640 uint64_t NewPacked = pack(New);
641 if (Storage.compare_exchange_strong(ExistingPacked, NewPacked))
642 return true;
643 Existing = unpack(ExistingPacked);
644 return false;
645}
646
648DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool,
650 auto HeaderData = Pool.get(Offset, sizeof(DataRecordHandle::Header));
651 if (!HeaderData)
652 return HeaderData.takeError();
653
654 auto Record = DataRecordHandle::get(HeaderData->data());
655 if (Record.getTotalSize() + Offset.get() > Pool.size())
656 return createStringError(
657 make_error_code(std::errc::illegal_byte_sequence),
658 "data record span passed the end of the data pool");
659
660 return Record;
661}
662
// Serialize the record described by Layout L for input I into Mem, which
// must be at least L.getTotalSize() bytes.
DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I,
                                                 const Layout &L) {
  char *Next = Mem + sizeof(Header);

  // Fill in Packed and set other data, then come back to construct the header.
  Header::PackTy Packed = 0;
  Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift;

  // Construct DataSize.
  switch (L.Flags.DataSize) {
  case DataSizeFlags::Uses1B:
    assert(I.Data.size() <= UINT8_MAX);
    // 1B slot: the second-highest byte of Packed.
    Packed |= (Header::PackTy)I.Data.size()
              << ((sizeof(Packed) - 2) * CHAR_BIT);
    break;
  case DataSizeFlags::Uses2B:
    assert(I.Data.size() <= UINT16_MAX);
    // 2B slot: the low 16 bits of Packed.
    Packed |= (Header::PackTy)I.Data.size()
              << ((sizeof(Packed) - 4) * CHAR_BIT);
    break;
  case DataSizeFlags::Uses4B:
    support::endian::write32le(Next, I.Data.size());
    Next += 4;
    break;
  case DataSizeFlags::Uses8B:
    support::endian::write64le(Next, I.Data.size());
    Next += 8;
    break;
  }

  // Construct NumRefs.
  //
  // NOTE: May be writing NumRefs even if there are zero refs in order to fix
  // alignment.
  switch (L.Flags.NumRefs) {
  case NumRefsFlags::Uses0B:
    break;
  case NumRefsFlags::Uses1B:
    assert(I.Refs.size() <= UINT8_MAX);
    // Layout guarantees this slot is not also used by DataSize.
    Packed |= (Header::PackTy)I.Refs.size()
              << ((sizeof(Packed) - 2) * CHAR_BIT);
    break;
  case NumRefsFlags::Uses2B:
    assert(I.Refs.size() <= UINT16_MAX);
    // Layout guarantees this slot is not also used by DataSize.
    Packed |= (Header::PackTy)I.Refs.size()
              << ((sizeof(Packed) - 4) * CHAR_BIT);
    break;
  case NumRefsFlags::Uses4B:
    support::endian::write32le(Next, I.Refs.size());
    Next += 4;
    break;
  case NumRefsFlags::Uses8B:
    support::endian::write64le(Next, I.Refs.size());
    Next += 8;
    break;
  }

  // Construct Refs[].
  if (!I.Refs.empty()) {
    assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
    ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
    llvm::copy(RefsBuffer, Next);
    Next += RefsBuffer.size();
  }

  // Construct Data and the trailing null.
  llvm::copy(I.Data, Next);
  Next[I.Data.size()] = 0;

  // Construct the header itself and return.
  Header *H = new (Mem) Header{Packed};
  DataRecordHandle Record(*H);
  // Round-trip checks: the decoded record must match the input exactly.
  assert(Record.getData() == I.Data);
  assert(Record.getNumRefs() == I.Refs.size());
  assert(Record.getRefs() == I.Refs);
  assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
  assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
  assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
  return Record;
}
744
// Compute the most compact record layout for input I: choose where DataSize
// and NumRefs live (packed in the header or out-of-line), pick the ref
// width, and place Refs[] and the data at their final relative offsets.
DataRecordHandle::Layout::Layout(const Input &I) {
  // Start initial relative offsets right after the Header.
  uint64_t RelOffset = sizeof(Header);

  // Initialize the easy stuff.
  DataSize = I.Data.size();
  NumRefs = I.Refs.size();

  // Check refs size.
  Flags.RefKind =
      I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;

  // Find the smallest slot available for DataSize.
  //
  // The 1B and 2B slots live inside the packed header and can each be used
  // by either DataSize or NumRefs, but not both; Has1B/Has2B track which
  // are still free.
  bool Has1B = true;
  bool Has2B = true;
  if (DataSize <= UINT8_MAX && Has1B) {
    Flags.DataSize = DataSizeFlags::Uses1B;
    Has1B = false;
  } else if (DataSize <= UINT16_MAX && Has2B) {
    Flags.DataSize = DataSizeFlags::Uses2B;
    Has2B = false;
  } else if (DataSize <= UINT32_MAX) {
    Flags.DataSize = DataSizeFlags::Uses4B;
    RelOffset += 4;
  } else {
    Flags.DataSize = DataSizeFlags::Uses8B;
    RelOffset += 8;
  }

  // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
  if (!NumRefs) {
    Flags.NumRefs = NumRefsFlags::Uses0B;
  } else if (NumRefs <= UINT8_MAX && Has1B) {
    Flags.NumRefs = NumRefsFlags::Uses1B;
    Has1B = false;
  } else if (NumRefs <= UINT16_MAX && Has2B) {
    Flags.NumRefs = NumRefsFlags::Uses2B;
    Has2B = false;
  } else {
    Flags.NumRefs = NumRefsFlags::Uses4B;
    RelOffset += 4;
  }

  // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
  // padding rules when reading and writing. This also bumps RelOffset.
  //
  // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
  // stored as 8B. This means we can *always* find a size to grow.
  //
  // NOTE: Only call this once.
  auto GrowSizeFieldsBy4B = [&]() {
    assert(isAligned(Align(4), RelOffset));
    RelOffset += 4;

    assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
           "Expected to be able to grow NumRefs8B");

    // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
    // DataSize is upgraded to 8B it'll already be aligned.
    //
    // Failing that, grow NumRefs.
    if (Flags.DataSize < DataSizeFlags::Uses4B)
      Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
    else if (Flags.DataSize < DataSizeFlags::Uses8B)
      Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
    else if (Flags.NumRefs < NumRefsFlags::Uses4B)
      Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
    else
      Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
  };

  assert(isAligned(Align(4), RelOffset));
  if (Flags.RefKind == RefKindFlags::InternalRef) {
    // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
    // without padding.
    if (!isAligned(Align(8), RelOffset))
      GrowSizeFieldsBy4B();

    assert(isAligned(Align(8), RelOffset));
    RefsRelOffset = RelOffset;
    RelOffset += 8 * NumRefs;
  } else {
    // The array of 4B refs doesn't need 8B alignment, but the data will need
    // to be 8B-aligned. Detect this now, and, if necessary, shift everything
    // by 4B by growing one of the sizes.
    // If we remove the need for 8B-alignment for data there is <1% savings in
    // disk storage for a clang build using MCCAS but the 8B-alignment may be
    // useful in the future so keep it for now.
    uint64_t RefListSize = 4 * NumRefs;
    if (!isAligned(Align(8), RelOffset + RefListSize))
      GrowSizeFieldsBy4B();
    RefsRelOffset = RelOffset;
    RelOffset += RefListSize;
  }

  assert(isAligned(Align(8), RelOffset));
  DataRelOffset = RelOffset;
}
843
// Decode DataSize from wherever the layout placed it: packed into the
// header (1B/2B) or as the first out-of-line field after the header.
uint64_t DataRecordHandle::getDataSize() const {
  int64_t RelOffset = sizeof(Header);
  auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset;
  switch (getLayoutFlags().DataSize) {
  case DataSizeFlags::Uses1B:
    // Packed into the second-highest byte of Header::Packed.
    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
  case DataSizeFlags::Uses2B:
    // Packed into the low 16 bits of Header::Packed.
    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
           UINT16_MAX;
  case DataSizeFlags::Uses4B:
    return support::endian::read32le(DataSizePtr);
  case DataSizeFlags::Uses8B:
    return support::endian::read64le(DataSizePtr);
  }
  llvm_unreachable("Unknown DataSizeFlags enum");
}
860
861void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
862 if (LF.DataSize >= DataSizeFlags::Uses4B)
863 RelOffset += 4;
864 if (LF.DataSize >= DataSizeFlags::Uses8B)
865 RelOffset += 4;
866}
867
// Decode NumRefs from wherever the layout placed it: absent (0B), packed
// into the header (1B/2B), or out-of-line directly after DataSize.
uint32_t DataRecordHandle::getNumRefs() const {
  LayoutFlags LF = getLayoutFlags();
  int64_t RelOffset = sizeof(Header);
  skipDataSize(LF, RelOffset);
  auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
  switch (LF.NumRefs) {
  case NumRefsFlags::Uses0B:
    return 0;
  case NumRefsFlags::Uses1B:
    // Packed into the second-highest byte of Header::Packed.
    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
  case NumRefsFlags::Uses2B:
    // Packed into the low 16 bits of Header::Packed.
    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
           UINT16_MAX;
  case NumRefsFlags::Uses4B:
    return support::endian::read32le(NumRefsPtr);
  case NumRefsFlags::Uses8B:
    return support::endian::read64le(NumRefsPtr);
  }
  llvm_unreachable("Unknown NumRefsFlags enum");
}
888
889void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
890 if (LF.NumRefs >= NumRefsFlags::Uses4B)
891 RelOffset += 4;
892 if (LF.NumRefs >= NumRefsFlags::Uses8B)
893 RelOffset += 4;
894}
895
896int64_t DataRecordHandle::getRefsRelOffset() const {
897 LayoutFlags LF = getLayoutFlags();
898 int64_t RelOffset = sizeof(Header);
899 skipDataSize(LF, RelOffset);
900 skipNumRefs(LF, RelOffset);
901 return RelOffset;
902}
903
904int64_t DataRecordHandle::getDataRelOffset() const {
905 LayoutFlags LF = getLayoutFlags();
906 int64_t RelOffset = sizeof(Header);
907 skipDataSize(LF, RelOffset);
908 skipNumRefs(LF, RelOffset);
909 uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
910 RelOffset += RefSize * getNumRefs();
911 return RelOffset;
912}
913
915 if (UpstreamDB) {
916 if (auto E = UpstreamDB->validate(Deep, Hasher))
917 return E;
918 }
919 return Index.validate([&](FileOffset Offset,
921 -> Error {
922 auto formatError = [&](Twine Msg) {
923 return createStringError(
925 "bad record at 0x" +
926 utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
927 Msg.str());
928 };
929
930 if (Record.Data.size() != sizeof(TrieRecord))
931 return formatError("wrong data record size");
932 if (!isAligned(Align::Of<TrieRecord>(), Record.Data.size()))
933 return formatError("wrong data record alignment");
934
935 auto *R = reinterpret_cast<const TrieRecord *>(Record.Data.data());
936 TrieRecord::Data D = R->load();
937 std::unique_ptr<MemoryBuffer> FileBuffer;
938 if ((uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Unknown &&
939 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::DataPool &&
940 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Standalone &&
941 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf &&
942 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf0)
943 return formatError("invalid record kind value");
944
946 auto I = getIndexProxyFromRef(Ref);
947 if (!I)
948 return I.takeError();
949
950 switch (D.SK) {
951 case TrieRecord::StorageKind::Unknown:
952 // This could be an abandoned entry due to a termination before updating
953 // the record. It can be reused by later insertion so just skip this entry
954 // for now.
955 return Error::success();
956 case TrieRecord::StorageKind::DataPool:
957 // Check offset is a postive value, and large enough to hold the
958 // header for the data record.
959 if (D.Offset.get() <= 0 ||
960 D.Offset.get() + sizeof(DataRecordHandle::Header) >= DataPool.size())
961 return formatError("datapool record out of bound");
962 break;
963 case TrieRecord::StorageKind::Standalone:
964 case TrieRecord::StorageKind::StandaloneLeaf:
965 case TrieRecord::StorageKind::StandaloneLeaf0:
966 SmallString<256> Path;
967 getStandalonePath(TrieRecord::getStandaloneFilePrefix(D.SK), I->Offset,
968 Path);
969 // If need to validate the content of the file later, just load the
970 // buffer here. Otherwise, just check the existance of the file.
971 if (Deep) {
972 auto File = MemoryBuffer::getFile(Path, /*IsText=*/false,
973 /*RequiresNullTerminator=*/false);
974 if (!File || !*File)
975 return formatError("record file \'" + Path + "\' does not exist");
976
977 FileBuffer = std::move(*File);
978 } else if (!llvm::sys::fs::exists(Path))
979 return formatError("record file \'" + Path + "\' does not exist");
980 }
981
982 if (!Deep)
983 return Error::success();
984
985 auto dataError = [&](Twine Msg) {
987 "bad data for digest \'" + toHex(I->Hash) +
988 "\': " + Msg.str());
989 };
991 ArrayRef<char> StoredData;
992
993 switch (D.SK) {
994 case TrieRecord::StorageKind::Unknown:
995 llvm_unreachable("already handled");
996 case TrieRecord::StorageKind::DataPool: {
997 auto DataRecord = DataRecordHandle::getFromDataPool(DataPool, D.Offset);
998 if (!DataRecord)
999 return dataError(toString(DataRecord.takeError()));
1000
1001 for (auto InternRef : DataRecord->getRefs()) {
1002 auto Index = getIndexProxyFromRef(InternRef);
1003 if (!Index)
1004 return Index.takeError();
1005 Refs.push_back(Index->Hash);
1006 }
1007 StoredData = DataRecord->getData();
1008 break;
1009 }
1010 case TrieRecord::StorageKind::Standalone: {
1011 if (FileBuffer->getBufferSize() < sizeof(DataRecordHandle::Header))
1012 return dataError("data record is not big enough to read the header");
1013 auto DataRecord = DataRecordHandle::get(FileBuffer->getBufferStart());
1014 if (DataRecord.getTotalSize() < FileBuffer->getBufferSize())
1015 return dataError(
1016 "data record span passed the end of the standalone file");
1017 for (auto InternRef : DataRecord.getRefs()) {
1018 auto Index = getIndexProxyFromRef(InternRef);
1019 if (!Index)
1020 return Index.takeError();
1021 Refs.push_back(Index->Hash);
1022 }
1023 StoredData = DataRecord.getData();
1024 break;
1025 }
1026 case TrieRecord::StorageKind::StandaloneLeaf:
1027 case TrieRecord::StorageKind::StandaloneLeaf0: {
1028 StoredData = arrayRefFromStringRef<char>(FileBuffer->getBuffer());
1029 if (D.SK == TrieRecord::StorageKind::StandaloneLeaf0) {
1030 if (!FileBuffer->getBuffer().ends_with('\0'))
1031 return dataError("standalone file is not zero terminated");
1032 StoredData = StoredData.drop_back(1);
1033 }
1034 break;
1035 }
1036 }
1037
1038 SmallVector<uint8_t> ComputedHash;
1039 Hasher(Refs, StoredData, ComputedHash);
1040 if (I->Hash != ArrayRef(ComputedHash))
1041 return dataError("hash mismatch, got \'" + toHex(ComputedHash) +
1042 "\' instead");
1043
1044 return Error::success();
1045 });
1046}
1047
// Validate that an external reference is well formed: non-zero, resolvable to
// an index entry, and round-trippable through the hash index back to the same
// offset.
// NOTE(review): the signature line of this definition (and one argument of
// createStringError) was elided by the source extraction — body reproduced
// as-is.
  auto formatError = [&](Twine Msg) {
    return createStringError(
        "bad ref=0x" +
        utohexstr(ExternalRef.getOpaqueData(), /*LowerCase=*/true) + ": " +
        Msg.str());
  };

  // A zero opaque value is reserved and never a valid reference.
  if (ExternalRef.getOpaqueData() == 0)
    return formatError("zero is not a valid ref");

  InternalRef InternalRef = getInternalRef(ExternalRef);
  auto I = getIndexProxyFromRef(InternalRef);
  if (!I)
    return formatError(llvm::toString(I.takeError()));
  auto Hash = getDigest(*I);

  // Round-trip: look the digest back up and confirm it maps to the same
  // index offset as the reference we started from.
  OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Hash);
  if (!P)
    return formatError("not found using hash " + toHex(Hash));
  IndexProxy OtherI = getIndexProxyFromPointer(P);
  ObjectID OtherRef = getExternalReference(makeInternalRef(OtherI.Offset));
  if (OtherRef != ExternalRef)
    return formatError("ref does not match indexed offset " +
                       utohexstr(OtherRef.getOpaqueData(), /*LowerCase=*/true) +
                       " for hash " + toHex(Hash));
  return Error::success();
}
1077
// Dump a human-readable summary of the database: the root path, every index
// entry's storage kind/offset, and the layout of records in the data pool.
// NOTE(review): the signature line, PoolInfo's member declaration, and the
// Pool container declaration were elided by the source extraction.
  OS << "on-disk-root-path: " << RootPath << "\n";

  struct PoolInfo {
  };

  OS << "\n";
  OS << "index:\n";
  Index.print(OS, [&](ArrayRef<char> Data) {
    assert(Data.size() == sizeof(TrieRecord));
    auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
    TrieRecord::Data D = R->load();
    OS << " SK=";
    switch (D.SK) {
    case TrieRecord::StorageKind::Unknown:
      OS << "unknown          ";
      break;
    case TrieRecord::StorageKind::DataPool:
      OS << "datapool         ";
      // Remember pool offsets so the pool section below can describe them.
      Pool.push_back({D.Offset.get()});
      break;
    case TrieRecord::StorageKind::Standalone:
      OS << "standalone-data  ";
      break;
    case TrieRecord::StorageKind::StandaloneLeaf:
      OS << "standalone-leaf  ";
      break;
    case TrieRecord::StorageKind::StandaloneLeaf0:
      OS << "standalone-leaf+0";
      break;
    }
    OS << " Offset=" << (void *)D.Offset.get();
  });
  if (Pool.empty())
    return;

  OS << "\n";
  OS << "pool:\n";
  // Print pool records in file order.
  llvm::sort(
      Pool, [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
  for (PoolInfo PI : Pool) {
    OS << "- addr=" << (void *)PI.Offset << " ";
    auto D = DataRecordHandle::getFromDataPool(DataPool, FileOffset(PI.Offset));
    if (!D) {
      OS << "error: " << toString(D.takeError());
      return;
    }

    OS << "record refs=" << D->getNumRefs() << " data=" << D->getDataSize()
       << " size=" << D->getTotalSize()
       << " end=" << (void *)(PI.Offset + D->getTotalSize()) << "\n";
  }
}
1134
// Insert \p Hash into the index (if not already present), constructing its
// TrieRecord in place, and return a proxy for the resulting entry.
// NOTE(review): the return-type line of this definition was elided by the
// source extraction.
OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
  auto P = Index.insertLazy(
      Hash, [](FileOffset TentativeOffset,
               OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
        assert(TentativeValue.Data.size() == sizeof(TrieRecord));
        assert(
            isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
        // Placement-new a default TrieRecord only when the entry is new.
        new (TentativeValue.Data.data()) TrieRecord();
      });
  if (LLVM_UNLIKELY(!P))
    return P.takeError();

  assert(*P && "Expected insertion");
  return getIndexProxyFromPointer(*P);
}
1151
// Build an IndexProxy (offset, digest, mutable record) from a valid on-disk
// pointer into the index table.
// NOTE(review): the parameter line of this definition was elided by the
// source extraction.
OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
  assert(P);
  assert(P.getOffset());
  // The const_cast exposes the record for atomic updates; the underlying
  // mapping is writable even though the trie pointer is const.
  return IndexProxy{P.getOffset(), P->Hash,
                    *const_cast<TrieRecord *>(
                        reinterpret_cast<const TrieRecord *>(P->Data.data()))};
}
1160
// Get (or create) the external reference for \p Hash by indexing the digest.
// NOTE(review): the signature line of this definition was elided by the
// source extraction.
  auto I = indexHash(Hash);
  if (LLVM_UNLIKELY(!I))
    return I.takeError();
  return getExternalReference(*I);
}
1167
1168ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
1169 return getExternalReference(makeInternalRef(I.Offset));
1170}
1171
// Return a reference for \p Digest without storing new content, optionally
// consulting the upstream database when the object is unknown locally.
// NOTE(review): the line carrying the function name and first parameter was
// elided by the source extraction; the visible trailing parameter is
// `bool CheckUpstream`.
std::optional<ObjectID>
                                 bool CheckUpstream) {
  // If the object only exists upstream, index its digest locally (without
  // content) so a stable local ID can still be handed out.
  auto tryUpstream =
      [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
    if (!CheckUpstream || !UpstreamDB)
      return std::nullopt;
    std::optional<ObjectID> UpstreamID =
        UpstreamDB->getExistingReference(Digest);
    if (LLVM_UNLIKELY(!UpstreamID))
      return std::nullopt;
    auto Ref = expectedToOptional(indexHash(Digest));
    if (!Ref)
      return std::nullopt;
    if (!I)
      I.emplace(*Ref);
    return getExternalReference(*I);
  };

  OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Digest);
  if (!P)
    return tryUpstream(std::nullopt);
  IndexProxy I = getIndexProxyFromPointer(P);
  TrieRecord::Data Obj = I.Ref.load();
  // Indexed but with Unknown storage means no local content yet.
  if (Obj.SK == TrieRecord::StorageKind::Unknown)
    return tryUpstream(I);
  return getExternalReference(makeInternalRef(I.Offset));
}
1200
// Recover the index entry referenced by an internal ref (which encodes a
// file offset into the index table).
// NOTE(review): the return-type line of this definition was elided by the
// source extraction.
OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
  auto P = Index.recoverFromFileOffset(Ref.getFileOffset());
  if (LLVM_UNLIKELY(!P))
    return P.takeError();
  return getIndexProxyFromPointer(*P);
}
1208
// Look up the digest of an internal ref via its index entry.
// NOTE(review): the signature line of this definition was elided by the
// source extraction.
  auto I = getIndexProxyFromRef(Ref);
  if (!I)
    return I.takeError();
  return I->Hash;
}
1215
1216ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
1217 return I.Hash;
1218}
1219
// Decode an ObjectHandle into either an in-memory standalone object or a
// data-pool record. The low bit of the opaque value discriminates: when set,
// the remaining bits are a StandaloneDataInMemory pointer; when clear, the
// value is a data-pool file offset.
// NOTE(review): the first parameter line of this definition was elided by
// the source extraction.
static std::variant<const StandaloneDataInMemory *, DataRecordHandle>
                              ObjectHandle OH) {
  // Decode ObjectHandle to locate the stored content.
  uint64_t Data = OH.getOpaqueData();
  if (Data & 1) {
    // Mask off the tag bit to recover the aligned pointer.
    const auto *SDIM =
        reinterpret_cast<const StandaloneDataInMemory *>(Data & (-1ULL << 1));
    return SDIM;
  }

  auto DataHandle =
      cantFail(DataRecordHandle::getFromDataPool(DataPool, FileOffset(Data)));
  assert(DataHandle.getData().end()[0] == 0 && "Null termination");
  return DataHandle;
}
1236
1237static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
1238 ObjectHandle OH) {
1239 auto SDIMOrRecord = getStandaloneDataOrDataRecord(DataPool, OH);
1240 if (std::holds_alternative<const StandaloneDataInMemory *>(SDIMOrRecord)) {
1241 return std::get<const StandaloneDataInMemory *>(SDIMOrRecord)->getContent();
1242 } else {
1243 auto DataHandle = std::get<DataRecordHandle>(std::move(SDIMOrRecord));
1244 return OnDiskContent{std::move(DataHandle), std::nullopt};
1245 }
1246}
1247
// Return the raw data bytes of a loaded object.
// NOTE(review): the signature line of this definition was elided by the
// source extraction.
  OnDiskContent Content = getContentFromHandle(DataPool, Node);
  return Content.getData();
}
1252
1253InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
1254 if (std::optional<DataRecordHandle> Record =
1255 getContentFromHandle(DataPool, Node).Record)
1256 return Record->getRefs();
1257 return std::nullopt;
1258}
1259
// Return the object's data together with optional backing-file information;
// the file info enables importing standalone leaves file-to-file.
// NOTE(review): the signature lines of this definition were elided by the
// source extraction.
  auto SDIMOrRecord = getStandaloneDataOrDataRecord(DataPool, Node);
  if (std::holds_alternative<const StandaloneDataInMemory *>(SDIMOrRecord)) {
    auto *SDIM = std::get<const StandaloneDataInMemory *>(SDIMOrRecord);
    return SDIM->getInternalFileBackedObjectData(RootPath);
  } else {
    // Pool-resident records have no backing file of their own.
    auto DataHandle = std::get<DataRecordHandle>(std::move(SDIMOrRecord));
    return FileBackedData{DataHandle.getData(), /*FileInfo=*/std::nullopt};
  }
}
1271
// Load the object behind \p ExternalRef: fault in from upstream when there is
// no local storage, return pool objects directly, and map standalone files
// into memory on first use.
// NOTE(review): several lines of this definition were elided by the source
// extraction (the signature, a corrupt-object return, a file_status
// declaration, two error returns, and the final return's opening line);
// elision points are marked inline.
  InternalRef Ref = getInternalRef(ExternalRef);
  auto I = getIndexProxyFromRef(Ref);
  if (!I)
    return I.takeError();
  TrieRecord::Data Object = I->Ref.load();

  // Unknown storage: the object is not materialized locally yet.
  if (Object.SK == TrieRecord::StorageKind::Unknown)
    return faultInFromUpstream(ExternalRef);

  if (Object.SK == TrieRecord::StorageKind::DataPool)
    return ObjectHandle::fromFileOffset(Object.Offset);

  // Only TrieRecord::StorageKind::Standalone (and variants) need to be
  // explicitly loaded.
  //
  // There's corruption if standalone objects have offsets, or if we get here
  // for something that isn't standalone.
  if (Object.Offset)
    // (elided: corrupt-object error return, original line 1292)
  switch (Object.SK) {
  case TrieRecord::StorageKind::Unknown:
  case TrieRecord::StorageKind::DataPool:
    llvm_unreachable("unexpected storage kind");
  case TrieRecord::StorageKind::Standalone:
  case TrieRecord::StorageKind::StandaloneLeaf0:
  case TrieRecord::StorageKind::StandaloneLeaf:
    break;
  }

  // Load it from disk.
  //
  // Note: Creation logic guarantees that data that needs null-termination is
  // suitably 0-padded. Requiring null-termination here would be too expensive
  // for extremely large objects that happen to be page-aligned.
  SmallString<256> Path;
  getStandalonePath(TrieRecord::getStandaloneFilePrefix(Object.SK), I->Offset,
                    Path);

  auto BypassSandbox = sys::sandbox::scopedDisable();

  auto File = sys::fs::openNativeFileForRead(Path);
  if (!File)
    return createFileError(Path, File.takeError());

  // Ensure the native file handle is released on every exit path.
  llvm::scope_exit CloseFile([&]() { sys::fs::closeFile(*File); });

  // (elided: file_status declaration, original line 1320)
  if (std::error_code EC = sys::fs::status(*File, Status))
  // (elided: error return, original line 1322)

  std::error_code EC;
  auto Region = std::make_unique<sys::fs::mapped_file_region>(
      *File, sys::fs::mapped_file_region::readonly, Status.getSize(), 0, EC);
  if (EC)
  // (elided: error return, original line 1328; and the final return's
  //  opening line, original line 1330)
  static_cast<StandaloneDataMapTy *>(StandaloneData)
      ->insert(I->Hash, Object.SK, std::move(Region), I->Offset));
}
1334
// Presence query that faults in from upstream when the object only exists
// there, so subsequent local loads succeed.
// NOTE(review): the signature line of this definition was elided by the
// source extraction — presumably it returns Expected<bool>; confirm against
// the header.
  auto Presence = getObjectPresence(Ref, /*CheckUpstream=*/true);
  if (!Presence)
    return Presence.takeError();

  switch (*Presence) {
  case ObjectPresence::Missing:
    return false;
  case ObjectPresence::InPrimaryDB:
    return true;
  case ObjectPresence::OnlyInUpstreamDB:
    // Import now; any import failure is propagated to the caller.
    if (auto FaultInResult = faultInFromUpstream(Ref); !FaultInResult)
      return FaultInResult.takeError();
    return true;
  }
  llvm_unreachable("Unknown ObjectPresence enum");
}
1352
// Classify where the object lives: materialized in the primary DB, known only
// to the upstream DB, or missing entirely.
// NOTE(review): the return-type line of this definition was elided by the
// source extraction.
OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,
                                 bool CheckUpstream) const {
  InternalRef Ref = getInternalRef(ExternalRef);
  auto I = getIndexProxyFromRef(Ref);
  if (!I)
    return I.takeError();

  TrieRecord::Data Object = I->Ref.load();
  // Any concrete storage kind means the object has local content.
  if (Object.SK != TrieRecord::StorageKind::Unknown)
    return ObjectPresence::InPrimaryDB;

  if (!CheckUpstream || !UpstreamDB)
    return ObjectPresence::Missing;

  std::optional<ObjectID> UpstreamID =
      UpstreamDB->getExistingReference(getDigest(*I));
  return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB
                                : ObjectPresence::Missing;
}
1373
1374InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
1375 return InternalRef::getFromOffset(IndexOffset);
1376}
1377
// Build the path of a standalone object file:
// "<root>/<prefix><index-offset>.<version>". Kept as a free function so it
// can be used without an OnDiskGraphDB instance.
static void getStandalonePath(StringRef RootPath, StringRef Prefix,
                              FileOffset IndexOffset,
                              SmallVectorImpl<char> &Path) {
  Path.assign(RootPath.begin(), RootPath.end());
  // The Twine concatenation stays inside the call expression, which keeps its
  // temporaries alive for the duration of append().
  sys::path::append(Path,
                    Prefix + Twine(IndexOffset.get()) + "." + CASFormatVersion);
}
1385
1386void OnDiskGraphDB::getStandalonePath(StringRef Prefix, FileOffset IndexOffset,
1387 SmallVectorImpl<char> &Path) const {
1388 return ::getStandalonePath(RootPath, Prefix, IndexOffset, Path);
1389}
1390
1391OnDiskContent StandaloneDataInMemory::getContent() const {
1392 bool Leaf0 = false;
1393 bool Leaf = false;
1394 switch (SK) {
1395 default:
1396 llvm_unreachable("Storage kind must be standalone");
1397 case TrieRecord::StorageKind::Standalone:
1398 break;
1399 case TrieRecord::StorageKind::StandaloneLeaf0:
1400 Leaf = Leaf0 = true;
1401 break;
1402 case TrieRecord::StorageKind::StandaloneLeaf:
1403 Leaf = true;
1404 break;
1405 }
1406
1407 if (Leaf) {
1408 StringRef Data(Region->data(), Region->size());
1409 assert(Data.drop_back(Leaf0).end()[0] == 0 &&
1410 "Standalone node data missing null termination");
1411 return OnDiskContent{std::nullopt,
1412 arrayRefFromStringRef<char>(Data.drop_back(Leaf0))};
1413 }
1414
1415 DataRecordHandle Record = DataRecordHandle::get(Region->data());
1416 assert(Record.getData().end()[0] == 0 &&
1417 "Standalone object record missing null termination for data");
1418 return OnDiskContent{Record, std::nullopt};
1419}
1420
1421OnDiskGraphDB::FileBackedData
1422StandaloneDataInMemory::getInternalFileBackedObjectData(
1423 StringRef RootPath) const {
1424 switch (SK) {
1425 case TrieRecord::StorageKind::Unknown:
1426 case TrieRecord::StorageKind::DataPool:
1427 llvm_unreachable("unexpected storage kind");
1428 case TrieRecord::StorageKind::Standalone:
1429 return OnDiskGraphDB::FileBackedData{getContent().getData(),
1430 /*FileInfo=*/std::nullopt};
1431 case TrieRecord::StorageKind::StandaloneLeaf0:
1432 case TrieRecord::StorageKind::StandaloneLeaf:
1433 bool IsFileNulTerminated = SK == TrieRecord::StorageKind::StandaloneLeaf0;
1434 SmallString<256> Path;
1435 ::getStandalonePath(RootPath, TrieRecord::getStandaloneFilePrefix(SK),
1436 IndexOffset, Path);
1437 return OnDiskGraphDB::FileBackedData{
1438 getContent().getData(), OnDiskGraphDB::FileBackedData::FileInfoTy{
1439 std::string(Path), IsFileNulTerminated}};
1440 }
1441 llvm_unreachable("Unknown StorageKind enum");
1442}
1443
// Create a temp file next to \p FinalPath, size it to \p Size, and map it
// read/write so the caller can fill it before keep()/rename.
// NOTE(review): the signature line and the mapped_file_region construction
// line were elided by the source extraction; elision points marked inline.
static Expected<MappedTempFile>
  auto BypassSandbox = sys::sandbox::scopedDisable();

  assert(Size && "Unexpected request for an empty temp file");
  Expected<TempFile> File = TempFile::create(FinalPath + ".%%%%%%", Logger);
  if (!File)
    return File.takeError();

  // presumably reserves disk blocks up front so later writes cannot fail with
  // out-of-space mid-stream — TODO confirm preallocateFileTail semantics.
  if (Error E = preallocateFileTail(File->FD, 0, Size).takeError())
    return createFileError(File->TmpName, std::move(E));

  if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size))
    return createFileError(File->TmpName, EC);

  std::error_code EC;
  // (elided: mapped_file_region construction, original lines 1460-1461)
      0, EC);
  if (EC)
    return createFileError(File->TmpName, EC);
  return MappedTempFile(std::move(*File), std::move(Map));
}
1467
// Return the system page size, computed once and cached.
// NOTE(review): the static initialization line was elided by the source
// extraction.
static size_t getPageSize() {
  return PageSize;
}
1472
// Store a large leaf (bigger than the data pool's embed limit) as its own
// file on disk, then publish the storage kind into the index entry with a
// compare-exchange to tolerate racing writers.
Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
  assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
         "Expected a bigger file for external content...");

  // Page-aligned payloads get an explicit trailing nul byte (Leaf0 variant);
  // other sizes rely on the mapped file's zero padding.
  bool Leaf0 = isAligned(Align(getPageSize()), Data.size());
  TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
                                     : TrieRecord::StorageKind::StandaloneLeaf;

  SmallString<256> Path;
  // Leaf0 files are one byte larger to hold the explicit terminator.
  int64_t FileSize = Data.size() + Leaf0;
  getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I.Offset, Path);

  auto BypassSandbox = sys::sandbox::scopedDisable();

  // Write the file. Don't reuse this mapped_file_region, which is read/write.
  // Let load() pull up one that's read-only.
  Expected<MappedTempFile> File = createTempFile(Path, FileSize, Logger.get());
  if (!File)
    return File.takeError();
  assert(File->size() == (uint64_t)FileSize);
  llvm::copy(Data, File->data());
  if (Leaf0)
    File->data()[Data.size()] = 0;
  assert(File->data()[Data.size()] == 0);
  if (Error E = File->keep(Path))
    return E;

  // Store the object reference.
  TrieRecord::Data Existing;
  {
    // Standalone storage carries no pool offset.
    TrieRecord::Data Leaf{SK, FileOffset()};
    if (I.Ref.compare_exchange_strong(Existing, Leaf)) {
      recordStandaloneSizeIncrease(FileSize);
      return Error::success();
    }
  }

  // If there was a race, confirm that the new value has valid storage.
  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    return createCorruptObjectError(getDigest(I));

  return Error::success();
}
1516
// Store an object (refs + data): small records go into the data pool, big
// leaves become standalone files, and pool exhaustion falls back to a
// standalone file. Publication into the index uses compare-exchange so a
// racing writer's result is kept instead of overwritten.
// NOTE(review): the signature lines and one corrupt-object return were elided
// by the source extraction; elision points marked inline.
  auto I = getIndexProxyFromRef(getInternalRef(ID));
  if (LLVM_UNLIKELY(!I))
    return I.takeError();

  // Early return in case the node exists.
  {
    TrieRecord::Data Existing = I->Ref.load();
    if (Existing.SK != TrieRecord::StorageKind::Unknown)
      return Error::success();
  }

  // Big leaf nodes.
  if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
    return createStandaloneLeaf(*I, Data);

  // TODO: Check whether it's worth checking the index for an already existing
  // object (like storeTreeImpl() does) before building up the
  // InternalRefVector.
  InternalRefVector InternalRefs;
  for (ObjectID Ref : Refs)
    InternalRefs.push_back(getInternalRef(Ref));

  // Create the object.

  DataRecordHandle::Input Input{InternalRefs, Data};

  // Compute the storage kind, allocate it, and create the record.
  TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
  FileOffset PoolOffset;
  SmallString<256> Path;
  std::optional<MappedTempFile> File;
  std::optional<uint64_t> FileSize;
  // Fallback allocator: place the record in its own standalone file.
  auto AllocStandaloneFile = [&](size_t Size) -> Expected<char *> {
    getStandalonePath(TrieRecord::getStandaloneFilePrefix(
                          TrieRecord::StorageKind::Standalone),
                      I->Offset, Path);
    if (Error E = createTempFile(Path, Size, Logger.get()).moveInto(File))
      return std::move(E);
    assert(File->size() == Size);
    FileSize = Size;
    SK = TrieRecord::StorageKind::Standalone;
    return File->data();
  };
  // Primary allocator: embed in the data pool when small enough; if the pool
  // reports out-of-memory, retry as a standalone file.
  auto Alloc = [&](size_t Size) -> Expected<char *> {
    if (Size <= TrieRecord::MaxEmbeddedSize) {
      SK = TrieRecord::StorageKind::DataPool;
      auto P = DataPool.allocate(Size);
      if (LLVM_UNLIKELY(!P)) {
        char *NewAlloc = nullptr;
        auto NewE = handleErrors(
            P.takeError(), [&](std::unique_ptr<StringError> E) -> Error {
              if (E->convertToErrorCode() == std::errc::not_enough_memory)
                return AllocStandaloneFile(Size).moveInto(NewAlloc);
              return Error(std::move(E));
            });
        if (!NewE)
          return NewAlloc;
        return std::move(NewE);
      }
      PoolOffset = P->getOffset();
      LLVM_DEBUG({
        dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
               << " size=" << Size
               << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
      });
      return (*P)->data();
    }
    return AllocStandaloneFile(Size);
  };

  DataRecordHandle Record;
  if (Error E =
          DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
    return E;
  assert(Record.getData().end()[0] == 0 && "Expected null-termination");
  assert(Record.getData() == Input.Data && "Expected initialization");
  assert(SK != TrieRecord::StorageKind::Unknown);
  assert(bool(File) != bool(PoolOffset) &&
         "Expected either a mapped file or a pooled offset");

  // Check for a race before calling MappedTempFile::keep().
  //
  // Then decide what to do with the file. Better to discard than overwrite if
  // another thread/process has already added this.
  TrieRecord::Data Existing = I->Ref.load();
  {
    TrieRecord::Data NewObject{SK, PoolOffset};
    if (File) {
      if (Existing.SK == TrieRecord::StorageKind::Unknown) {
        // Keep the file!
        if (Error E = File->keep(Path))
          return E;
      } else {
        File.reset();
      }
    }

    // If we didn't already see a racing/existing write, then try storing the
    // new object. If that races, confirm that the new value has valid storage.
    //
    // TODO: Find a way to reuse the storage from the new-but-abandoned record
    // handle.
    if (Existing.SK == TrieRecord::StorageKind::Unknown) {
      if (I->Ref.compare_exchange_strong(Existing, NewObject)) {
        if (FileSize)
          recordStandaloneSizeIncrease(*FileSize);
        return Error::success();
      }
    }
  }

  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    // (elided: corrupt-object error return, original line 1631)

  // Load existing object.
  return Error::success();
}
1636
// Public storeFile entry point: no upstream-import hint.
// NOTE(review): the signature line of this definition was elided by the
// source extraction.
  return storeFile(ID, FilePath, /*ImportKind=*/std::nullopt);
}
1640
// Store a file's contents: small files are embedded via store(); large files
// become standalone leaves by copying into a temp file and renaming into
// place. When ImportKind is set, the file came from an upstream import and
// already carries a trailing nul when required.
// NOTE(review): the first signature line and one raw_fd_ostream argument line
// were elided by the source extraction; elision points marked inline.
    ObjectID ID, StringRef FilePath,
    std::optional<InternalUpstreamImportKind> ImportKind) {
  auto I = getIndexProxyFromRef(getInternalRef(ID));
  if (LLVM_UNLIKELY(!I))
    return I.takeError();

  // Early return in case the node exists.
  {
    TrieRecord::Data Existing = I->Ref.load();
    if (Existing.SK != TrieRecord::StorageKind::Unknown)
      return Error::success();
  }

  auto BypassSandbox = sys::sandbox::scopedDisable();

  uint64_t FileSize;
  if (std::error_code EC = sys::fs::file_size(FilePath, FileSize))
    return createFileError(FilePath, EC);

  // Small files go through the normal store() path into the data pool.
  if (FileSize <= TrieRecord::MaxEmbeddedSize) {
    auto Buf = MemoryBuffer::getFile(FilePath);
    if (!Buf)
      return createFileError(FilePath, Buf.getError());
    return store(ID, {}, arrayRefFromStringRef<char>((*Buf)->getBuffer()));
  }

  // Copy into a unique temp file under the database root, then rename into
  // the final standalone path below.
  UniqueTempFile UniqueTmp;
  auto ExpectedPath = UniqueTmp.createAndCopyFrom(RootPath, FilePath);
  if (!ExpectedPath)
    return ExpectedPath.takeError();
  StringRef TmpPath = *ExpectedPath;

  TrieRecord::StorageKind SK;
  if (ImportKind.has_value()) {
    // Importing the file from upstream, the nul is already added if necessary.
    switch (*ImportKind) {
    case InternalUpstreamImportKind::Leaf:
      SK = TrieRecord::StorageKind::StandaloneLeaf;
      break;
    case InternalUpstreamImportKind::Leaf0:
      SK = TrieRecord::StorageKind::StandaloneLeaf0;
      break;
    }
  } else {
    // Page-aligned files need an explicit trailing nul (Leaf0 variant).
    bool Leaf0 = isAligned(Align(getPageSize()), FileSize);
    SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
               : TrieRecord::StorageKind::StandaloneLeaf;

    if (Leaf0) {
      // Add a nul byte at the end.
      std::error_code EC;
      raw_fd_ostream OS(TmpPath, EC, sys::fs::CD_OpenExisting,
                        // (elided argument, original line 1694)
      if (EC)
        return createFileError(TmpPath, EC);
      OS.write(0);
      OS.close();
      if (OS.has_error())
        return createFileError(TmpPath, OS.error());
    }
  }

  SmallString<256> StandalonePath;
  getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I->Offset,
                    StandalonePath);
  if (Error E = UniqueTmp.renameTo(StandalonePath))
    return E;

  // Store the object reference.
  TrieRecord::Data Existing;
  {
    TrieRecord::Data Leaf{SK, FileOffset()};
    if (I->Ref.compare_exchange_strong(Existing, Leaf)) {
      recordStandaloneSizeIncrease(FileSize);
      return Error::success();
    }
  }

  // If there was a race, confirm that the new value has valid storage.
  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    return createCorruptObjectError(getDigest(*I));

  return Error::success();
}
1726
1727void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
1728 standaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
1729}
1730
1731std::atomic<uint64_t> &OnDiskGraphDB::standaloneStorageSize() const {
1732 MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
1733 assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
1734 assert(isAddrAligned(Align(8), UserHeader.data()));
1735 return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
1736}
1737
1738uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
1739 return standaloneStorageSize().load(std::memory_order_relaxed);
1740}
1741
// Total persistent footprint: index table + data pool + standalone files.
// NOTE(review): the signature line of this definition was elided by the
// source extraction.
  return Index.size() + DataPool.size() + getStandaloneStorageSize();
}
1745
// Report utilization as the larger used-percentage of the two mapped tables.
// NOTE(review): the signature line of this definition was elided by the
// source extraction.
  unsigned IndexPercent = Index.size() * 100ULL / Index.capacity();
  unsigned DataPercent = DataPool.size() * 100ULL / DataPool.capacity();
  return std::max(IndexPercent, DataPercent);
}
1751
// Open (or create) a graph database rooted at AbsPath, mapping the index and
// data-pool tables with platform-appropriate maximum sizes.
// NOTE(review): the leading signature lines, two path-append statements, the
// opening lines of the two table-open calls, and an error construction were
// elided by the source extraction; elision points marked inline.
    unsigned HashByteSize, OnDiskGraphDB *UpstreamDB,
    std::shared_ptr<OnDiskCASLogger> Logger,
    FaultInPolicy Policy) {
  if (std::error_code EC = sys::fs::create_directories(AbsPath))
    return createFileError(AbsPath, EC);

  constexpr uint64_t MB = 1024ull * 1024ull;
  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;

  uint64_t MaxIndexSize = 12 * GB;
  uint64_t MaxDataPoolSize = 24 * GB;

  // Hosts that cannot map large regions get smaller defaults.
  if (useSmallMappingSize(AbsPath)) {
    MaxIndexSize = 1 * GB;
    MaxDataPoolSize = 2 * GB;
  }

  // An external override can pin both maximum mapping sizes.
  auto CustomSize = getOverriddenMaxMappingSize();
  if (!CustomSize)
    return CustomSize.takeError();
  if (*CustomSize)
    MaxIndexSize = MaxDataPoolSize = **CustomSize;

  SmallString<256> IndexPath(AbsPath);
  // (elided: path append + table-open call opening, original lines 1778/1780)
  std::optional<OnDiskTrieRawHashMap> Index;
      IndexPath, IndexTableName + "[" + HashName + "]",
      HashByteSize * CHAR_BIT,
      /*DataSize=*/sizeof(TrieRecord), MaxIndexSize,
      /*MinFileSize=*/MB, Logger)
          .moveInto(Index))
    return std::move(E);

  // The data pool's user header stores the standalone-storage size counter
  // (see standaloneStorageSize()).
  uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);

  SmallString<256> DataPoolPath(AbsPath);
  // (elided: path append + table-open call opening, original lines 1791/1795)
  std::optional<OnDiskDataAllocator> DataPool;
  StringRef PolicyName =
      Policy == FaultInPolicy::SingleNode ? "single" : "full";
      DataPoolPath,
      DataPoolTableName + "[" + HashName + "]" + PolicyName,
      MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize, Logger,
      [](void *UserHeaderPtr) {
        new (UserHeaderPtr) std::atomic<uint64_t>(0);
      })
          .moveInto(DataPool))
    return std::move(E);
  if (DataPool->getUserHeader().size() != UserHeaderSize)
    // (elided: error construction opening, original line 1805)
    "unexpected user header in '" + DataPoolPath +
    "'");

  return std::unique_ptr<OnDiskGraphDB>(
      new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
                        UpstreamDB, Policy, std::move(Logger)));
}
1813
// Construct over already-opened table mappings; ownership of the tables moves
// into the members. UpstreamDB is held as a raw pointer and is not deleted by
// this class's destructor.
OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
                             OnDiskDataAllocator DataPool,
                             OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy,
                             std::shared_ptr<OnDiskCASLogger> Logger)
    : Index(std::move(Index)), DataPool(std::move(DataPool)),
      RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy),
      Logger(std::move(Logger)) {
  /// Lifetime for "big" objects not in DataPool.
  ///
  /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something
  /// simpler on the assumption there won't be much contention since most data
  /// is not big. If there is contention, and we've already fixed ObjectProxy
  /// object handles to be cheap enough to use consistently, the fix might be
  /// to use better use of them rather than optimizing this map.
  ///
  /// FIXME: Figure out the right number of shards, if any.
  StandaloneData = new StandaloneDataMapTy();
}
1832
// Tear down the standalone-data map allocated in the constructor; it is
// stored type-erased, so cast back before deleting.
// NOTE(review): the destructor's signature line was elided by the source
// extraction.
  delete static_cast<StandaloneDataMapTy *>(StandaloneData);
}
1836
Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
                                    ObjectHandle UpstreamNode) {
  // Copies the full CAS tree from upstream. Uses depth-first copying to protect
  // against the process dying during importing and leaving the database with an
  // incomplete tree. Note that if the upstream has missing nodes then the tree
  // will be copied with missing nodes as well, it won't be considered an error.
  // NOTE(review): the source extraction elided UpstreamCursor's node/iterator
  // members and the declaration of the cursor-stack container.
  struct UpstreamCursor {
    size_t RefsCount;
  };
  /// Keeps track of the state of visitation for current node and all of its
  /// parents.
  /// Keeps track of the currently visited nodes as they are imported into
  /// primary database, from current node and its parents. When a node is
  /// entered for visitation it appends its own ID, then appends referenced IDs
  /// as they get imported. When a node is fully imported it removes the
  /// referenced IDs from the bottom of the stack which leaves its own ID at the
  /// bottom, adding to the list of referenced IDs for the parent node.
  SmallVector<ObjectID, 128> PrimaryNodesStack;

  // Push a node's primary ID; when its upstream handle is present, also push
  // a cursor over its upstream references for depth-first traversal.
  auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
    PrimaryNodesStack.push_back(PrimaryID);
    if (!Node)
      return;
    auto Refs = UpstreamDB->getObjectRefs(*Node);
    CursorStack.push_back(
        {*Node, (size_t)llvm::size(Refs), Refs.begin(), Refs.end()});
  };

  enqueueNode(PrimaryID, UpstreamNode);

  while (!CursorStack.empty()) {
    UpstreamCursor &Cur = CursorStack.back();
    if (Cur.RefI == Cur.RefE) {
      // Copy the node data into the primary store.

      // The bottom of \p PrimaryNodesStack contains the primary ID for the
      // current node plus the list of imported referenced IDs.
      assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
      ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
      auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
                             .slice(PrimaryNodesStack.size() - Cur.RefsCount);
      if (Error E = importUpstreamData(PrimaryID, PrimaryRefs, Cur.Node))
        return E;
      // Remove the current node and its IDs from the stack.
      PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
      CursorStack.pop_back();
      continue;
    }

    ObjectID UpstreamID = *(Cur.RefI++);
    auto PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID));
    if (LLVM_UNLIKELY(!PrimaryID))
      return PrimaryID.takeError();
    if (containsObject(*PrimaryID, /*CheckUpstream=*/false)) {
      // This \p ObjectID already exists in the primary. Either it was imported
      // via \p importFullTree or the client created it, in which case the
      // client takes responsibility for how it was formed.
      enqueueNode(*PrimaryID, std::nullopt);
      continue;
    }
    Expected<std::optional<ObjectHandle>> UpstreamNode =
        UpstreamDB->load(UpstreamID);
    if (!UpstreamNode)
      return UpstreamNode.takeError();
    enqueueNode(*PrimaryID, *UpstreamNode);
  }

  assert(PrimaryNodesStack.size() == 1);
  assert(PrimaryNodesStack.front() == PrimaryID);
  return Error::success();
}
1912
Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
                                      ObjectHandle UpstreamNode) {
  // Copies only a single node, it doesn't copy the referenced nodes.

  // Translate each upstream reference into a primary-database ID by digest.
  // NOTE(review): the declaration of the local `Refs` container was elided by
  // the source extraction.
  auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
  Refs.reserve(llvm::size(UpstreamRefs));
  for (ObjectID UpstreamRef : UpstreamRefs) {
    auto Ref = getReference(UpstreamDB->getDigest(UpstreamRef));
    if (LLVM_UNLIKELY(!Ref))
      return Ref.takeError();
    Refs.push_back(*Ref);
  }

  return importUpstreamData(PrimaryID, Refs, UpstreamNode);
}
1929
1930Error OnDiskGraphDB::importUpstreamData(ObjectID PrimaryID,
1931 ArrayRef<ObjectID> PrimaryRefs,
1932 ObjectHandle UpstreamNode) {
1933 // If there are references we can't copy an upstream's standalone file because
1934 // we need to re-resolve the reference offsets it contains.
1935 if (PrimaryRefs.empty()) {
1936 auto FBData = UpstreamDB->getInternalFileBackedObjectData(UpstreamNode);
1937 if (FBData.FileInfo.has_value()) {
1938 // Disk-space optimization, import the file directly since it is a
1939 // standalone leaf.
1940 return storeFile(
1941 PrimaryID, FBData.FileInfo->FilePath,
1942 /*InternalUpstreamImport=*/FBData.FileInfo->IsFileNulTerminated
1943 ? InternalUpstreamImportKind::Leaf0
1944 : InternalUpstreamImportKind::Leaf);
1945 }
1946 }
1947
1948 auto Data = UpstreamDB->getObjectData(UpstreamNode);
1949 return store(PrimaryID, PrimaryRefs, Data);
1950}
1951
1952Expected<std::optional<ObjectHandle>>
1953OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
1954 if (!UpstreamDB)
1955 return std::nullopt;
1956
1957 auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));
1958 if (LLVM_UNLIKELY(!UpstreamID))
1959 return UpstreamID.takeError();
1960
1961 Expected<std::optional<ObjectHandle>> UpstreamNode =
1962 UpstreamDB->load(*UpstreamID);
1963 if (!UpstreamNode)
1964 return UpstreamNode.takeError();
1965 if (!*UpstreamNode)
1966 return std::nullopt;
1967
1968 if (Error E = FIPolicy == FaultInPolicy::SingleNode
1969 ? importSingleNode(PrimaryID, **UpstreamNode)
1970 : importFullTree(PrimaryID, **UpstreamNode))
1971 return std::move(E);
1972 return load(PrimaryID);
1973}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Mark last scratch load
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_UNLIKELY(EXPR)
Definition Compiler.h:336
This file defines the DenseMap class.
static cl::opt< int > PageSize("imp-null-check-page-size", cl::desc("The page size of the target in bytes"), cl::init(4096), cl::Hidden)
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
#define I(x, y, z)
Definition MD5.cpp:57
#define H(x, y, z)
Definition MD5.cpp:56
This file declares interface for OnDiskCASLogger, an interface that can be used to log CAS events to ...
This file declares interface for OnDiskDataAllocator, a file backed data pool can be used to allocate...
static constexpr StringLiteral FilePrefixLeaf0
static constexpr StringLiteral DataPoolTableName
static constexpr StringLiteral FilePrefixObject
static constexpr StringLiteral FilePrefixLeaf
static constexpr StringLiteral IndexFilePrefix
static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool, ObjectHandle OH)
static constexpr StringLiteral DataPoolFilePrefix
static Error createCorruptObjectError(Expected< ArrayRef< uint8_t > > ID)
static std::variant< const StandaloneDataInMemory *, DataRecordHandle > getStandaloneDataOrDataRecord(const OnDiskDataAllocator &DataPool, ObjectHandle OH)
static size_t getPageSize()
static void getStandalonePath(StringRef RootPath, StringRef Prefix, FileOffset IndexOffset, SmallVectorImpl< char > &Path)
static Expected< MappedTempFile > createTempFile(StringRef FinalPath, uint64_t Size, OnDiskCASLogger *Logger)
static constexpr StringLiteral IndexTableName
This declares OnDiskGraphDB, an ondisk CAS database with a fixed length hash.
This file declares interface for OnDiskTrieRawHashMap, a thread-safe and (mostly) lock-free hash map ...
#define P(N)
Provides a library for accessing information about this process and other processes on the operating ...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static Split data
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
The Input class is used to parse a yaml document into in-memory structs and vectors.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:201
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
const T * data() const
Definition ArrayRef.h:139
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
Logging utility - given an ordered specification of features, and assuming a scalar reward,...
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void reserve(size_type N)
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
iterator begin() const
Definition StringRef.h:113
iterator end() const
Definition StringRef.h:115
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
uint64_t get() const
Definition FileOffset.h:26
Handle to a loaded object in a ObjectStore instance.
LLVM_ABI_FOR_TEST Expected< ArrayRef< char > > get(FileOffset Offset, size_t Size) const
Get the data of Size stored at the given Offset.
static LLVM_ABI_FOR_TEST Expected< OnDiskDataAllocator > create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, uint32_t UserHeaderSize=0, std::shared_ptr< ondisk::OnDiskCASLogger > Logger=nullptr, function_ref< void(void *)> UserHeaderInit=nullptr)
LLVM_ABI_FOR_TEST size_t size() const
OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps.
static LLVM_ABI_FOR_TEST Expected< OnDiskTrieRawHashMap > create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, std::shared_ptr< ondisk::OnDiskCASLogger > Logger=nullptr, std::optional< size_t > NewTableNumRootBits=std::nullopt, std::optional< size_t > NewTableNumSubtrieBits=std::nullopt)
Gets or creates a file at Path with a hash-mapped trie named TrieName.
static std::optional< InternalRef4B > tryToShrink(InternalRef Ref)
Shrink to 4B reference.
Array of internal node references.
Standard 8 byte reference inside OnDiskGraphDB.
static InternalRef getFromOffset(FileOffset Offset)
Handle for a loaded node object.
static ObjectHandle fromFileOffset(FileOffset Offset)
static ObjectHandle fromMemory(uintptr_t Ptr)
Reference to a node.
uint64_t getOpaqueData() const
Interface for logging low-level on-disk cas operations.
On-disk CAS nodes database, independent of a particular hashing algorithm.
FaultInPolicy
How to fault-in nodes if an upstream database is used.
@ SingleNode
Copy only the requested node.
void print(raw_ostream &OS) const
LLVM_ABI_FOR_TEST Error validateObjectID(ObjectID ID) const
Checks that ID exists in the index.
LLVM_ABI_FOR_TEST Expected< std::optional< ObjectHandle > > load(ObjectID Ref)
Expected< bool > isMaterialized(ObjectID Ref)
Check whether the object associated with Ref is stored in the CAS.
Error validate(bool Deep, HashingFuncT Hasher) const
Validate the OnDiskGraphDB.
object_refs_range getObjectRefs(ObjectHandle Node) const
unsigned getHardStorageLimitUtilization() const
LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef< ObjectID > Refs, ArrayRef< char > Data)
Associate data & references with a particular object ID.
ArrayRef< uint8_t > getDigest(ObjectID Ref) const
FileBackedData getInternalFileBackedObjectData(ObjectHandle Node) const
Provides access to the underlying file path, that represents an object leaf node, when available.
LLVM_ABI_FOR_TEST Error storeFile(ObjectID ID, StringRef FilePath)
Associates the data of a file with a particular object ID.
LLVM_ABI_FOR_TEST size_t getStorageSize() const
static LLVM_ABI_FOR_TEST Expected< std::unique_ptr< OnDiskGraphDB > > open(StringRef Path, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB *UpstreamDB=nullptr, std::shared_ptr< OnDiskCASLogger > Logger=nullptr, FaultInPolicy Policy=FaultInPolicy::FullTree)
Open the on-disk store from a directory.
bool containsObject(ObjectID Ref, bool CheckUpstream=true) const
Check whether the object associated with Ref is stored in the CAS.
LLVM_ABI_FOR_TEST Expected< ObjectID > getReference(ArrayRef< uint8_t > Hash)
Form a reference for the provided hash.
function_ref< void( ArrayRef< ArrayRef< uint8_t > >, ArrayRef< char >, SmallVectorImpl< uint8_t > &)> HashingFuncT
Hashing function type for validation.
LLVM_ABI_FOR_TEST ArrayRef< char > getObjectData(ObjectHandle Node) const
LLVM_ABI_FOR_TEST std::optional< ObjectID > getExistingReference(ArrayRef< uint8_t > Digest, bool CheckUpstream=true)
Get an existing reference to the object Digest.
Helper RAII class for copying a file to a unique file path.
Error renameTo(StringRef RenameToPath)
Rename the new unique file to RenameToPath.
Expected< StringRef > createAndCopyFrom(StringRef ParentPath, StringRef CopyFromPath)
Create a new unique file path under ParentPath and copy the contents of CopyFromPath into it.
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
static unsigned getPageSizeEstimate()
Get the process's estimated page size.
Definition Process.h:62
LLVM_ABI Error keep(const Twine &Name)
static LLVM_ABI Expected< TempFile > create(const Twine &Model, unsigned Mode=all_read|all_write, OpenFlags ExtraFlags=OF_None)
This creates a temporary file with createUniqueFile and schedules it for deletion with sys::RemoveFil...
Represents the result of a call to sys::fs::status().
Definition FileSystem.h:222
This class represents a memory mapped file.
LLVM_ABI size_t size() const
Definition Path.cpp:1180
@ readonly
May only access map via const_data as read only.
@ readwrite
May access map via data and modify it. Written to path.
LLVM_ABI char * data() const
Definition Path.cpp:1185
#define UINT64_MAX
Definition DataTypes.h:77
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
constexpr StringLiteral CASFormatVersion
The version for all the ondisk database files.
Expected< std::optional< uint64_t > > getOverriddenMaxMappingSize()
Retrieves an overridden maximum mapping size for CAS files, if any, speicified by LLVM_CAS_MAX_MAPPIN...
Expected< size_t > preallocateFileTail(int FD, size_t CurrentSize, size_t NewSize)
Allocate space for the file FD on disk, if the filesystem supports it.
bool useSmallMappingSize(const Twine &Path)
Whether to use a small file mapping for ondisk databases created in Path.
uint64_t getDataSize(const FuncRecordTy *Record)
Return the coverage map data size for the function.
uint64_t read64le(const void *P)
Definition Endian.h:435
void write64le(void *P, uint64_t V)
Definition Endian.h:478
void write32le(void *P, uint32_t V)
Definition Endian.h:475
uint32_t read32le(const void *P)
Definition Endian.h:432
LLVM_ABI std::error_code closeFile(file_t &F)
Close the file object.
LLVM_ABI std::error_code rename(const Twine &from, const Twine &to)
Rename from to to.
std::error_code resize_file_before_mapping_readwrite(int FD, uint64_t Size)
Resize FD to Size before mapping mapped_file_region::readwrite.
Definition FileSystem.h:410
LLVM_ABI bool exists(const basic_file_status &status)
Does file exist?
Definition Path.cpp:1088
@ OF_Append
The file should be opened in append mode.
Definition FileSystem.h:767
LLVM_ABI std::error_code createUniqueFile(const Twine &Model, int &ResultFD, SmallVectorImpl< char > &ResultPath, OpenFlags Flags=OF_None, unsigned Mode=all_read|all_write)
Create a uniquely named file.
Definition Path.cpp:872
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
@ CD_OpenExisting
CD_OpenExisting - When opening a file:
Definition FileSystem.h:737
LLVM_ABI Expected< file_t > openNativeFileForRead(const Twine &Name, OpenFlags Flags=OF_None, SmallVectorImpl< char > *RealPath=nullptr)
Opens the file with the given name in a read-only mode, returning its open file descriptor.
LLVM_ABI std::error_code create_directories(const Twine &path, bool IgnoreExisting=true, perms Perms=owner_all|group_all)
Create all the non-existent directories in path.
Definition Path.cpp:974
LLVM_ABI file_t convertFDToNativeFile(int FD)
Converts from a Posix file descriptor number to a native file handle.
Definition FileSystem.h:991
LLVM_ABI std::error_code status(const Twine &path, file_status &result, bool follow=true)
Get file status as if by POSIX stat().
std::error_code file_size(const Twine &Path, uint64_t &Result)
Get file size.
Definition FileSystem.h:684
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
ScopedSetting scopedDisable()
Definition IOSandbox.h:36
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
Error createFileError(const Twine &F, Error E)
Concatenate a source file path and/or name with an Error.
Definition Error.h:1399
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
ArrayRef< CharT > arrayRefFromStringRef(StringRef Input)
Construct a string ref from an array ref of unsigned chars.
std::error_code make_error_code(BitcodeError E)
@ Done
Definition Threading.h:60
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition Alignment.h:134
Error handleErrors(Error E, HandlerTs &&... Hs)
Pass the ErrorInfo(s) contained in E to their respective handlers.
Definition Error.h:967
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:267
std::string utohexstr(uint64_t X, bool LowerCase=false, unsigned Width=0)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Error createStringError(std::error_code EC, char const *Fmt, const Ts &... Vals)
Create formatted StringError object.
Definition Error.h:1305
@ argument_out_of_domain
Definition Errc.h:37
@ illegal_byte_sequence
Definition Errc.h:52
@ invalid_argument
Definition Errc.h:56
std::optional< T > expectedToOptional(Expected< T > &&E)
Convert an Expected to an Optional without doing anything.
Definition Error.h:1094
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ Other
Any other memory.
Definition ModRef.h:68
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
void toHex(ArrayRef< uint8_t > Input, bool LowerCase, SmallVectorImpl< char > &Output)
Convert buffer Input to its hexadecimal representation. The returned string is double the size of Inp...
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
LLVM_ABI Error errorCodeToError(std::error_code EC)
Helper for converting an std::error_code to a Error.
Definition Error.cpp:107
void consumeError(Error Err)
Consume a Error without doing anything.
Definition Error.h:1083
bool isAddrAligned(Align Lhs, const void *Addr)
Checks that Addr is a multiple of the alignment.
Definition Alignment.h:139
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
Proxy for an on-disk index record.
static constexpr Align Of()
Allow constructions of constexpr Align from types.
Definition Alignment.h:94
Const value proxy to access the records stored in TrieRawHashMap.
Value proxy to access the records stored in TrieRawHashMap.
Encapsulates file info for an underlying object node.