LLVM 23.0.0git
OnDiskGraphDB.cpp
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
11/// independent of a particular hashing algorithm. It only needs to be
12/// configured for the hash size and controls the schema of the storage.
13///
14/// OnDiskGraphDB defines:
15///
16/// - How the data is stored inside database, either as a standalone file, or
17/// allocated inside a datapool.
18/// - How references to other objects inside the same database are stored. They
19/// are stored as internal references, instead of the full hash value, to save
20/// space.
21/// - How to chain databases together and import objects from upstream
22/// databases.
23///
24/// Here's a top-level description of the current layout:
25///
26/// - db/index.<version>: a file for the "index" table, named by \a
27/// IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
28/// that are accessed atomically, describing the object kind and where/how
29/// it's stored (including an optional file offset). See \a TrieRecord for
30/// more details.
31/// - db/data.<version>: a file for the "data" table, named by \a
32/// DataPoolTableName and managed by \a DataStore. New objects within
33/// TrieRecord::MaxEmbeddedSize are inserted here as \a
34/// TrieRecord::StorageKind::DataPool.
35/// - db/obj.<offset>.<version>: a file storing an object outside the main
36/// "data" table, named by its offset into the "index" table, with the
37/// format of \a TrieRecord::StorageKind::Standalone.
38/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
39/// main "data" table, named by its offset into the "index" table, with
40/// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
41/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
42/// outside the main "data" table, named by its offset into the "index" table,
43/// with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
44//
45//===----------------------------------------------------------------------===//
46
48#include "OnDiskCommon.h"
49#include "llvm/ADT/DenseMap.h"
50#include "llvm/ADT/ScopeExit.h"
57#include "llvm/Support/Errc.h"
58#include "llvm/Support/Error.h"
63#include "llvm/Support/Path.h"
65#include <atomic>
66#include <mutex>
67#include <optional>
68#include <variant>
69
70#define DEBUG_TYPE "on-disk-cas"
71
72using namespace llvm;
73using namespace llvm::cas;
74using namespace llvm::cas::ondisk;
75
76static constexpr StringLiteral IndexTableName = "llvm.cas.index";
77static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";
78
79static constexpr StringLiteral IndexFilePrefix = "index.";
80static constexpr StringLiteral DataPoolFilePrefix = "data.";
81
82static constexpr StringLiteral FilePrefixObject = "obj.";
83static constexpr StringLiteral FilePrefixLeaf = "leaf.";
84static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
85
87 if (!ID)
88 return ID.takeError();
89
91 "corrupt object '" + toHex(*ID) + "'");
92}
93
94namespace {
95
96/// Trie record data: 8 bytes, atomic<uint64_t>
97/// - 1-byte: StorageKind
98/// - 7-bytes: DataStoreOffset (offset into referenced file)
99class TrieRecord {
100public:
101 enum class StorageKind : uint8_t {
102 /// Unknown object.
103 Unknown = 0,
104
105 /// data.vX: main pool, full DataStore record.
106 DataPool = 1,
107
108 /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
109 Standalone = 10,
110
111 /// leaf.<TrieRecordOffset>.vX: standalone, just the data. File contents
112 /// exactly the data content and file size matches the data size. No refs.
113 StandaloneLeaf = 11,
114
115 /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
116 /// extra null character ('\0'). File size is 1 bigger than the data size.
117 /// No refs.
118 StandaloneLeaf0 = 12,
119 };
120
  /// Return the filename prefix ("obj.", "leaf.", "leaf+0.") used for the
  /// standalone file of \p SK. Must only be called with one of the three
  /// standalone storage kinds; anything else is unreachable.
121 static StringRef getStandaloneFilePrefix(StorageKind SK) {
122 switch (SK) {
123 default:
124 llvm_unreachable("Expected standalone storage kind");
125 case TrieRecord::StorageKind::Standalone:
126 return FilePrefixObject;
127 case TrieRecord::StorageKind::StandaloneLeaf:
128 return FilePrefixLeaf;
129 case TrieRecord::StorageKind::StandaloneLeaf0:
130 return FilePrefixLeaf0;
131 }
132 }
133
134 enum Limits : int64_t {
135 /// Saves files bigger than 64KB standalone instead of embedding them.
136 MaxEmbeddedSize = 64LL * 1024LL - 1,
137 };
138
  /// Unpacked view of one 8-byte trie record: where the object lives (SK)
  /// and, for DataPool storage, its offset in that file.
139 struct Data {
140 StorageKind SK = StorageKind::Unknown;
141 FileOffset Offset;
142 };
143
144 /// Pack StorageKind and Offset from Data into 8 byte TrieRecord.
145 static uint64_t pack(Data D) {
  // Offset must fit in the low 56 bits; the top byte holds the kind.
146 assert(D.Offset.get() < (int64_t)(1ULL << 56));
147 uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
  // A zero word is reserved to mean "Unknown with no offset".
148 assert(D.SK != StorageKind::Unknown || Packed == 0);
149#ifndef NDEBUG
150 Data RoundTrip = unpack(Packed);
151 assert(D.SK == RoundTrip.SK);
152 assert(D.Offset.get() == RoundTrip.Offset.get());
153#endif
154 return Packed;
155 }
156
157 /// Unpack TrieRecord into Data.
158 static Data unpack(uint64_t Packed) {
159 Data D;
160 if (!Packed)
161 return D;
162 D.SK = (StorageKind)(Packed >> 56);
163 D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
164 return D;
165 }
166
167 TrieRecord() : Storage(0) {}
168
  /// Atomically read and unpack the current record value.
169 Data load() const { return unpack(Storage); }
  /// Atomically swap in \p New iff the stored value still equals
  /// \p Existing; on failure \p Existing is updated to the current value.
170 bool compare_exchange_strong(Data &Existing, Data New);
171
172private:
  // The 8-byte record itself; all access is via atomic load/CAS so the
  // index file can be updated concurrently.
173 std::atomic<uint64_t> Storage;
174};
175
176/// DataStore record data: 4B + size? + refs? + data + 0
177/// - 4-bytes: Header
178/// - {0,4,8}-bytes: DataSize (may be packed in Header)
179/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
180/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
181/// - <data>
182/// - 1-byte: 0-term
183struct DataRecordHandle {
184 /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
185 /// convenience to avoid computing padding later.
186 enum class NumRefsFlags : uint8_t {
187 Uses0B = 0U,
188 Uses1B = 1U,
189 Uses2B = 2U,
190 Uses4B = 3U,
191 Uses8B = 4U,
192 Max = Uses8B,
193 };
194
195 /// DataSize storage: 8B, 4B, 2B, or 1B.
196 enum class DataSizeFlags {
197 Uses1B = 0U,
198 Uses2B = 1U,
199 Uses4B = 2U,
200 Uses8B = 3U,
201 Max = Uses8B,
202 };
203
204 /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
205 enum class RefKindFlags {
206 InternalRef = 0U,
207 InternalRef4B = 1U,
208 Max = InternalRef4B,
209 };
210
  /// Bit positions/widths of the three flag fields within the packed
  /// LayoutFlags byte.
211 enum Counts : int {
212 NumRefsShift = 0,
213 NumRefsBits = 3,
214 DataSizeShift = NumRefsShift + NumRefsBits,
215 DataSizeBits = 2,
216 RefKindShift = DataSizeShift + DataSizeBits,
217 RefKindBits = 1,
218 };
  // Each enum's Max value must fit in its allotted bit width.
219 static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) ==
220 0,
221 "Not enough bits");
222 static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) ==
223 0,
224 "Not enough bits");
225 static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) ==
226 0,
227 "Not enough bits");
228
229 /// Layout of the DataRecordHandle and how to decode it.
230 struct LayoutFlags {
231 NumRefsFlags NumRefs;
232 DataSizeFlags DataSize;
233 RefKindFlags RefKind;
234
  /// Pack the three flag fields into a single byte (round-trip checked in
  /// debug builds).
235 static uint64_t pack(LayoutFlags LF) {
236 unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
237 ((unsigned)LF.DataSize << DataSizeShift) |
238 ((unsigned)LF.RefKind << RefKindShift);
239#ifndef NDEBUG
240 LayoutFlags RoundTrip = unpack(Packed);
241 assert(LF.NumRefs == RoundTrip.NumRefs);
242 assert(LF.DataSize == RoundTrip.DataSize);
243 assert(LF.RefKind == RoundTrip.RefKind);
244#endif
245 return Packed;
246 }
  /// Decode a packed flag byte back into its three fields.
247 static LayoutFlags unpack(uint64_t Storage) {
248 assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
249 LayoutFlags LF;
250 LF.NumRefs =
251 (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
252 LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
253 ((1U << DataSizeBits) - 1));
254 LF.RefKind =
255 (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
256 return LF;
257 }
258 };
259
260 /// Header layout:
261 /// - 1-byte: LayoutFlags
262 /// - 1-byte: 1B size field
263 /// - {0,2}-bytes: 2B size field
264 struct Header {
265 using PackTy = uint32_t;
266 PackTy Packed;
267
  // LayoutFlags live in the most significant byte of Packed.
268 static constexpr unsigned LayoutFlagsShift =
269 (sizeof(PackTy) - 1) * CHAR_BIT;
270 };
271
  /// Refs and data to be serialized into a record.
272 struct Input {
273 InternalRefArrayRef Refs;
274 ArrayRef<char> Data;
275 };
276
277 LayoutFlags getLayoutFlags() const {
278 return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift);
279 }
280
281 uint64_t getDataSize() const;
282 void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const;
283 uint32_t getNumRefs() const;
284 void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const;
285 int64_t getRefsRelOffset() const;
286 int64_t getDataRelOffset() const;
287
  /// Total record size: header+sizes+refs (DataRelOffset), the data itself,
  /// plus one byte for the trailing null terminator.
288 static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
289 return DataRelOffset + DataSize + 1;
290 }
291 uint64_t getTotalSize() const {
292 return getDataRelOffset() + getDataSize() + 1;
293 }
294
295 /// Describe the layout of data stored and how to decode from
296 /// DataRecordHandle.
297 struct Layout {
298 explicit Layout(const Input &I);
299
300 LayoutFlags Flags;
301 uint64_t DataSize = 0;
302 uint32_t NumRefs = 0;
303 int64_t RefsRelOffset = 0;
304 int64_t DataRelOffset = 0;
305 uint64_t getTotalSize() const {
306 return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
307 }
308 };
309
  /// View of the record's reference array, reinterpreted as 4B or 8B refs
  /// according to the layout flags.
310 InternalRefArrayRef getRefs() const {
311 assert(H && "Expected valid handle");
312 auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
313 size_t Size = getNumRefs();
314 if (!Size)
315 return InternalRefArrayRef();
316 if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
317 return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
318 return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
319 }
320
  /// View of the record's data payload (excludes the trailing null).
321 ArrayRef<char> getData() const {
322 assert(H && "Expected valid handle");
323 return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
324 getDataSize());
325 }
326
327 static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc,
328 const Input &I);
329 static Expected<DataRecordHandle>
330 createWithError(function_ref<Expected<char *>(size_t Size)> Alloc,
331 const Input &I);
332 static DataRecordHandle construct(char *Mem, const Input &I);
333
  /// Reinterpret \p Mem as a record header; performs no validation.
334 static DataRecordHandle get(const char *Mem) {
335 return DataRecordHandle(
336 *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
337 }
338 static Expected<DataRecordHandle>
339 getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset);
340
341 explicit operator bool() const { return H; }
342 const Header &getHeader() const { return *H; }
343
344 DataRecordHandle() = default;
345 explicit DataRecordHandle(const Header &H) : H(&H) {}
346
347private:
348 static DataRecordHandle constructImpl(char *Mem, const Input &I,
349 const Layout &L);
  // Non-owning pointer into mapped/allocated record storage.
350 const Header *H = nullptr;
351};
352
353/// Proxy for any on-disk object or raw data.
354struct OnDiskContent {
355 std::optional<DataRecordHandle> Record;
356 std::optional<ArrayRef<char>> Bytes;
357
358 ArrayRef<char> getData() const {
359 if (Bytes)
360 return *Bytes;
361 assert(Record && "Expected record or bytes");
362 return Record->getData();
363 }
364};
365
366/// Data loaded inside the memory from standalone file.
367class StandaloneDataInMemory {
368public:
369 OnDiskContent getContent() const;
370
371 OnDiskGraphDB::FileBackedData
372 getInternalFileBackedObjectData(StringRef RootPath) const;
373
374 StandaloneDataInMemory(std::unique_ptr<sys::fs::mapped_file_region> Region,
375 TrieRecord::StorageKind SK, FileOffset IndexOffset)
376 : Region(std::move(Region)), SK(SK), IndexOffset(IndexOffset) {
377#ifndef NDEBUG
378 bool IsStandalone = false;
379 switch (SK) {
380 case TrieRecord::StorageKind::Standalone:
381 case TrieRecord::StorageKind::StandaloneLeaf:
382 case TrieRecord::StorageKind::StandaloneLeaf0:
383 IsStandalone = true;
384 break;
385 default:
386 break;
387 }
388 assert(IsStandalone);
389#endif
390 }
391
392private:
393 std::unique_ptr<sys::fs::mapped_file_region> Region;
394 TrieRecord::StorageKind SK;
395 FileOffset IndexOffset;
396};
397
398/// Container to lookup loaded standalone objects.
399template <size_t NumShards> class StandaloneDataMap {
400 static_assert(isPowerOf2_64(NumShards), "Expected power of 2");
401
402public:
403 uintptr_t insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
404 std::unique_ptr<sys::fs::mapped_file_region> Region,
405 FileOffset IndexOffset);
406
407 const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
408 bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
409
410private:
411 struct Shard {
412 /// Needs to store a std::unique_ptr for a stable address identity.
413 DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map;
414 mutable std::mutex Mutex;
415 };
416 Shard &getShard(ArrayRef<uint8_t> Hash) {
417 return const_cast<Shard &>(
418 const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
419 }
420 const Shard &getShard(ArrayRef<uint8_t> Hash) const {
421 static_assert(NumShards <= 256, "Expected only 8 bits of shard");
422 return Shards[Hash[0] % NumShards];
423 }
424
425 Shard Shards[NumShards];
426};
427
428using StandaloneDataMapTy = StandaloneDataMap<16>;
429
430/// A vector of internal node references.
431class InternalRefVector {
432public:
  /// Append \p Ref, keeping all refs in compact 4B form for as long as every
  /// ref fits; the first ref that cannot shrink triggers a one-time widening
  /// of the whole vector to full 8B refs.
433 void push_back(InternalRef Ref) {
434 if (NeedsFull)
435 return FullRefs.push_back(Ref);
436 if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
437 return SmallRefs.push_back(*Small);
  // Widen: copy every previously-shrunk ref into the full vector, then
  // append the new one. From now on only FullRefs is used.
438 NeedsFull = true;
439 assert(FullRefs.empty());
440 FullRefs.reserve(SmallRefs.size() + 1);
441 for (InternalRef4B Small : SmallRefs)
442 FullRefs.push_back(Small);
443 FullRefs.push_back(Ref);
444 SmallRefs.clear();
445 }
446
  /// View the refs as an InternalRefArrayRef over whichever representation
  /// is currently active (exactly one of the two vectors is non-empty).
447 operator InternalRefArrayRef() const {
448 assert(SmallRefs.empty() || FullRefs.empty());
449 return NeedsFull ? InternalRefArrayRef(FullRefs)
450 : InternalRefArrayRef(SmallRefs);
451 }
452
453private:
454 bool NeedsFull = false;
457};
458
459} // namespace
460
461Expected<DataRecordHandle> DataRecordHandle::createWithError(
462 function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
463 Layout L(I);
464 if (Expected<char *> Mem = Alloc(L.getTotalSize()))
465 return constructImpl(*Mem, I, L);
466 else
467 return Mem.takeError();
468}
469
471 // Store the file offset as it is.
472 assert(!(Offset.get() & 0x1));
473 return ObjectHandle(Offset.get());
474}
475
477 // Store the pointer from memory with lowest bit set.
478 assert(!(Ptr & 0x1));
479 return ObjectHandle(Ptr | 1);
480}
481
482/// Proxy for an on-disk index record.
488
489template <size_t N>
490uintptr_t StandaloneDataMap<N>::insert(
491 ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
492 std::unique_ptr<sys::fs::mapped_file_region> Region,
493 FileOffset IndexOffset) {
494 auto &S = getShard(Hash);
495 std::lock_guard<std::mutex> Lock(S.Mutex);
496 auto &V = S.Map[Hash.data()];
497 if (!V)
498 V = std::make_unique<StandaloneDataInMemory>(std::move(Region), SK,
499 IndexOffset);
500 return reinterpret_cast<uintptr_t>(V.get());
501}
502
503template <size_t N>
504const StandaloneDataInMemory *
505StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
506 auto &S = getShard(Hash);
507 std::lock_guard<std::mutex> Lock(S.Mutex);
508 auto I = S.Map.find(Hash.data());
509 if (I == S.Map.end())
510 return nullptr;
511 return &*I->second;
512}
513
514namespace {
515
516/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
517/// expensive to register/unregister at this rate.
518///
519/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
520/// files and has a signal handler registered that removes them all.
521class TempFile {
  // Set once keep() or discard() has run; suppresses further cleanup.
522 bool Done = false;
523 TempFile(StringRef Name, int FD, OnDiskCASLogger *Logger)
524 : TmpName(std::string(Name)), FD(FD), Logger(Logger) {}
525
526public:
527 /// This creates a temporary file with createUniqueFile.
528 static Expected<TempFile> create(const Twine &Model, OnDiskCASLogger *Logger);
529 TempFile(TempFile &&Other) { *this = std::move(Other); }
  /// Move-assign: steals Other's name/FD/logger and neutralizes Other so its
  /// destructor is a no-op.
  /// NOTE(review): this does not discard any file *this currently owns, and
  /// it leaves this->Done unchanged — presumably only ever assigned into a
  /// freshly-constructed or moved-from object; confirm against callers.
530 TempFile &operator=(TempFile &&Other) {
531 TmpName = std::move(Other.TmpName);
532 FD = Other.FD;
533 Logger = Other.Logger;
534 Other.Done = true;
535 Other.FD = -1;
536 return *this;
537 }
538
539 // Name of the temporary file.
540 std::string TmpName;
541
542 // The open file descriptor.
543 int FD = -1;
544
  // Optional logger for temp-file create/keep/remove events; may be null.
545 OnDiskCASLogger *Logger = nullptr;
546
547 // Keep this with the given name.
548 Error keep(const Twine &Name);
549 Error discard();
550
551 // This checks that keep or delete was called.
552 ~TempFile() { consumeError(discard()); }
553};
554
/// A TempFile bundled with a writable memory mapping of its contents. The
/// mapping must be unmapped (via keep() or discard()) before the underlying
/// file is renamed or removed.
555class MappedTempFile {
556public:
557 char *data() const { return Map.data(); }
558 size_t size() const { return Map.size(); }
559
  /// Unmap the region and delete the temporary file.
560 Error discard() {
561 assert(Map && "Map already destroyed");
562 Map.unmap();
563 return Temp.discard();
564 }
565
  /// Unmap the region and rename the temporary file to \p Name.
566 Error keep(const Twine &Name) {
567 assert(Map && "Map already destroyed");
568 Map.unmap();
569 return Temp.keep(Name);
570 }
571
572 MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
573 : Temp(std::move(Temp)), Map(std::move(Map)) {}
574
575private:
576 TempFile Temp;
577 sys::fs::mapped_file_region Map;
578};
579} // namespace
580
582 Done = true;
583 if (FD != -1) {
585 if (std::error_code EC = sys::fs::closeFile(File))
586 return errorCodeToError(EC);
587 }
588 FD = -1;
589
590 // Always try to close and remove.
591 std::error_code RemoveEC;
592 if (!TmpName.empty()) {
593 std::error_code EC = sys::fs::remove(TmpName);
594 if (Logger)
595 Logger->logTempFileRemove(TmpName, EC);
596 if (EC)
597 return errorCodeToError(EC);
598 }
599 TmpName = "";
600
601 return Error::success();
602}
603
605 assert(!Done);
606 Done = true;
607 // Always try to close and rename.
608 std::error_code RenameEC = sys::fs::rename(TmpName, Name);
609
610 if (Logger)
611 Logger->logTempFileKeep(TmpName, Name.str(), RenameEC);
612
613 if (!RenameEC)
614 TmpName = "";
615
617 if (std::error_code EC = sys::fs::closeFile(File))
618 return errorCodeToError(EC);
619 FD = -1;
620
621 return errorCodeToError(RenameEC);
622}
623
626 int FD;
627 SmallString<128> ResultPath;
628 if (std::error_code EC = sys::fs::createUniqueFile(Model, FD, ResultPath))
629 return errorCodeToError(EC);
630
631 if (Logger)
632 Logger->logTempFileCreate(ResultPath);
633
634 TempFile Ret(ResultPath, FD, Logger);
635 return std::move(Ret);
636}
637
638bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
639 uint64_t ExistingPacked = pack(Existing);
640 uint64_t NewPacked = pack(New);
641 if (Storage.compare_exchange_strong(ExistingPacked, NewPacked))
642 return true;
643 Existing = unpack(ExistingPacked);
644 return false;
645}
646
648DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool,
650 auto HeaderData = Pool.get(Offset, sizeof(DataRecordHandle::Header));
651 if (!HeaderData)
652 return HeaderData.takeError();
653
654 auto Record = DataRecordHandle::get(HeaderData->data());
655 if (Record.getTotalSize() + Offset.get() > Pool.size())
656 return createStringError(
657 make_error_code(std::errc::illegal_byte_sequence),
658 "data record span passed the end of the data pool");
659
660 return Record;
661}
662
/// Serialize refs and data from \p I into pre-allocated memory \p Mem
/// following layout \p L: header, optional out-of-header size fields, the
/// ref array, the data bytes, and a trailing null.
663DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I,
664 const Layout &L) {
665 char *Next = Mem + sizeof(Header);
666
667 // Fill in Packed and set other data, then come back to construct the header.
668 Header::PackTy Packed = 0;
669 Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift;
670
671 // Construct DataSize.
 // 1B/2B sizes are embedded in the header word; 4B/8B sizes are written
 // little-endian immediately after the header.
672 switch (L.Flags.DataSize) {
673 case DataSizeFlags::Uses1B:
674 assert(I.Data.size() <= UINT8_MAX);
675 Packed |= (Header::PackTy)I.Data.size()
676 << ((sizeof(Packed) - 2) * CHAR_BIT);
677 break;
678 case DataSizeFlags::Uses2B:
679 assert(I.Data.size() <= UINT16_MAX);
680 Packed |= (Header::PackTy)I.Data.size()
681 << ((sizeof(Packed) - 4) * CHAR_BIT);
682 break;
683 case DataSizeFlags::Uses4B:
684 support::endian::write32le(Next, I.Data.size());
685 Next += 4;
686 break;
687 case DataSizeFlags::Uses8B:
688 support::endian::write64le(Next, I.Data.size());
689 Next += 8;
690 break;
691 }
692
693 // Construct NumRefs.
694 //
695 // NOTE: May be writing NumRefs even if there are zero refs in order to fix
696 // alignment.
697 switch (L.Flags.NumRefs) {
698 case NumRefsFlags::Uses0B:
699 break;
700 case NumRefsFlags::Uses1B:
701 assert(I.Refs.size() <= UINT8_MAX);
702 Packed |= (Header::PackTy)I.Refs.size()
703 << ((sizeof(Packed) - 2) * CHAR_BIT);
704 break;
705 case NumRefsFlags::Uses2B:
706 assert(I.Refs.size() <= UINT16_MAX);
707 Packed |= (Header::PackTy)I.Refs.size()
708 << ((sizeof(Packed) - 4) * CHAR_BIT);
709 break;
710 case NumRefsFlags::Uses4B:
711 support::endian::write32le(Next, I.Refs.size());
712 Next += 4;
713 break;
714 case NumRefsFlags::Uses8B:
715 support::endian::write64le(Next, I.Refs.size());
716 Next += 8;
717 break;
718 }
719
720 // Construct Refs[].
721 if (!I.Refs.empty()) {
722 assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
723 ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
724 llvm::copy(RefsBuffer, Next);
725 Next += RefsBuffer.size();
726 }
727
728 // Construct Data and the trailing null.
730 llvm::copy(I.Data, Next);
731 Next[I.Data.size()] = 0;
732
733 // Construct the header itself and return.
 // Placement-new the header last, then verify (debug-only) that decoding the
 // just-written record reproduces the input exactly.
734 Header *H = new (Mem) Header{Packed};
735 DataRecordHandle Record(*H);
736 assert(Record.getData() == I.Data);
737 assert(Record.getNumRefs() == I.Refs.size());
738 assert(Record.getRefs() == I.Refs);
739 assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
740 assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
741 assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
742 return Record;
743}
744
/// Choose the smallest valid encoding for \p I: picks where DataSize and
/// NumRefs are stored (packed header byte slots vs. trailing 4B/8B fields)
/// and computes the relative offsets of the ref array and the data, keeping
/// 8B refs and the data 8B-aligned without explicit padding.
745DataRecordHandle::Layout::Layout(const Input &I) {
746 // Start initial relative offsets right after the Header.
747 uint64_t RelOffset = sizeof(Header);
748
749 // Initialize the easy stuff.
750 DataSize = I.Data.size();
751 NumRefs = I.Refs.size();
752
753 // Check refs size.
754 Flags.RefKind =
755 I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;
756
757 // Find the smallest slot available for DataSize.
 // Has1B/Has2B track the two in-header byte slots, which DataSize and
 // NumRefs compete for: whichever claims a slot first consumes it.
758 bool Has1B = true;
759 bool Has2B = true;
760 if (DataSize <= UINT8_MAX && Has1B) {
761 Flags.DataSize = DataSizeFlags::Uses1B;
762 Has1B = false;
763 } else if (DataSize <= UINT16_MAX && Has2B) {
764 Flags.DataSize = DataSizeFlags::Uses2B;
765 Has2B = false;
766 } else if (DataSize <= UINT32_MAX) {
767 Flags.DataSize = DataSizeFlags::Uses4B;
768 RelOffset += 4;
769 } else {
770 Flags.DataSize = DataSizeFlags::Uses8B;
771 RelOffset += 8;
772 }
773
774 // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
775 if (!NumRefs) {
776 Flags.NumRefs = NumRefsFlags::Uses0B;
777 } else if (NumRefs <= UINT8_MAX && Has1B) {
778 Flags.NumRefs = NumRefsFlags::Uses1B;
779 Has1B = false;
780 } else if (NumRefs <= UINT16_MAX && Has2B) {
781 Flags.NumRefs = NumRefsFlags::Uses2B;
782 Has2B = false;
783 } else {
784 Flags.NumRefs = NumRefsFlags::Uses4B;
785 RelOffset += 4;
786 }
787
788 // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
789 // padding rules when reading and writing. This also bumps RelOffset.
790 //
791 // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
792 // stored as 8B. This means we can *always* find a size to grow.
793 //
794 // NOTE: Only call this once.
795 auto GrowSizeFieldsBy4B = [&]() {
796 assert(isAligned(Align(4), RelOffset));
797 RelOffset += 4;
798
799 assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
800 "Expected to be able to grow NumRefs8B");
801
802 // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
803 // DataSize is upgraded to 8B it'll already be aligned.
804 //
805 // Failing that, grow NumRefs.
806 if (Flags.DataSize < DataSizeFlags::Uses4B)
807 Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
808 else if (Flags.DataSize < DataSizeFlags::Uses8B)
809 Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
810 else if (Flags.NumRefs < NumRefsFlags::Uses4B)
811 Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
812 else
813 Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
814 };
815
816 assert(isAligned(Align(4), RelOffset));
817 if (Flags.RefKind == RefKindFlags::InternalRef) {
818 // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
819 // without padding.
820 if (!isAligned(Align(8), RelOffset))
821 GrowSizeFieldsBy4B();
822
823 assert(isAligned(Align(8), RelOffset));
824 RefsRelOffset = RelOffset;
825 RelOffset += 8 * NumRefs;
826 } else {
827 // The array of 4B refs doesn't need 8B alignment, but the data will need
828 // to be 8B-aligned. Detect this now, and, if necessary, shift everything
829 // by 4B by growing one of the sizes.
830 // If we remove the need for 8B-alignment for data there is <1% savings in
831 // disk storage for a clang build using MCCAS but the 8B-alignment may be
832 // useful in the future so keep it for now.
833 uint64_t RefListSize = 4 * NumRefs;
834 if (!isAligned(Align(8), RelOffset + RefListSize))
835 GrowSizeFieldsBy4B();
836 RefsRelOffset = RelOffset;
837 RelOffset += RefListSize;
838 }
839
840 assert(isAligned(Align(8), RelOffset));
841 DataRelOffset = RelOffset;
842}
843
/// Decode the data size from wherever the layout flags placed it: extracted
/// from the packed header word (1B/2B) or read little-endian from the field
/// immediately after the header (4B/8B).
844uint64_t DataRecordHandle::getDataSize() const {
845 int64_t RelOffset = sizeof(Header);
846 auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset;
847 switch (getLayoutFlags().DataSize) {
848 case DataSizeFlags::Uses1B:
849 return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
850 case DataSizeFlags::Uses2B:
851 return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
852 UINT16_MAX;
853 case DataSizeFlags::Uses4B:
854 return support::endian::read32le(DataSizePtr);
855 case DataSizeFlags::Uses8B:
856 return support::endian::read64le(DataSizePtr);
857 }
858 llvm_unreachable("Unknown DataSizeFlags enum");
859}
860
861void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
862 if (LF.DataSize >= DataSizeFlags::Uses4B)
863 RelOffset += 4;
864 if (LF.DataSize >= DataSizeFlags::Uses8B)
865 RelOffset += 4;
866}
867
/// Decode the ref count from wherever the layout flags placed it: zero when
/// absent, extracted from the packed header word (1B/2B), or read
/// little-endian from the field following any out-of-header DataSize.
868uint32_t DataRecordHandle::getNumRefs() const {
869 LayoutFlags LF = getLayoutFlags();
870 int64_t RelOffset = sizeof(Header);
871 skipDataSize(LF, RelOffset);
872 auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
873 switch (LF.NumRefs) {
874 case NumRefsFlags::Uses0B:
875 return 0;
876 case NumRefsFlags::Uses1B:
877 return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
878 case NumRefsFlags::Uses2B:
879 return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
880 UINT16_MAX;
881 case NumRefsFlags::Uses4B:
882 return support::endian::read32le(NumRefsPtr);
883 case NumRefsFlags::Uses8B:
884 return support::endian::read64le(NumRefsPtr);
885 }
886 llvm_unreachable("Unknown NumRefsFlags enum");
887}
888
889void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
890 if (LF.NumRefs >= NumRefsFlags::Uses4B)
891 RelOffset += 4;
892 if (LF.NumRefs >= NumRefsFlags::Uses8B)
893 RelOffset += 4;
894}
895
896int64_t DataRecordHandle::getRefsRelOffset() const {
897 LayoutFlags LF = getLayoutFlags();
898 int64_t RelOffset = sizeof(Header);
899 skipDataSize(LF, RelOffset);
900 skipNumRefs(LF, RelOffset);
901 return RelOffset;
902}
903
904int64_t DataRecordHandle::getDataRelOffset() const {
905 LayoutFlags LF = getLayoutFlags();
906 int64_t RelOffset = sizeof(Header);
907 skipDataSize(LF, RelOffset);
908 skipNumRefs(LF, RelOffset);
909 uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
910 RelOffset += RefSize * getNumRefs();
911 return RelOffset;
912}
913
915 if (UpstreamDB) {
916 if (auto E = UpstreamDB->validate(Deep, Hasher))
917 return E;
918 }
919 if (!isAligned(Align(8), DataPool.size()))
921 "data pool bump pointer is not aligned");
922 return Index.validate([&](FileOffset Offset,
924 -> Error {
925 auto formatError = [&](Twine Msg) {
926 return createStringError(
928 "bad record at 0x" +
929 utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
930 Msg.str());
931 };
932
933 if (Record.Data.size() != sizeof(TrieRecord))
934 return formatError("wrong data record size");
935 if (!isAligned(Align::Of<TrieRecord>(), Record.Data.size()))
936 return formatError("wrong data record alignment");
937
938 auto *R = reinterpret_cast<const TrieRecord *>(Record.Data.data());
939 TrieRecord::Data D = R->load();
940 std::unique_ptr<MemoryBuffer> FileBuffer;
941 if ((uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Unknown &&
942 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::DataPool &&
943 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Standalone &&
944 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf &&
945 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf0)
946 return formatError("invalid record kind value");
947
949 auto I = getIndexProxyFromRef(Ref);
950 if (!I)
951 return I.takeError();
952
953 switch (D.SK) {
954 case TrieRecord::StorageKind::Unknown:
955 // This could be an abandoned entry due to a termination before updating
956 // the record. It can be reused by later insertion so just skip this entry
957 // for now.
958 return Error::success();
959 case TrieRecord::StorageKind::DataPool: {
960 // Check offset is a postive value, and large enough to hold the
961 // header for the data record.
962 if (D.Offset.get() <= 0 ||
963 D.Offset.get() + sizeof(DataRecordHandle::Header) >= DataPool.size())
964 return formatError("datapool record out of bound");
965
966 // DataRecord start needs to be aligned.
967 if (!isAligned(Align(8), D.Offset.get()))
968 return formatError("data record offset is not aligned");
969
970 // Validate the layout flags before getFromDataPool calls getTotalSize().
971 auto HeaderData =
972 DataPool.get(D.Offset, sizeof(DataRecordHandle::Header));
973 if (!HeaderData)
974 return formatError(toString(HeaderData.takeError()));
975 auto LF = DataRecordHandle::get(HeaderData->data()).getLayoutFlags();
976 if (LF.NumRefs > DataRecordHandle::NumRefsFlags::Max ||
977 LF.DataSize > DataRecordHandle::DataSizeFlags::Max)
978 return formatError("data record has invalid layout flags");
979 break;
980 }
981 case TrieRecord::StorageKind::Standalone:
982 case TrieRecord::StorageKind::StandaloneLeaf:
983 case TrieRecord::StorageKind::StandaloneLeaf0:
984 SmallString<256> Path;
985 getStandalonePath(TrieRecord::getStandaloneFilePrefix(D.SK), I->Offset,
986 Path);
987 // If need to validate the content of the file later, just load the
988 // buffer here. Otherwise, just check the existance of the file.
989 if (Deep) {
990 auto File = MemoryBuffer::getFile(Path, /*IsText=*/false,
991 /*RequiresNullTerminator=*/false);
992 if (!File || !*File)
993 return formatError("record file \'" + Path + "\' does not exist");
994
995 FileBuffer = std::move(*File);
996 } else if (!llvm::sys::fs::exists(Path))
997 return formatError("record file \'" + Path + "\' does not exist");
998 }
999
1000 if (!Deep)
1001 return Error::success();
1002
1003 auto dataError = [&](Twine Msg) {
1005 "bad data for digest \'" + toHex(I->Hash) +
1006 "\': " + Msg.str());
1007 };
1009 ArrayRef<char> StoredData;
1010
1011 switch (D.SK) {
1012 case TrieRecord::StorageKind::Unknown:
1013 llvm_unreachable("already handled");
1014 case TrieRecord::StorageKind::DataPool: {
1015 auto DataRecord = DataRecordHandle::getFromDataPool(DataPool, D.Offset);
1016 if (!DataRecord)
1017 return dataError(toString(DataRecord.takeError()));
1018
1019 for (auto InternRef : DataRecord->getRefs()) {
1020 if (InternRef.getFileOffset().get() <= 0)
1021 return dataError("invalid ref offset");
1022 auto Index = getIndexProxyFromRef(InternRef);
1023 if (!Index)
1024 return Index.takeError();
1025 Refs.push_back(Index->Hash);
1026 }
1027 StoredData = DataRecord->getData();
1028 break;
1029 }
1030 case TrieRecord::StorageKind::Standalone: {
1031 if (FileBuffer->getBufferSize() < sizeof(DataRecordHandle::Header))
1032 return dataError("data record is not big enough to read the header");
1033 auto DataRecord = DataRecordHandle::get(FileBuffer->getBufferStart());
1034 if (DataRecord.getTotalSize() < FileBuffer->getBufferSize())
1035 return dataError(
1036 "data record span passed the end of the standalone file");
1037 for (auto InternRef : DataRecord.getRefs()) {
1038 if (InternRef.getFileOffset().get() <= 0)
1039 return dataError("invalid ref offset");
1040 auto Index = getIndexProxyFromRef(InternRef);
1041 if (!Index)
1042 return Index.takeError();
1043 Refs.push_back(Index->Hash);
1044 }
1045 StoredData = DataRecord.getData();
1046 break;
1047 }
1048 case TrieRecord::StorageKind::StandaloneLeaf:
1049 case TrieRecord::StorageKind::StandaloneLeaf0: {
1050 StoredData = arrayRefFromStringRef<char>(FileBuffer->getBuffer());
1051 if (D.SK == TrieRecord::StorageKind::StandaloneLeaf0) {
1052 if (!FileBuffer->getBuffer().ends_with('\0'))
1053 return dataError("standalone file is not zero terminated");
1054 StoredData = StoredData.drop_back(1);
1055 }
1056 break;
1057 }
1058 }
1059
1060 SmallVector<uint8_t> ComputedHash;
1061 Hasher(Refs, StoredData, ComputedHash);
1062 if (I->Hash != ArrayRef(ComputedHash))
1063 return dataError("hash mismatch, got \'" + toHex(ComputedHash) +
1064 "\' instead");
1065
1066 return Error::success();
1067 });
1068}
1069
1071 auto formatError = [&](Twine Msg) {
1072 return createStringError(
1074 "bad ref=0x" +
1075 utohexstr(ExternalRef.getOpaqueData(), /*LowerCase=*/true) + ": " +
1076 Msg.str());
1077 };
1078
1079 if (ExternalRef.getOpaqueData() == 0)
1080 return formatError("zero is not a valid ref");
1081
1082 InternalRef InternalRef = getInternalRef(ExternalRef);
1083 auto I = getIndexProxyFromRef(InternalRef);
1084 if (!I)
1085 return formatError(llvm::toString(I.takeError()));
1086 auto Hash = getDigest(*I);
1087
1088 OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Hash);
1089 if (!P)
1090 return formatError("not found using hash " + toHex(Hash));
1091 IndexProxy OtherI = getIndexProxyFromPointer(P);
1092 ObjectID OtherRef = getExternalReference(makeInternalRef(OtherI.Offset));
1093 if (OtherRef != ExternalRef)
1094 return formatError("ref does not match indexed offset " +
1095 utohexstr(OtherRef.getOpaqueData(), /*LowerCase=*/true) +
1096 " for hash " + toHex(Hash));
1097 return Error::success();
1098}
1099
1101 OS << "on-disk-root-path: " << RootPath << "\n";
1102
1103 struct PoolInfo {
1105 };
1107
1108 OS << "\n";
1109 OS << "index:\n";
1110 Index.print(OS, [&](ArrayRef<char> Data) {
1111 assert(Data.size() == sizeof(TrieRecord));
1113 auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
1114 TrieRecord::Data D = R->load();
1115 OS << " SK=";
1116 switch (D.SK) {
1117 case TrieRecord::StorageKind::Unknown:
1118 OS << "unknown ";
1119 break;
1120 case TrieRecord::StorageKind::DataPool:
1121 OS << "datapool ";
1122 Pool.push_back({D.Offset.get()});
1123 break;
1124 case TrieRecord::StorageKind::Standalone:
1125 OS << "standalone-data ";
1126 break;
1127 case TrieRecord::StorageKind::StandaloneLeaf:
1128 OS << "standalone-leaf ";
1129 break;
1130 case TrieRecord::StorageKind::StandaloneLeaf0:
1131 OS << "standalone-leaf+0";
1132 break;
1133 }
1134 OS << " Offset=" << (void *)D.Offset.get();
1135 });
1136 if (Pool.empty())
1137 return;
1138
1139 OS << "\n";
1140 OS << "pool:\n";
1141 llvm::sort(
1142 Pool, [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
1143 for (PoolInfo PI : Pool) {
1144 OS << "- addr=" << (void *)PI.Offset << " ";
1145 auto D = DataRecordHandle::getFromDataPool(DataPool, FileOffset(PI.Offset));
1146 if (!D) {
1147 OS << "error: " << toString(D.takeError());
1148 return;
1149 }
1150
1151 OS << "record refs=" << D->getNumRefs() << " data=" << D->getDataSize()
1152 << " size=" << D->getTotalSize()
1153 << " end=" << (void *)(PI.Offset + D->getTotalSize()) << "\n";
1154 }
1155}
1156
1158OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
1159 auto P = Index.insertLazy(
1160 Hash, [](FileOffset TentativeOffset,
1161 OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
1162 assert(TentativeValue.Data.size() == sizeof(TrieRecord));
1163 assert(
1164 isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
1165 new (TentativeValue.Data.data()) TrieRecord();
1166 });
1167 if (LLVM_UNLIKELY(!P))
1168 return P.takeError();
1169
1170 assert(*P && "Expected insertion");
1171 return getIndexProxyFromPointer(*P);
1172}
1173
1174OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
1176 assert(P);
1177 assert(P.getOffset());
1178 return IndexProxy{P.getOffset(), P->Hash,
1179 *const_cast<TrieRecord *>(
1180 reinterpret_cast<const TrieRecord *>(P->Data.data()))};
1181}
1182
1184 auto I = indexHash(Hash);
1185 if (LLVM_UNLIKELY(!I))
1186 return I.takeError();
1187 return getExternalReference(*I);
1188}
1189
1190ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
1191 return getExternalReference(makeInternalRef(I.Offset));
1192}
1193
1194std::optional<ObjectID>
1196 bool CheckUpstream) {
1197 auto tryUpstream =
1198 [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
1199 if (!CheckUpstream || !UpstreamDB)
1200 return std::nullopt;
1201 std::optional<ObjectID> UpstreamID =
1202 UpstreamDB->getExistingReference(Digest);
1203 if (LLVM_UNLIKELY(!UpstreamID))
1204 return std::nullopt;
1205 auto Ref = expectedToOptional(indexHash(Digest));
1206 if (!Ref)
1207 return std::nullopt;
1208 if (!I)
1209 I.emplace(*Ref);
1210 return getExternalReference(*I);
1211 };
1212
1213 OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Digest);
1214 if (!P)
1215 return tryUpstream(std::nullopt);
1216 IndexProxy I = getIndexProxyFromPointer(P);
1217 TrieRecord::Data Obj = I.Ref.load();
1218 if (Obj.SK == TrieRecord::StorageKind::Unknown)
1219 return tryUpstream(I);
1220 return getExternalReference(makeInternalRef(I.Offset));
1221}
1222
1224OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
1225 auto P = Index.recoverFromFileOffset(Ref.getFileOffset());
1226 if (LLVM_UNLIKELY(!P))
1227 return P.takeError();
1228 return getIndexProxyFromPointer(*P);
1229}
1230
1232 auto I = getIndexProxyFromRef(Ref);
1233 if (!I)
1234 return I.takeError();
1235 return I->Hash;
1236}
1237
1238ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
1239 return I.Hash;
1240}
1241
1242static std::variant<const StandaloneDataInMemory *, DataRecordHandle>
1244 ObjectHandle OH) {
1245 // Decode ObjectHandle to locate the stored content.
1246 uint64_t Data = OH.getOpaqueData();
1247 if (Data & 1) {
1248 const auto *SDIM =
1249 reinterpret_cast<const StandaloneDataInMemory *>(Data & (-1ULL << 1));
1250 return SDIM;
1251 }
1252
1253 auto DataHandle =
1254 cantFail(DataRecordHandle::getFromDataPool(DataPool, FileOffset(Data)));
1255 assert(DataHandle.getData().end()[0] == 0 && "Null termination");
1256 return DataHandle;
1257}
1258
1259static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
1260 ObjectHandle OH) {
1261 auto SDIMOrRecord = getStandaloneDataOrDataRecord(DataPool, OH);
1262 if (std::holds_alternative<const StandaloneDataInMemory *>(SDIMOrRecord)) {
1263 return std::get<const StandaloneDataInMemory *>(SDIMOrRecord)->getContent();
1264 } else {
1265 auto DataHandle = std::get<DataRecordHandle>(std::move(SDIMOrRecord));
1266 return OnDiskContent{std::move(DataHandle), std::nullopt};
1267 }
1268}
1269
1271 OnDiskContent Content = getContentFromHandle(DataPool, Node);
1272 return Content.getData();
1273}
1274
1275InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
1276 if (std::optional<DataRecordHandle> Record =
1277 getContentFromHandle(DataPool, Node).Record)
1278 return Record->getRefs();
1279 return std::nullopt;
1280}
1281
1284 auto SDIMOrRecord = getStandaloneDataOrDataRecord(DataPool, Node);
1285 if (std::holds_alternative<const StandaloneDataInMemory *>(SDIMOrRecord)) {
1286 auto *SDIM = std::get<const StandaloneDataInMemory *>(SDIMOrRecord);
1287 return SDIM->getInternalFileBackedObjectData(RootPath);
1288 } else {
1289 auto DataHandle = std::get<DataRecordHandle>(std::move(SDIMOrRecord));
1290 return FileBackedData{DataHandle.getData(), /*FileInfo=*/std::nullopt};
1291 }
1292}
1293
1296 InternalRef Ref = getInternalRef(ExternalRef);
1297 auto I = getIndexProxyFromRef(Ref);
1298 if (!I)
1299 return I.takeError();
1300 TrieRecord::Data Object = I->Ref.load();
1301
1302 if (Object.SK == TrieRecord::StorageKind::Unknown)
1303 return faultInFromUpstream(ExternalRef);
1304
1305 if (Object.SK == TrieRecord::StorageKind::DataPool)
1306 return ObjectHandle::fromFileOffset(Object.Offset);
1307
1308 // Only TrieRecord::StorageKind::Standalone (and variants) need to be
1309 // explicitly loaded.
1310 //
1311 // There's corruption if standalone objects have offsets, or if we get here
1312 // for something that isn't standalone.
1313 if (Object.Offset)
1315 switch (Object.SK) {
1316 case TrieRecord::StorageKind::Unknown:
1317 case TrieRecord::StorageKind::DataPool:
1318 llvm_unreachable("unexpected storage kind");
1319 case TrieRecord::StorageKind::Standalone:
1320 case TrieRecord::StorageKind::StandaloneLeaf0:
1321 case TrieRecord::StorageKind::StandaloneLeaf:
1322 break;
1323 }
1324
1325 // Load it from disk.
1326 //
1327 // Note: Creation logic guarantees that data that needs null-termination is
1328 // suitably 0-padded. Requiring null-termination here would be too expensive
1329 // for extremely large objects that happen to be page-aligned.
1330 SmallString<256> Path;
1331 getStandalonePath(TrieRecord::getStandaloneFilePrefix(Object.SK), I->Offset,
1332 Path);
1333
1334 auto BypassSandbox = sys::sandbox::scopedDisable();
1335
1336 auto File = sys::fs::openNativeFileForRead(Path);
1337 if (!File)
1338 return createFileError(Path, File.takeError());
1339
1340 llvm::scope_exit CloseFile([&]() { sys::fs::closeFile(*File); });
1341
1343 if (std::error_code EC = sys::fs::status(*File, Status))
1345
1346 std::error_code EC;
1347 auto Region = std::make_unique<sys::fs::mapped_file_region>(
1348 *File, sys::fs::mapped_file_region::readonly, Status.getSize(), 0, EC);
1349 if (EC)
1351
1353 static_cast<StandaloneDataMapTy *>(StandaloneData)
1354 ->insert(I->Hash, Object.SK, std::move(Region), I->Offset));
1355}
1356
1358 auto Presence = getObjectPresence(Ref, /*CheckUpstream=*/true);
1359 if (!Presence)
1360 return Presence.takeError();
1361
1362 switch (*Presence) {
1363 case ObjectPresence::Missing:
1364 return false;
1365 case ObjectPresence::InPrimaryDB:
1366 return true;
1367 case ObjectPresence::OnlyInUpstreamDB:
1368 if (auto FaultInResult = faultInFromUpstream(Ref); !FaultInResult)
1369 return FaultInResult.takeError();
1370 return true;
1371 }
1372 llvm_unreachable("Unknown ObjectPresence enum");
1373}
1374
1376OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,
1377 bool CheckUpstream) const {
1378 InternalRef Ref = getInternalRef(ExternalRef);
1379 auto I = getIndexProxyFromRef(Ref);
1380 if (!I)
1381 return I.takeError();
1382
1383 TrieRecord::Data Object = I->Ref.load();
1384 if (Object.SK != TrieRecord::StorageKind::Unknown)
1385 return ObjectPresence::InPrimaryDB;
1386
1387 if (!CheckUpstream || !UpstreamDB)
1388 return ObjectPresence::Missing;
1389
1390 std::optional<ObjectID> UpstreamID =
1391 UpstreamDB->getExistingReference(getDigest(*I));
1392 return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB
1393 : ObjectPresence::Missing;
1394}
1395
1396InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
1397 return InternalRef::getFromOffset(IndexOffset);
1398}
1399
1400static void getStandalonePath(StringRef RootPath, StringRef Prefix,
1401 FileOffset IndexOffset,
1402 SmallVectorImpl<char> &Path) {
1403 Path.assign(RootPath.begin(), RootPath.end());
1404 sys::path::append(Path,
1405 Prefix + Twine(IndexOffset.get()) + "." + CASFormatVersion);
1406}
1407
1408void OnDiskGraphDB::getStandalonePath(StringRef Prefix, FileOffset IndexOffset,
1409 SmallVectorImpl<char> &Path) const {
1410 return ::getStandalonePath(RootPath, Prefix, IndexOffset, Path);
1411}
1412
1413OnDiskContent StandaloneDataInMemory::getContent() const {
1414 bool Leaf0 = false;
1415 bool Leaf = false;
1416 switch (SK) {
1417 default:
1418 llvm_unreachable("Storage kind must be standalone");
1419 case TrieRecord::StorageKind::Standalone:
1420 break;
1421 case TrieRecord::StorageKind::StandaloneLeaf0:
1422 Leaf = Leaf0 = true;
1423 break;
1424 case TrieRecord::StorageKind::StandaloneLeaf:
1425 Leaf = true;
1426 break;
1427 }
1428
1429 if (Leaf) {
1430 StringRef Data(Region->data(), Region->size());
1431 assert(Data.drop_back(Leaf0).end()[0] == 0 &&
1432 "Standalone node data missing null termination");
1433 return OnDiskContent{std::nullopt,
1434 arrayRefFromStringRef<char>(Data.drop_back(Leaf0))};
1435 }
1436
1437 DataRecordHandle Record = DataRecordHandle::get(Region->data());
1438 assert(Record.getData().end()[0] == 0 &&
1439 "Standalone object record missing null termination for data");
1440 return OnDiskContent{Record, std::nullopt};
1441}
1442
1443OnDiskGraphDB::FileBackedData
1444StandaloneDataInMemory::getInternalFileBackedObjectData(
1445 StringRef RootPath) const {
1446 switch (SK) {
1447 case TrieRecord::StorageKind::Unknown:
1448 case TrieRecord::StorageKind::DataPool:
1449 llvm_unreachable("unexpected storage kind");
1450 case TrieRecord::StorageKind::Standalone:
1451 return OnDiskGraphDB::FileBackedData{getContent().getData(),
1452 /*FileInfo=*/std::nullopt};
1453 case TrieRecord::StorageKind::StandaloneLeaf0:
1454 case TrieRecord::StorageKind::StandaloneLeaf:
1455 bool IsFileNulTerminated = SK == TrieRecord::StorageKind::StandaloneLeaf0;
1456 SmallString<256> Path;
1457 ::getStandalonePath(RootPath, TrieRecord::getStandaloneFilePrefix(SK),
1458 IndexOffset, Path);
1459 return OnDiskGraphDB::FileBackedData{
1460 getContent().getData(), OnDiskGraphDB::FileBackedData::FileInfoTy{
1461 std::string(Path), IsFileNulTerminated}};
1462 }
1463 llvm_unreachable("Unknown StorageKind enum");
1464}
1465
1466static Expected<MappedTempFile>
1468 auto BypassSandbox = sys::sandbox::scopedDisable();
1469
1470 assert(Size && "Unexpected request for an empty temp file");
1471 Expected<TempFile> File = TempFile::create(FinalPath + ".%%%%%%", Logger);
1472 if (!File)
1473 return File.takeError();
1474
1475 if (Error E = preallocateFileTail(File->FD, 0, Size).takeError())
1476 return createFileError(File->TmpName, std::move(E));
1477
1478 if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size))
1479 return createFileError(File->TmpName, EC);
1480
1481 std::error_code EC;
1484 0, EC);
1485 if (EC)
1486 return createFileError(File->TmpName, EC);
1487 return MappedTempFile(std::move(*File), std::move(Map));
1488}
1489
1490static size_t getPageSize() {
1492 return PageSize;
1493}
1494
/// Store \p Data as a standalone leaf file for index entry \p I.
///
/// Used for leaf (no-reference) objects larger than
/// TrieRecord::MaxEmbeddedSize. The payload is written to a temp file that
/// is then kept under its final name, and the index entry is published via
/// compare-exchange so a racing writer is detected rather than overwritten.
Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
  assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
         "Expected a bigger file for external content...");

  // Page-aligned payloads get an explicit trailing nul (StandaloneLeaf0) so
  // the mapped content stays guaranteed null-terminated.
  bool Leaf0 = isAligned(Align(getPageSize()), Data.size());
  TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
                                     : TrieRecord::StorageKind::StandaloneLeaf;

  SmallString<256> Path;
  // `+ Leaf0` accounts for the one extra nul byte written below.
  int64_t FileSize = Data.size() + Leaf0;
  getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I.Offset, Path);

  auto BypassSandbox = sys::sandbox::scopedDisable();

  // Write the file. Don't reuse this mapped_file_region, which is read/write.
  // Let load() pull up one that's read-only.
  Expected<MappedTempFile> File = createTempFile(Path, FileSize, Logger.get());
  if (!File)
    return File.takeError();
  assert(File->size() == (uint64_t)FileSize);
  llvm::copy(Data, File->data());
  if (Leaf0)
    File->data()[Data.size()] = 0;
  assert(File->data()[Data.size()] == 0);
  if (Error E = File->keep(Path))
    return E;

  // Store the object reference. Standalone entries record only the kind;
  // the offset field stays empty because the path derives from I.Offset.
  TrieRecord::Data Existing;
  {
    TrieRecord::Data Leaf{SK, FileOffset()};
    if (I.Ref.compare_exchange_strong(Existing, Leaf)) {
      recordStandaloneSizeIncrease(FileSize);
      return Error::success();
    }
  }

  // If there was a race, confirm that the new value has valid storage.
  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    return createCorruptObjectError(getDigest(I));

  return Error::success();
}
1538
1541 auto I = getIndexProxyFromRef(getInternalRef(ID));
1542 if (LLVM_UNLIKELY(!I))
1543 return I.takeError();
1544
1545 // Early return in case the node exists.
1546 {
1547 TrieRecord::Data Existing = I->Ref.load();
1548 if (Existing.SK != TrieRecord::StorageKind::Unknown)
1549 return Error::success();
1550 }
1551
1552 // Big leaf nodes.
1553 if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
1554 return createStandaloneLeaf(*I, Data);
1555
1556 // TODO: Check whether it's worth checking the index for an already existing
1557 // object (like storeTreeImpl() does) before building up the
1558 // InternalRefVector.
1559 InternalRefVector InternalRefs;
1560 for (ObjectID Ref : Refs)
1561 InternalRefs.push_back(getInternalRef(Ref));
1562
1563 // Create the object.
1564
1565 DataRecordHandle::Input Input{InternalRefs, Data};
1566
1567 // Compute the storage kind, allocate it, and create the record.
1568 TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
1569 FileOffset PoolOffset;
1570 SmallString<256> Path;
1571 std::optional<MappedTempFile> File;
1572 std::optional<uint64_t> FileSize;
1573 auto AllocStandaloneFile = [&](size_t Size) -> Expected<char *> {
1574 getStandalonePath(TrieRecord::getStandaloneFilePrefix(
1575 TrieRecord::StorageKind::Standalone),
1576 I->Offset, Path);
1577 if (Error E = createTempFile(Path, Size, Logger.get()).moveInto(File))
1578 return std::move(E);
1579 assert(File->size() == Size);
1580 FileSize = Size;
1581 SK = TrieRecord::StorageKind::Standalone;
1582 return File->data();
1583 };
1584 auto Alloc = [&](size_t Size) -> Expected<char *> {
1585 if (Size <= TrieRecord::MaxEmbeddedSize) {
1586 SK = TrieRecord::StorageKind::DataPool;
1587 auto P = DataPool.allocate(Size);
1588 if (LLVM_UNLIKELY(!P)) {
1589 char *NewAlloc = nullptr;
1590 auto NewE = handleErrors(
1591 P.takeError(), [&](std::unique_ptr<StringError> E) -> Error {
1592 if (E->convertToErrorCode() == std::errc::not_enough_memory)
1593 return AllocStandaloneFile(Size).moveInto(NewAlloc);
1594 return Error(std::move(E));
1595 });
1596 if (!NewE)
1597 return NewAlloc;
1598 return std::move(NewE);
1599 }
1600 PoolOffset = P->getOffset();
1601 LLVM_DEBUG({
1602 dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
1603 << " size=" << Size
1604 << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
1605 });
1606 return (*P)->data();
1607 }
1608 return AllocStandaloneFile(Size);
1609 };
1610
1611 DataRecordHandle Record;
1612 if (Error E =
1613 DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
1614 return E;
1615 assert(Record.getData().end()[0] == 0 && "Expected null-termination");
1616 assert(Record.getData() == Input.Data && "Expected initialization");
1617 assert(SK != TrieRecord::StorageKind::Unknown);
1618 assert(bool(File) != bool(PoolOffset) &&
1619 "Expected either a mapped file or a pooled offset");
1620
1621 // Check for a race before calling MappedTempFile::keep().
1622 //
1623 // Then decide what to do with the file. Better to discard than overwrite if
1624 // another thread/process has already added this.
1625 TrieRecord::Data Existing = I->Ref.load();
1626 {
1627 TrieRecord::Data NewObject{SK, PoolOffset};
1628 if (File) {
1629 if (Existing.SK == TrieRecord::StorageKind::Unknown) {
1630 // Keep the file!
1631 if (Error E = File->keep(Path))
1632 return E;
1633 } else {
1634 File.reset();
1635 }
1636 }
1637
1638 // If we didn't already see a racing/existing write, then try storing the
1639 // new object. If that races, confirm that the new value has valid storage.
1640 //
1641 // TODO: Find a way to reuse the storage from the new-but-abandoned record
1642 // handle.
1643 if (Existing.SK == TrieRecord::StorageKind::Unknown) {
1644 if (I->Ref.compare_exchange_strong(Existing, NewObject)) {
1645 if (FileSize)
1646 recordStandaloneSizeIncrease(*FileSize);
1647 return Error::success();
1648 }
1649 }
1650 }
1651
1652 if (Existing.SK == TrieRecord::StorageKind::Unknown)
1654
1655 // Load existing object.
1656 return Error::success();
1657}
1658
1660 return storeFile(ID, FilePath, /*ImportKind=*/std::nullopt);
1661}
1662
1664 ObjectID ID, StringRef FilePath,
1665 std::optional<InternalUpstreamImportKind> ImportKind) {
1666 auto I = getIndexProxyFromRef(getInternalRef(ID));
1667 if (LLVM_UNLIKELY(!I))
1668 return I.takeError();
1669
1670 // Early return in case the node exists.
1671 {
1672 TrieRecord::Data Existing = I->Ref.load();
1673 if (Existing.SK != TrieRecord::StorageKind::Unknown)
1674 return Error::success();
1675 }
1676
1677 auto BypassSandbox = sys::sandbox::scopedDisable();
1678
1679 uint64_t FileSize;
1680 if (std::error_code EC = sys::fs::file_size(FilePath, FileSize))
1681 return createFileError(FilePath, EC);
1682
1683 if (FileSize <= TrieRecord::MaxEmbeddedSize) {
1684 auto Buf = MemoryBuffer::getFile(FilePath);
1685 if (!Buf)
1686 return createFileError(FilePath, Buf.getError());
1687 return store(ID, {}, arrayRefFromStringRef<char>((*Buf)->getBuffer()));
1688 }
1689
1690 UniqueTempFile UniqueTmp;
1691 auto ExpectedPath = UniqueTmp.createAndCopyFrom(RootPath, FilePath);
1692 if (!ExpectedPath)
1693 return ExpectedPath.takeError();
1694 StringRef TmpPath = *ExpectedPath;
1695
1696 TrieRecord::StorageKind SK;
1697 if (ImportKind.has_value()) {
1698 // Importing the file from upstream, the nul is already added if necessary.
1699 switch (*ImportKind) {
1700 case InternalUpstreamImportKind::Leaf:
1701 SK = TrieRecord::StorageKind::StandaloneLeaf;
1702 break;
1703 case InternalUpstreamImportKind::Leaf0:
1704 SK = TrieRecord::StorageKind::StandaloneLeaf0;
1705 break;
1706 }
1707 } else {
1708 bool Leaf0 = isAligned(Align(getPageSize()), FileSize);
1709 SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
1710 : TrieRecord::StorageKind::StandaloneLeaf;
1711
1712 if (Leaf0) {
1713 // Add a nul byte at the end.
1714 std::error_code EC;
1715 raw_fd_ostream OS(TmpPath, EC, sys::fs::CD_OpenExisting,
1717 if (EC)
1718 return createFileError(TmpPath, EC);
1719 OS.write(0);
1720 OS.close();
1721 if (OS.has_error())
1722 return createFileError(TmpPath, OS.error());
1723 }
1724 }
1725
1726 SmallString<256> StandalonePath;
1727 getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I->Offset,
1728 StandalonePath);
1729 if (Error E = UniqueTmp.renameTo(StandalonePath))
1730 return E;
1731
1732 // Store the object reference.
1733 TrieRecord::Data Existing;
1734 {
1735 TrieRecord::Data Leaf{SK, FileOffset()};
1736 if (I->Ref.compare_exchange_strong(Existing, Leaf)) {
1737 recordStandaloneSizeIncrease(FileSize);
1738 return Error::success();
1739 }
1740 }
1741
1742 // If there was a race, confirm that the new value has valid storage.
1743 if (Existing.SK == TrieRecord::StorageKind::Unknown)
1744 return createCorruptObjectError(getDigest(*I));
1745
1746 return Error::success();
1747}
1748
1749void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
1750 standaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
1751}
1752
1753std::atomic<uint64_t> &OnDiskGraphDB::standaloneStorageSize() const {
1754 MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
1755 assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
1756 assert(isAddrAligned(Align(8), UserHeader.data()));
1757 return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
1758}
1759
1760uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
1761 return standaloneStorageSize().load(std::memory_order_relaxed);
1762}
1763
1765 return Index.size() + DataPool.size() + getStandaloneStorageSize();
1766}
1767
1769 unsigned IndexPercent = Index.size() * 100ULL / Index.capacity();
1770 unsigned DataPercent = DataPool.size() * 100ULL / DataPool.capacity();
1771 return std::max(IndexPercent, DataPercent);
1772}
1773
1776 unsigned HashByteSize, OnDiskGraphDB *UpstreamDB,
1777 std::shared_ptr<OnDiskCASLogger> Logger,
1778 FaultInPolicy Policy) {
1779 if (std::error_code EC = sys::fs::create_directories(AbsPath))
1780 return createFileError(AbsPath, EC);
1781
1782 constexpr uint64_t MB = 1024ull * 1024ull;
1783 constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
1784
1785 uint64_t MaxIndexSize = 12 * GB;
1786 uint64_t MaxDataPoolSize = 24 * GB;
1787
1788 if (useSmallMappingSize(AbsPath)) {
1789 MaxIndexSize = 1 * GB;
1790 MaxDataPoolSize = 2 * GB;
1791 }
1792
1793 auto CustomSize = getOverriddenMaxMappingSize();
1794 if (!CustomSize)
1795 return CustomSize.takeError();
1796 if (*CustomSize)
1797 MaxIndexSize = MaxDataPoolSize = **CustomSize;
1798
1799 SmallString<256> IndexPath(AbsPath);
1801 std::optional<OnDiskTrieRawHashMap> Index;
1803 IndexPath, IndexTableName + "[" + HashName + "]",
1804 HashByteSize * CHAR_BIT,
1805 /*DataSize=*/sizeof(TrieRecord), MaxIndexSize,
1806 /*MinFileSize=*/MB, Logger)
1807 .moveInto(Index))
1808 return std::move(E);
1809
1810 uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);
1811
1812 SmallString<256> DataPoolPath(AbsPath);
1814 std::optional<OnDiskDataAllocator> DataPool;
1815 StringRef PolicyName =
1816 Policy == FaultInPolicy::SingleNode ? "single" : "full";
1818 DataPoolPath,
1819 DataPoolTableName + "[" + HashName + "]" + PolicyName,
1820 MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize, Logger,
1821 [](void *UserHeaderPtr) {
1822 new (UserHeaderPtr) std::atomic<uint64_t>(0);
1823 })
1824 .moveInto(DataPool))
1825 return std::move(E);
1826 if (DataPool->getUserHeader().size() != UserHeaderSize)
1828 "unexpected user header in '" + DataPoolPath +
1829 "'");
1830
1831 return std::unique_ptr<OnDiskGraphDB>(
1832 new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
1833 UpstreamDB, Policy, std::move(Logger)));
1834}
1835
/// Construct the database over an already-opened index and data pool.
///
/// \p RootPath is retained for computing standalone-object paths. \p
/// UpstreamDB is stored as a raw pointer — presumably non-owning; confirm
/// lifetime is managed by the caller.
OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
                             OnDiskDataAllocator DataPool,
                             OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy,
                             std::shared_ptr<OnDiskCASLogger> Logger)
    : Index(std::move(Index)), DataPool(std::move(DataPool)),
      RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy),
      Logger(std::move(Logger)) {
  /// Lifetime for "big" objects not in DataPool.
  ///
  /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something
  /// simpler on the assumption there won't be much contention since most data
  /// is not big. If there is contention, and we've already fixed ObjectProxy
  /// object handles to be cheap enough to use consistently, the fix might be
  /// to use better use of them rather than optimizing this map.
  ///
  /// FIXME: Figure out the right number of shards, if any.
  // Stored as an opaque pointer; released in the destructor.
  StandaloneData = new StandaloneDataMapTy();
}
1854
1856 delete static_cast<StandaloneDataMapTy *>(StandaloneData);
1857}
1858
1859Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
1860 ObjectHandle UpstreamNode) {
1861 // Copies the full CAS tree from upstream. Uses depth-first copying to protect
1862 // against the process dying during importing and leaving the database with an
1863 // incomplete tree. Note that if the upstream has missing nodes then the tree
1864 // will be copied with missing nodes as well, it won't be considered an error.
1865 struct UpstreamCursor {
1867 size_t RefsCount;
1870 };
1871 /// Keeps track of the state of visitation for current node and all of its
1872 /// parents.
1874 /// Keeps track of the currently visited nodes as they are imported into
1875 /// primary database, from current node and its parents. When a node is
1876 /// entered for visitation it appends its own ID, then appends referenced IDs
1877 /// as they get imported. When a node is fully imported it removes the
1878 /// referenced IDs from the bottom of the stack which leaves its own ID at the
1879 /// bottom, adding to the list of referenced IDs for the parent node.
1880 SmallVector<ObjectID, 128> PrimaryNodesStack;
1881
1882 auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
1883 PrimaryNodesStack.push_back(PrimaryID);
1884 if (!Node)
1885 return;
1886 auto Refs = UpstreamDB->getObjectRefs(*Node);
1887 CursorStack.push_back(
1888 {*Node, (size_t)llvm::size(Refs), Refs.begin(), Refs.end()});
1889 };
1890
1891 enqueueNode(PrimaryID, UpstreamNode);
1892
1893 while (!CursorStack.empty()) {
1894 UpstreamCursor &Cur = CursorStack.back();
1895 if (Cur.RefI == Cur.RefE) {
1896 // Copy the node data into the primary store.
1897
1898 // The bottom of \p PrimaryNodesStack contains the primary ID for the
1899 // current node plus the list of imported referenced IDs.
1900 assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
1901 ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
1902 auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
1903 .slice(PrimaryNodesStack.size() - Cur.RefsCount);
1904 if (Error E = importUpstreamData(PrimaryID, PrimaryRefs, Cur.Node))
1905 return E;
1906 // Remove the current node and its IDs from the stack.
1907 PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
1908 CursorStack.pop_back();
1909 continue;
1910 }
1911
1912 ObjectID UpstreamID = *(Cur.RefI++);
1913 auto PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID));
1914 if (LLVM_UNLIKELY(!PrimaryID))
1915 return PrimaryID.takeError();
1916 if (containsObject(*PrimaryID, /*CheckUpstream=*/false)) {
1917 // This \p ObjectID already exists in the primary. Either it was imported
1918 // via \p importFullTree or the client created it, in which case the
1919 // client takes responsibility for how it was formed.
1920 enqueueNode(*PrimaryID, std::nullopt);
1921 continue;
1922 }
1923 Expected<std::optional<ObjectHandle>> UpstreamNode =
1924 UpstreamDB->load(UpstreamID);
1925 if (!UpstreamNode)
1926 return UpstreamNode.takeError();
1927 enqueueNode(*PrimaryID, *UpstreamNode);
1928 }
1929
1930 assert(PrimaryNodesStack.size() == 1);
1931 assert(PrimaryNodesStack.front() == PrimaryID);
1932 return Error::success();
1933}
1934
1935Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
1936 ObjectHandle UpstreamNode) {
1937 // Copies only a single node, it doesn't copy the referenced nodes.
1938
1939 auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
1941 Refs.reserve(llvm::size(UpstreamRefs));
1942 for (ObjectID UpstreamRef : UpstreamRefs) {
1943 auto Ref = getReference(UpstreamDB->getDigest(UpstreamRef));
1944 if (LLVM_UNLIKELY(!Ref))
1945 return Ref.takeError();
1946 Refs.push_back(*Ref);
1947 }
1948
1949 return importUpstreamData(PrimaryID, Refs, UpstreamNode);
1950}
1951
1952Error OnDiskGraphDB::importUpstreamData(ObjectID PrimaryID,
1953 ArrayRef<ObjectID> PrimaryRefs,
1954 ObjectHandle UpstreamNode) {
1955 // If there are references we can't copy an upstream's standalone file because
1956 // we need to re-resolve the reference offsets it contains.
1957 if (PrimaryRefs.empty()) {
1958 auto FBData = UpstreamDB->getInternalFileBackedObjectData(UpstreamNode);
1959 if (FBData.FileInfo.has_value()) {
1960 // Disk-space optimization, import the file directly since it is a
1961 // standalone leaf.
1962 return storeFile(
1963 PrimaryID, FBData.FileInfo->FilePath,
1964 /*InternalUpstreamImport=*/FBData.FileInfo->IsFileNulTerminated
1965 ? InternalUpstreamImportKind::Leaf0
1966 : InternalUpstreamImportKind::Leaf);
1967 }
1968 }
1969
1970 auto Data = UpstreamDB->getObjectData(UpstreamNode);
1971 return store(PrimaryID, PrimaryRefs, Data);
1972}
1973
1974Expected<std::optional<ObjectHandle>>
1975OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
1976 if (!UpstreamDB)
1977 return std::nullopt;
1978
1979 auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));
1980 if (LLVM_UNLIKELY(!UpstreamID))
1981 return UpstreamID.takeError();
1982
1983 Expected<std::optional<ObjectHandle>> UpstreamNode =
1984 UpstreamDB->load(*UpstreamID);
1985 if (!UpstreamNode)
1986 return UpstreamNode.takeError();
1987 if (!*UpstreamNode)
1988 return std::nullopt;
1989
1990 if (Error E = FIPolicy == FaultInPolicy::SingleNode
1991 ? importSingleNode(PrimaryID, **UpstreamNode)
1992 : importFullTree(PrimaryID, **UpstreamNode))
1993 return std::move(E);
1994 return load(PrimaryID);
1995}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Mark last scratch load
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_UNLIKELY(EXPR)
Definition Compiler.h:336
This file defines the DenseMap class.
static cl::opt< int > PageSize("imp-null-check-page-size", cl::desc("The page size of the target in bytes"), cl::init(4096), cl::Hidden)
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
#define I(x, y, z)
Definition MD5.cpp:57
#define H(x, y, z)
Definition MD5.cpp:56
This file declares interface for OnDiskCASLogger, an interface that can be used to log CAS events to ...
This file declares interface for OnDiskDataAllocator, a file backed data pool can be used to allocate...
static constexpr StringLiteral FilePrefixLeaf0
static constexpr StringLiteral DataPoolTableName
static constexpr StringLiteral FilePrefixObject
static constexpr StringLiteral FilePrefixLeaf
static constexpr StringLiteral IndexFilePrefix
static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool, ObjectHandle OH)
static constexpr StringLiteral DataPoolFilePrefix
static Error createCorruptObjectError(Expected< ArrayRef< uint8_t > > ID)
static std::variant< const StandaloneDataInMemory *, DataRecordHandle > getStandaloneDataOrDataRecord(const OnDiskDataAllocator &DataPool, ObjectHandle OH)
static size_t getPageSize()
static void getStandalonePath(StringRef RootPath, StringRef Prefix, FileOffset IndexOffset, SmallVectorImpl< char > &Path)
static Expected< MappedTempFile > createTempFile(StringRef FinalPath, uint64_t Size, OnDiskCASLogger *Logger)
static constexpr StringLiteral IndexTableName
This declares OnDiskGraphDB, an ondisk CAS database with a fixed length hash.
This file declares interface for OnDiskTrieRawHashMap, a thread-safe and (mostly) lock-free hash map ...
#define P(N)
Provides a library for accessing information about this process and other processes on the operating ...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static Split data
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
The Input class is used to parse a yaml document into in-memory structs and vectors.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:201
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
const T * data() const
Definition ArrayRef.h:139
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
Logging utility - given an ordered specification of features, and assuming a scalar reward,...
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void reserve(size_type N)
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
iterator begin() const
Definition StringRef.h:113
iterator end() const
Definition StringRef.h:115
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
uint64_t get() const
Definition FileOffset.h:26
Handle to a loaded object in a ObjectStore instance.
LLVM_ABI_FOR_TEST Expected< ArrayRef< char > > get(FileOffset Offset, size_t Size) const
Get the data of Size stored at the given Offset.
static LLVM_ABI_FOR_TEST Expected< OnDiskDataAllocator > create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, uint32_t UserHeaderSize=0, std::shared_ptr< ondisk::OnDiskCASLogger > Logger=nullptr, function_ref< void(void *)> UserHeaderInit=nullptr)
LLVM_ABI_FOR_TEST size_t size() const
OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps.
static LLVM_ABI_FOR_TEST Expected< OnDiskTrieRawHashMap > create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, std::shared_ptr< ondisk::OnDiskCASLogger > Logger=nullptr, std::optional< size_t > NewTableNumRootBits=std::nullopt, std::optional< size_t > NewTableNumSubtrieBits=std::nullopt)
Gets or creates a file at Path with a hash-mapped trie named TrieName.
static std::optional< InternalRef4B > tryToShrink(InternalRef Ref)
Shrink to 4B reference.
Array of internal node references.
Standard 8 byte reference inside OnDiskGraphDB.
static InternalRef getFromOffset(FileOffset Offset)
Handle for a loaded node object.
static ObjectHandle fromFileOffset(FileOffset Offset)
static ObjectHandle fromMemory(uintptr_t Ptr)
Reference to a node.
uint64_t getOpaqueData() const
Interface for logging low-level on-disk cas operations.
On-disk CAS nodes database, independent of a particular hashing algorithm.
FaultInPolicy
How to fault-in nodes if an upstream database is used.
@ SingleNode
Copy only the requested node.
void print(raw_ostream &OS) const
LLVM_ABI_FOR_TEST Error validateObjectID(ObjectID ID) const
Checks that ID exists in the index.
LLVM_ABI_FOR_TEST Expected< std::optional< ObjectHandle > > load(ObjectID Ref)
Expected< bool > isMaterialized(ObjectID Ref)
Check whether the object associated with Ref is stored in the CAS.
Error validate(bool Deep, HashingFuncT Hasher) const
Validate the OnDiskGraphDB.
object_refs_range getObjectRefs(ObjectHandle Node) const
unsigned getHardStorageLimitUtilization() const
LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef< ObjectID > Refs, ArrayRef< char > Data)
Associate data & references with a particular object ID.
ArrayRef< uint8_t > getDigest(ObjectID Ref) const
FileBackedData getInternalFileBackedObjectData(ObjectHandle Node) const
Provides access to the underlying file path, that represents an object leaf node, when available.
LLVM_ABI_FOR_TEST Error storeFile(ObjectID ID, StringRef FilePath)
Associates the data of a file with a particular object ID.
LLVM_ABI_FOR_TEST size_t getStorageSize() const
static LLVM_ABI_FOR_TEST Expected< std::unique_ptr< OnDiskGraphDB > > open(StringRef Path, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB *UpstreamDB=nullptr, std::shared_ptr< OnDiskCASLogger > Logger=nullptr, FaultInPolicy Policy=FaultInPolicy::FullTree)
Open the on-disk store from a directory.
bool containsObject(ObjectID Ref, bool CheckUpstream=true) const
Check whether the object associated with Ref is stored in the CAS.
LLVM_ABI_FOR_TEST Expected< ObjectID > getReference(ArrayRef< uint8_t > Hash)
Form a reference for the provided hash.
function_ref< void( ArrayRef< ArrayRef< uint8_t > >, ArrayRef< char >, SmallVectorImpl< uint8_t > &)> HashingFuncT
Hashing function type for validation.
LLVM_ABI_FOR_TEST ArrayRef< char > getObjectData(ObjectHandle Node) const
LLVM_ABI_FOR_TEST std::optional< ObjectID > getExistingReference(ArrayRef< uint8_t > Digest, bool CheckUpstream=true)
Get an existing reference to the object Digest.
Helper RAII class for copying a file to a unique file path.
Error renameTo(StringRef RenameToPath)
Rename the new unique file to RenameToPath.
Expected< StringRef > createAndCopyFrom(StringRef ParentPath, StringRef CopyFromPath)
Create a new unique file path under ParentPath and copy the contents of CopyFromPath into it.
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
static unsigned getPageSizeEstimate()
Get the process's estimated page size.
Definition Process.h:62
LLVM_ABI Error keep(const Twine &Name)
static LLVM_ABI Expected< TempFile > create(const Twine &Model, unsigned Mode=all_read|all_write, OpenFlags ExtraFlags=OF_None)
This creates a temporary file with createUniqueFile and schedules it for deletion with sys::RemoveFil...
Represents the result of a call to sys::fs::status().
Definition FileSystem.h:222
This class represents a memory mapped file.
LLVM_ABI size_t size() const
Definition Path.cpp:1183
@ readonly
May only access map via const_data as read only.
@ readwrite
May access map via data and modify it. Written to path.
LLVM_ABI char * data() const
Definition Path.cpp:1188
#define UINT64_MAX
Definition DataTypes.h:77
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
constexpr StringLiteral CASFormatVersion
The version for all the ondisk database files.
Expected< std::optional< uint64_t > > getOverriddenMaxMappingSize()
Retrieves an overridden maximum mapping size for CAS files, if any, speicified by LLVM_CAS_MAX_MAPPIN...
Expected< size_t > preallocateFileTail(int FD, size_t CurrentSize, size_t NewSize)
Allocate space for the file FD on disk, if the filesystem supports it.
bool useSmallMappingSize(const Twine &Path)
Whether to use a small file mapping for ondisk databases created in Path.
uint64_t getDataSize(const FuncRecordTy *Record)
Return the coverage map data size for the function.
uint64_t read64le(const void *P)
Definition Endian.h:435
void write64le(void *P, uint64_t V)
Definition Endian.h:478
void write32le(void *P, uint32_t V)
Definition Endian.h:475
uint32_t read32le(const void *P)
Definition Endian.h:432
LLVM_ABI std::error_code closeFile(file_t &F)
Close the file object.
LLVM_ABI std::error_code rename(const Twine &from, const Twine &to)
Rename from to to.
std::error_code resize_file_before_mapping_readwrite(int FD, uint64_t Size)
Resize FD to Size before mapping mapped_file_region::readwrite.
Definition FileSystem.h:432
LLVM_ABI bool exists(const basic_file_status &status)
Does file exist?
Definition Path.cpp:1091
@ OF_Append
The file should be opened in append mode.
Definition FileSystem.h:789
LLVM_ABI std::error_code createUniqueFile(const Twine &Model, int &ResultFD, SmallVectorImpl< char > &ResultPath, OpenFlags Flags=OF_None, unsigned Mode=all_read|all_write)
Create a uniquely named file.
Definition Path.cpp:875
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
@ CD_OpenExisting
CD_OpenExisting - When opening a file:
Definition FileSystem.h:759
LLVM_ABI Expected< file_t > openNativeFileForRead(const Twine &Name, OpenFlags Flags=OF_None, SmallVectorImpl< char > *RealPath=nullptr)
Opens the file with the given name in a read-only mode, returning its open file descriptor.
LLVM_ABI std::error_code create_directories(const Twine &path, bool IgnoreExisting=true, perms Perms=owner_all|group_all)
Create all the non-existent directories in path.
Definition Path.cpp:977
LLVM_ABI file_t convertFDToNativeFile(int FD)
Converts from a Posix file descriptor number to a native file handle.
LLVM_ABI std::error_code status(const Twine &path, file_status &result, bool follow=true)
Get file status as if by POSIX stat().
std::error_code file_size(const Twine &Path, uint64_t &Result)
Get file size.
Definition FileSystem.h:706
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
ScopedSetting scopedDisable()
Definition IOSandbox.h:36
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
Error createFileError(const Twine &F, Error E)
Concatenate a source file path and/or name with an Error.
Definition Error.h:1422
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
ArrayRef< CharT > arrayRefFromStringRef(StringRef Input)
Construct a string ref from an array ref of unsigned chars.
std::error_code make_error_code(BitcodeError E)
@ Done
Definition Threading.h:60
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition Alignment.h:134
Error handleErrors(Error E, HandlerTs &&... Hs)
Pass the ErrorInfo(s) contained in E to their respective handlers.
Definition Error.h:990
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:299
std::string utohexstr(uint64_t X, bool LowerCase=false, unsigned Width=0)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Error createStringError(std::error_code EC, char const *Fmt, const Ts &... Vals)
Create formatted StringError object.
Definition Error.h:1328
@ argument_out_of_domain
Definition Errc.h:37
@ illegal_byte_sequence
Definition Errc.h:52
@ invalid_argument
Definition Errc.h:56
std::optional< T > expectedToOptional(Expected< T > &&E)
Convert an Expected to an Optional without doing anything.
Definition Error.h:1117
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ Other
Any other memory.
Definition ModRef.h:68
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
void toHex(ArrayRef< uint8_t > Input, bool LowerCase, SmallVectorImpl< char > &Output)
Convert buffer Input to its hexadecimal representation. The returned string is double the size of Inp...
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
LLVM_ABI Error errorCodeToError(std::error_code EC)
Helper for converting an std::error_code to a Error.
Definition Error.cpp:107
void consumeError(Error Err)
Consume a Error without doing anything.
Definition Error.h:1106
bool isAddrAligned(Align Lhs, const void *Addr)
Checks that Addr is a multiple of the alignment.
Definition Alignment.h:139
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
Proxy for an on-disk index record.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr Align Of()
Allow constructions of constexpr Align from types.
Definition Alignment.h:94
Const value proxy to access the records stored in TrieRawHashMap.
Value proxy to access the records stored in TrieRawHashMap.
Encapsulates file info for an underlying object node.