LLVM 22.0.0git
OnDiskGraphDB.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This declares OnDiskGraphDB, an ondisk CAS database with a fixed length
11/// hash. This is the class that implements the database storage scheme without
12/// exposing the hashing algorithm.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKGRAPHDB_H
17#define LLVM_CAS_ONDISKGRAPHDB_H
18
22
24
25/// Standard 8 byte reference inside OnDiskGraphDB.
26class InternalRef {
27public:
28 FileOffset getFileOffset() const { return FileOffset(Data); }
29 uint64_t getRawData() const { return Data; }
30
31 static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
32 static InternalRef getFromOffset(FileOffset Offset) {
33 return InternalRef(Offset.get());
34 }
35
36 friend bool operator==(InternalRef LHS, InternalRef RHS) {
37 return LHS.Data == RHS.Data;
38 }
39
40private:
42 InternalRef(uint64_t Data) : Data(Data) {}
44};
45
46/// Compact 4 byte reference inside OnDiskGraphDB for smaller references.
47class InternalRef4B {
48public:
49 FileOffset getFileOffset() const { return FileOffset(Data); }
50 uint32_t getRawData() const { return Data; }
51
52 /// Shrink to 4B reference.
53 static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
54 uint64_t Offset = Ref.getRawData();
55 if (Offset > UINT32_MAX)
56 return std::nullopt;
57 return InternalRef4B(Offset);
58 }
59
60 operator InternalRef() const {
62 }
63
64private:
65 friend class InternalRef;
66 InternalRef4B(uint32_t Data) : Data(Data) {}
68};
69
70/// Array of internal node references.
72public:
73 size_t size() const { return Size; }
74 bool empty() const { return !Size; }
75
77 : public iterator_facade_base<iterator, std::random_access_iterator_tag,
78 const InternalRef> {
79 public:
80 bool operator==(const iterator &RHS) const { return I == RHS.I; }
83 return *Ref;
85 }
101 if (auto *Ref = dyn_cast<const InternalRef *>(I))
102 I = Ref + N;
103 else
105 return *this;
106 }
108 if (auto *Ref = dyn_cast<const InternalRef *>(I))
109 I = Ref - N;
110 else
112 return *this;
113 }
114 InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
115
116 iterator() = default;
117
118 uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
119
121 return iterator(
123 const InternalRef4B *>::getFromOpaqueValue((void *)
124 Opaque));
125 }
126
127 private:
129 explicit iterator(
131 : I(I) {}
133 };
134
135 bool operator==(const InternalRefArrayRef &RHS) const {
136 return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
137 }
138
139 iterator begin() const { return iterator(Begin); }
140 iterator end() const { return begin() + Size; }
141
142 /// Array accessor.
143 InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
144
145 bool is4B() const { return isa<const InternalRef4B *>(Begin); }
146 bool is8B() const { return isa<const InternalRef *>(Begin); }
147
149 if (is4B()) {
150 auto *B = cast<const InternalRef4B *>(Begin);
151 return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
152 }
153 auto *B = cast<const InternalRef *>(Begin);
154 return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
155 }
156
157 InternalRefArrayRef(std::nullopt_t = std::nullopt) {
158 // This is useful so that all the casts in the \p iterator functions can
159 // operate without needing to check for a null value.
160 static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
161 Begin = &PlaceHolder;
162 }
163
165 : Begin(Refs.begin()), Size(Refs.size()) {}
166
168 : Begin(Refs.begin()), Size(Refs.size()) {}
169
170private:
172 size_t Size = 0;
173};
174
175/// Reference to a node. The node's data may not be stored in the database.
176/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
177/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
178/// comparable.
179class ObjectID {
180public:
181 uint64_t getOpaqueData() const { return Opaque; }
182
183 static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
184
185 friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
186 return LHS.Opaque == RHS.Opaque;
187 }
188 friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
189 return !(LHS == RHS);
190 }
191
192private:
193 explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
194 uint64_t Opaque;
195};
196
197/// Handle for a loaded node object.
199public:
200 explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
201 uint64_t getOpaqueData() const { return Opaque; }
202
204 static ObjectHandle fromMemory(uintptr_t Ptr);
205
206 friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
207 return LHS.Opaque == RHS.Opaque;
208 }
209 friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
210 return !(LHS == RHS);
211 }
212
213private:
214 uint64_t Opaque;
215};
216
217/// Iterator for ObjectID.
219 : public iterator_facade_base<object_refs_iterator,
220 std::random_access_iterator_tag, ObjectID> {
221public:
222 bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
224 return ObjectID::fromOpaqueData((*I).getRawData());
225 }
226 bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
228 return I - RHS.I;
229 }
231 I += N;
232 return *this;
233 }
235 I -= N;
236 return *this;
237 }
238 ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
239
242
243 uint64_t getOpaqueData() const { return I.getOpaqueData(); }
244
248
249private:
251};
252
254
255/// On-disk CAS nodes database, independent of a particular hashing algorithm.
256class OnDiskGraphDB {
257public:
258 /// Associate data & references with a particular object ID. If there is
259 /// already a record for this object the operation is a no-op. \param ID the
260 /// object ID to associate the data & references with. \param Refs references
261 /// \param Data data buffer.
263
264 /// \returns \p nullopt if the object associated with \p Ref does not exist.
266
267 /// \returns the hash bytes digest for the object reference.
269 // ObjectID should be valid to fetch Digest.
270 return cantFail(getDigest(getInternalRef(Ref)));
271 }
272
273 /// Form a reference for the provided hash. The reference can be used as part
274 /// of a CAS object even if it's not associated with an object yet.
276
277 /// Get an existing reference to the object \p Digest.
278 ///
279 /// Returns \p nullopt if the object is not stored in this CAS.
280 std::optional<ObjectID> getExistingReference(ArrayRef<uint8_t> Digest);
281
282 /// Check whether the object associated with \p Ref is stored in the CAS.
283 /// Note that this function will fault-in according to the policy.
285
286 /// Check whether the object associated with \p Ref is stored in the CAS.
287 /// Note that this function does not fault-in.
289 return containsObject(Ref, /*CheckUpstream=*/true);
290 }
291
292 /// \returns the data part of the provided object handle.
294
295 /// \returns the object referenced by the provided object handle.
297 InternalRefArrayRef Refs = getInternalRefs(Node);
298 return make_range(Refs.begin(), Refs.end());
299 }
300
301 /// \returns Total size of stored objects.
302 ///
303 /// NOTE: There's a possibility that the returned size is not including a
304 /// large object if the process crashed right at the point of inserting it.
305 size_t getStorageSize() const;
306
307 /// \returns The precentage of space utilization of hard space limits.
308 ///
309 /// Return value is an integer between 0 and 100 for percentage.
310 unsigned getHardStorageLimitUtilization() const;
311
312 void print(raw_ostream &OS) const;
313
314 /// Hashing function type for validation.
317
318 /// Validate the OnDiskGraphDB.
319 ///
320 /// \param Deep if true, rehash all the objects to ensure no data
321 /// corruption in stored objects, otherwise just validate the structure of
322 /// CAS database.
323 /// \param Hasher is the hashing function used for objects inside CAS.
324 Error validate(bool Deep, HashingFuncT Hasher) const;
325
326 /// How to fault-in nodes if an upstream database is used.
327 enum class FaultInPolicy {
328 /// Copy only the requested node.
330 /// Copy the the entire graph of a node.
332 };
333
334 /// Open the on-disk store from a directory.
335 ///
336 /// \param Path directory for the on-disk store. The directory will be created
337 /// if it doesn't exist.
338 /// \param HashName Identifier name for the hashing algorithm that is going to
339 /// be used.
340 /// \param HashByteSize Size for the object digest hash bytes.
341 /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
342 /// if they don't exist in the primary store. The upstream store is only used
343 /// for reading nodes, new nodes are only written to the primary store.
344 /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
345 /// to primary store. This is recorded at creation time and subsequent opens
346 /// need to pass the same policy otherwise the \p open will fail.
348 open(StringRef Path, StringRef HashName, unsigned HashByteSize,
349 std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
350 FaultInPolicy Policy = FaultInPolicy::FullTree);
351
353
354private:
355 /// Forward declaration for a proxy for an ondisk index record.
356 struct IndexProxy;
357
358 enum class ObjectPresence {
359 Missing,
360 InPrimaryDB,
361 OnlyInUpstreamDB,
362 };
363
364 /// Check if object exists and if it is on upstream only.
365 Expected<ObjectPresence> getObjectPresence(ObjectID Ref,
366 bool CheckUpstream) const;
367
368 /// \returns true if object can be found in database.
369 bool containsObject(ObjectID Ref, bool CheckUpstream) const {
370 auto Presence = getObjectPresence(Ref, CheckUpstream);
371 if (!Presence) {
372 consumeError(Presence.takeError());
373 return false;
374 }
375 switch (*Presence) {
376 case ObjectPresence::Missing:
377 return false;
378 case ObjectPresence::InPrimaryDB:
379 return true;
380 case ObjectPresence::OnlyInUpstreamDB:
381 return true;
382 }
383 llvm_unreachable("Unknown ObjectPresence enum");
384 }
385
386 /// When \p load is called for a node that doesn't exist, this function tries
387 /// to load it from the upstream store and copy it to the primary one.
388 Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
389
390 /// Import the entire tree from upstream with \p UpstreamNode as root.
391 Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
392 /// Import only the \param UpstreamNode.
393 Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
394
395 /// Found the IndexProxy for the hash.
396 Expected<IndexProxy> indexHash(ArrayRef<uint8_t> Hash);
397
398 /// Get path for creating standalone data file.
399 void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
400 SmallVectorImpl<char> &Path) const;
401 /// Create a standalone leaf file.
402 Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
403
404 /// \name Helper functions for internal data structures.
405 /// \{
406 static InternalRef getInternalRef(ObjectID Ref) {
407 return InternalRef::getFromRawData(Ref.getOpaqueData());
408 }
409
410 static ObjectID getExternalReference(InternalRef Ref) {
411 return ObjectID::fromOpaqueData(Ref.getRawData());
412 }
413
414 static ObjectID getExternalReference(const IndexProxy &I);
415
416 static InternalRef makeInternalRef(FileOffset IndexOffset);
417
418 Expected<ArrayRef<uint8_t>> getDigest(InternalRef Ref) const;
419
420 ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
421
422 Expected<IndexProxy> getIndexProxyFromRef(InternalRef Ref) const;
423
425 getIndexProxyFromPointer(OnDiskTrieRawHashMap::ConstOnDiskPtr P) const;
426
427 InternalRefArrayRef getInternalRefs(ObjectHandle Node) const;
428 /// \}
429
430 /// Get the atomic variable that keeps track of the standalone data storage
431 /// size.
432 std::atomic<uint64_t> &standaloneStorageSize() const;
433
434 /// Increase the standalone data size.
435 void recordStandaloneSizeIncrease(size_t SizeIncrease);
436 /// Get the standalone data size.
437 uint64_t getStandaloneStorageSize() const;
438
439 // Private constructor.
440 OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
441 OnDiskDataAllocator DataPool,
442 std::unique_ptr<OnDiskGraphDB> UpstreamDB,
443 FaultInPolicy Policy);
444
445 /// Mapping from hash to object reference.
446 ///
447 /// Data type is TrieRecord.
448 OnDiskTrieRawHashMap Index;
449
450 /// Storage for most objects.
451 ///
452 /// Data type is DataRecordHandle.
453 OnDiskDataAllocator DataPool;
454
455 /// A StandaloneDataMap.
456 void *StandaloneData = nullptr;
457
458 /// Path to the root directory.
459 std::string RootPath;
460
461 /// Optional on-disk store to be used for faulting-in nodes.
462 std::unique_ptr<OnDiskGraphDB> UpstreamDB;
463
464 /// The policy used to fault in data from upstream.
465 FaultInPolicy FIPolicy;
466};
467
468} // namespace llvm::cas::ondisk
469
470#endif // LLVM_CAS_ONDISKGRAPHDB_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Mark last scratch load
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define I(x, y, z)
Definition MD5.cpp:58
This file declares interface for OnDiskDataAllocator, a file backed data pool can be used to allocate...
This file declares interface for OnDiskTrieRawHashMap, a thread-safe and (mostly) lock-free hash map ...
#define P(N)
This file defines the PointerUnion class, which is a discriminated union of pointer types.
Value * RHS
Value * LHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
Compact 4 byte reference inside OnDiskGraphDB for smaller references.
static std::optional< InternalRef4B > tryToShrink(InternalRef Ref)
Shrink to 4B reference.
ptrdiff_t operator-(const iterator &RHS) const
bool operator==(const iterator &RHS) const
static iterator fromOpaqueData(uint64_t Opaque)
bool operator<(const iterator &RHS) const
Array of internal node references.
InternalRef operator[](ptrdiff_t N) const
Array accessor.
ArrayRef< uint8_t > getBuffer() const
InternalRefArrayRef(std::nullopt_t=std::nullopt)
InternalRefArrayRef(ArrayRef< InternalRef4B > Refs)
bool operator==(const InternalRefArrayRef &RHS) const
InternalRefArrayRef(ArrayRef< InternalRef > Refs)
Standard 8 byte reference inside OnDiskGraphDB.
friend bool operator==(InternalRef LHS, InternalRef RHS)
FileOffset getFileOffset() const
static InternalRef getFromRawData(uint64_t Data)
static InternalRef getFromOffset(FileOffset Offset)
Handle for a loaded node object.
static ObjectHandle fromFileOffset(FileOffset Offset)
static ObjectHandle fromMemory(uintptr_t Ptr)
friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS)
friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS)
Reference to a node.
friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS)
uint64_t getOpaqueData() const
friend bool operator==(const ObjectID &LHS, const ObjectID &RHS)
static ObjectID fromOpaqueData(uint64_t Opaque)
static Expected< std::unique_ptr< OnDiskGraphDB > > open(StringRef Path, StringRef HashName, unsigned HashByteSize, std::unique_ptr< OnDiskGraphDB > UpstreamDB=nullptr, FaultInPolicy Policy=FaultInPolicy::FullTree)
Open the on-disk store from a directory.
FaultInPolicy
How to fault-in nodes if an upstream database is used.
@ FullTree
Copy the the entire graph of a node.
@ SingleNode
Copy only the requested node.
void print(raw_ostream &OS) const
Expected< bool > isMaterialized(ObjectID Ref)
Check whether the object associated with Ref is stored in the CAS.
Error validate(bool Deep, HashingFuncT Hasher) const
Validate the OnDiskGraphDB.
bool containsObject(ObjectID Ref) const
Check whether the object associated with Ref is stored in the CAS.
object_refs_range getObjectRefs(ObjectHandle Node) const
unsigned getHardStorageLimitUtilization() const
Error store(ObjectID ID, ArrayRef< ObjectID > Refs, ArrayRef< char > Data)
Associate data & references with a particular object ID.
ArrayRef< uint8_t > getDigest(ObjectID Ref) const
std::optional< ObjectID > getExistingReference(ArrayRef< uint8_t > Digest)
Get an existing reference to the object Digest.
Expected< ObjectID > getReference(ArrayRef< uint8_t > Hash)
Form a reference for the provided hash.
function_ref< void( ArrayRef< ArrayRef< uint8_t > >, ArrayRef< char >, SmallVectorImpl< uint8_t > &)> HashingFuncT
Hashing function type for validation.
ArrayRef< char > getObjectData(ObjectHandle Node) const
object_refs_iterator & operator-=(ptrdiff_t N)
bool operator<(const object_refs_iterator &RHS) const
object_refs_iterator & operator+=(ptrdiff_t N)
ptrdiff_t operator-(const object_refs_iterator &RHS) const
bool operator==(const object_refs_iterator &RHS) const
ObjectID operator[](ptrdiff_t N) const
static object_refs_iterator fromOpaqueData(uint64_t Opaque)
object_refs_iterator(InternalRefArrayRef::iterator I)
An efficient, type-erasing, non-owning reference to a callable.
CRTP base class which implements the entire standard iterator facade in terms of a minimal subset of ...
Definition iterator.h:80
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
llvm::iterator_range< object_refs_iterator > object_refs_range
@ Offset
Definition DWP.cpp:477
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void consumeError(Error Err)
Consume a Error without doing anything.
Definition Error.h:1083
#define N
Proxy for an on-disk index record.