LLVM 22.0.0git
OnDiskTrieRawHashMap.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file declares interface for OnDiskTrieRawHashMap, a thread-safe and
11/// (mostly) lock-free hash map stored as trie and backed by persistent files on
12/// disk.
13///
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H
17#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H
18
19#include "llvm/ADT/ArrayRef.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/StringRef.h"
23#include "llvm/CAS/FileOffset.h"
24#include "llvm/Support/Error.h"
25#include <optional>
26
27namespace llvm {
28
29class raw_ostream;
30
31namespace cas {
32
33/// OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps.
34/// The keys are fixed length, and are expected to be binary hashes with a
35/// normal distribution.
36///
37/// - Thread-safety is achieved through the use of atomics within a shared
38/// memory mapping. Atomic access does not work on networked filesystems.
39/// - Filesystem locks are used, but only sparingly:
40/// - during initialization, for creating / opening an existing store;
41/// - for the lifetime of the instance, a shared/reader lock is held
42/// - during destruction, if there are no concurrent readers, to shrink the
43/// files to their minimum size.
44/// - Path is used as a directory:
45/// - "index" stores the root trie and subtries.
46/// - "data" stores (most of) the entries, like a bump-ptr-allocator.
47/// - Large entries are stored externally in a file named by the key.
48/// - Code is system-dependent and binary format itself is not portable. These
49/// are not artifacts that can/should be moved between different systems; they
50/// are only appropriate for local storage.
52public:
53 LLVM_DUMP_METHOD void dump() const;
54 void
56 function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
57
58public:
59 /// Const value proxy to access the records stored in TrieRawHashMap.
70
71 /// Value proxy to access the records stored in TrieRawHashMap.
82
83 /// Validate the trie data structure.
84 ///
85 /// Callback receives the file offset to the data entry and the data stored.
87 function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
88
89 /// Check the valid range of file offset for OnDiskTrieRawHashMap.
91 return Offset.get() < (1LL << 48);
92 }
93
94public:
95 /// Template class to implement a `pointer` type into the trie data structure.
96 ///
97 /// It provides pointer-like operation, e.g., dereference to get underlying
98 /// data. It also reserves the top 16 bits of the pointer value, which can be
99 /// used to pack additional information if needed.
100 template <class ProxyT> class PointerImpl {
101 public:
104 }
105
106 explicit operator bool() const { return IsValue; }
107
108 const ProxyT &operator*() const {
110 return Value;
111 }
112 const ProxyT *operator->() const {
114 return &Value;
115 }
116
117 PointerImpl() = default;
118
119 protected:
126
127 ProxyT Value;
130
131 // True if points to a value (not a "nullptr"). Use an extra field because
132 // 0 can be a valid offset.
133 bool IsValue = false;
134 };
135
136 class pointer;
137 class const_pointer : public PointerImpl<ConstValueProxy> {
138 public:
139 const_pointer() = default;
140
141 private:
142 friend class pointer;
144 using const_pointer::PointerImpl::PointerImpl;
145 };
146
147 class pointer : public PointerImpl<ValueProxy> {
148 public:
149 operator const_pointer() const {
151 }
152
153 pointer() = default;
154
155 private:
157 using pointer::PointerImpl::PointerImpl;
158 };
159
160 /// Find the value from hash.
161 ///
162 /// \returns pointer to the value if exists, otherwise returns a non-value
163 /// pointer that evaluates to `false` when convert to boolean.
165
166 /// Helper function to recover a pointer into the trie from file offset.
168
170 function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
172 function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
173 FileOffset FinalOffset, ValueProxy FinalValue)>;
174
175 /// Insert lazily.
176 ///
177 /// \p OnConstruct is called when ready to insert a value, after allocating
178 /// space for the data. It is called at most once.
179 ///
180 /// \p OnLeak is called only if \p OnConstruct has been called and a race
181 /// occurred before insertion, causing the tentative offset and data to be
182 /// abandoned. This allows clients to clean up other results or update any
183 /// references.
184 ///
185 /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
186 /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize
187 /// simultaneous writes, but that seems dangerous to use in a memory-mapped
188 /// file in case a process crashes in the busy state.
190 LazyInsertOnConstructCB OnConstruct = nullptr,
191 LazyInsertOnLeakCB OnLeak = nullptr);
192
194 return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) {
195 assert(Allocated.Hash == Value.Hash);
196 assert(Allocated.Data.size() == Value.Data.size());
197 llvm::copy(Value.Data, Allocated.Data.begin());
198 });
199 }
200
201 size_t size() const;
202 size_t capacity() const;
203
204 /// Gets or creates a file at \p Path with a hash-mapped trie named \p
205 /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
206 /// data of size \p DataSize (in bytes).
207 ///
208 /// \p MaxFileSize controls the maximum file size to support, limiting the
209 /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
210 /// size if a new file is created.
211 ///
212 /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
213 /// configure the trie, if it doesn't already exist.
214 ///
215 /// \pre NumHashBits is a multiple of 8 (byte-aligned).
217 create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
218 uint64_t DataSize, uint64_t MaxFileSize,
219 std::optional<uint64_t> NewFileInitialSize,
220 std::optional<size_t> NewTableNumRootBits = std::nullopt,
221 std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
222
226
227private:
228 struct ImplType;
229 explicit OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl);
230 std::unique_ptr<ImplType> Impl;
231};
232
233} // namespace cas
234} // namespace llvm
235
236#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file declares interface for FileOffset that represent stored data at an offset from the beginnin...
This file contains some templates that are useful if you are working with the STL at all.
Value * RHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue=true)
const_pointer find(ArrayRef< uint8_t > Hash) const
Find the value from hash.
OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS)
Expected< pointer > insertLazy(ArrayRef< uint8_t > Hash, LazyInsertOnConstructCB OnConstruct=nullptr, LazyInsertOnLeakCB OnLeak=nullptr)
Insert lazily.
function_ref< void(FileOffset TentativeOffset, ValueProxy TentativeValue)> LazyInsertOnConstructCB
Error validate(function_ref< Error(FileOffset, ConstValueProxy)> RecordVerifier) const
Validate the trie data structure.
static Expected< OnDiskTrieRawHashMap > create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, std::optional< size_t > NewTableNumRootBits=std::nullopt, std::optional< size_t > NewTableNumSubtrieBits=std::nullopt)
Gets or creates a file at Path with a hash-mapped trie named TrieName.
LLVM_DUMP_METHOD void dump() const
Expected< const_pointer > recoverFromFileOffset(FileOffset Offset) const
Helper function to recover a pointer into the trie from file offset.
static bool validOffset(FileOffset Offset)
Check the valid range of file offset for OnDiskTrieRawHashMap.
OnDiskTrieRawHashMap & operator=(OnDiskTrieRawHashMap &&RHS)
Expected< pointer > insert(const ConstValueProxy &Value)
void print(raw_ostream &OS, function_ref< void(ArrayRef< char >)> PrintRecordData=nullptr) const
function_ref< void(FileOffset TentativeOffset, ValueProxy TentativeValue, FileOffset FinalOffset, ValueProxy FinalValue)> LazyInsertOnLeakCB
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:267
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Const value proxy to access the records stored in TrieRawHashMap.
ConstValueProxy(ArrayRef< uint8_t > Hash, StringRef Data)
ConstValueProxy(ArrayRef< uint8_t > Hash, ArrayRef< char > Data)
Value proxy to access the records stored in TrieRawHashMap.
ValueProxy(ArrayRef< uint8_t > Hash, MutableArrayRef< char > Data)