LLVM 23.0.0git
LowerMemIntrinsics.cpp
Go to the documentation of this file.
1//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
12#include "llvm/IR/IRBuilder.h"
14#include "llvm/IR/MDBuilder.h"
17#include "llvm/Support/Debug.h"
21#include <limits>
22#include <optional>
23
24#define DEBUG_TYPE "lower-mem-intrinsics"
25
26using namespace llvm;
27
28namespace llvm {
30}
31
32/// \returns \p Len urem \p OpSize, checking for optimization opportunities.
33/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize.
35 Value *OpSize, unsigned OpSizeVal) {
36 // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
37 if (isPowerOf2_32(OpSizeVal))
38 return B.CreateAnd(Len, OpSizeVal - 1);
39 return B.CreateURem(Len, OpSize);
40}
41
42/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
43/// opportunities.
44/// If \p RTLoopRemainder is provided, it must be the result of
45/// \c getRuntimeLoopRemainder() with the same arguments.
47 unsigned OpSizeVal,
48 Value *RTLoopRemainder = nullptr) {
49 if (!RTLoopRemainder)
50 RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal);
51 return B.CreateSub(Len, RTLoopRemainder);
52}
53
54namespace {
55/// Container for the return values of insertLoopExpansion.
56struct LoopExpansionInfo {
57 /// The instruction at the end of the main loop body.
58 Instruction *MainLoopIP = nullptr;
59
60 /// The unit index in the main loop body.
61 Value *MainLoopIndex = nullptr;
62
63 /// The instruction at the end of the residual loop body. Can be nullptr if no
64 /// residual is required.
65 Instruction *ResidualLoopIP = nullptr;
66
67 /// The unit index in the residual loop body. Can be nullptr if no residual is
68 /// required.
69 Value *ResidualLoopIndex = nullptr;
70};
71
72std::optional<uint64_t> getAverageMemOpLoopTripCount(const MemIntrinsic &I) {
74 return std::nullopt;
75 if (std::optional<uint64_t> EC = I.getFunction()->getEntryCount();
76 !EC || *EC == 0)
77 return std::nullopt;
78 if (const auto Len = I.getLengthInBytes())
79 return Len->getZExtValue();
80 uint64_t Total = 0;
82 getValueProfDataFromInst(I, InstrProfValueKind::IPVK_MemOPSize,
83 std::numeric_limits<uint32_t>::max(), Total);
84 if (!Total)
85 return std::nullopt;
86 uint64_t TripCount = 0;
87 for (const auto &P : ProfData)
88 TripCount += P.Count * P.Value;
89 return std::round(1.0 * TripCount / Total);
90}
91
92} // namespace
93
94/// Insert the control flow and loop counters for a memcpy/memset loop
95/// expansion.
96///
97/// This function inserts IR corresponding to the following C code before
98/// \p InsertBefore:
99/// \code
100/// LoopUnits = (Len / MainLoopStep) * MainLoopStep;
101/// ResidualUnits = Len - LoopUnits;
102/// MainLoopIndex = 0;
103/// if (LoopUnits > 0) {
104/// do {
105/// // MainLoopIP
106/// MainLoopIndex += MainLoopStep;
107/// } while (MainLoopIndex < LoopUnits);
108/// }
109/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) {
110/// ResidualLoopIndex = LoopUnits + i;
111/// // ResidualLoopIP
112/// }
113/// \endcode
114///
115/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the
116/// loop index is increased in each iteration of the main and residual loops,
117/// respectively. In most cases, the "unit" will be bytes, but larger units are
118/// useful for lowering memset.pattern.
119///
120/// The computation of \c LoopUnits and \c ResidualUnits is performed at compile
121/// time if \p Len is a \c ConstantInt.
122/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal
123/// to \p MainLoopStep.
124/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and
125/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object.
126///
127/// If provided, \p ExpectedUnits is used as the expected number of units
128/// handled by the loop expansion when computing branch weights.
129static LoopExpansionInfo
131 unsigned MainLoopStep, unsigned ResidualLoopStep,
132 StringRef BBNamePrefix,
133 std::optional<uint64_t> ExpectedUnits) {
134 assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) &&
135 "ResidualLoopStep must divide MainLoopStep if specified");
136 assert(ResidualLoopStep <= MainLoopStep &&
137 "ResidualLoopStep cannot be larger than MainLoopStep");
138 assert(MainLoopStep > 0 && "MainLoopStep must be non-zero");
139 LoopExpansionInfo LEI;
140
141 // If the length is known to be zero, there is nothing to do.
142 if (auto *CLen = dyn_cast<ConstantInt>(Len))
143 if (CLen->isZero())
144 return LEI;
145
146 BasicBlock *PreLoopBB = InsertBefore->getParent();
147 BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(
148 InsertBefore, BBNamePrefix + "-post-expansion");
149 Function *ParentFunc = PreLoopBB->getParent();
150 LLVMContext &Ctx = PreLoopBB->getContext();
151 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
152 IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
153 PreLoopBuilder.SetCurrentDebugLocation(DbgLoc);
154
155 // Calculate the main loop trip count and remaining units to cover after the
156 // loop.
157 Type *LenType = Len->getType();
158 IntegerType *ILenType = cast<IntegerType>(LenType);
159 ConstantInt *CIMainLoopStep = ConstantInt::get(ILenType, MainLoopStep);
160 ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
161
162 // We can avoid conditional branches and/or entire loops if we know any of the
163 // following:
164 // - that the main loop must be executed at least once
165 // - that the main loop will not be executed at all
166 // - that the residual loop must be executed at least once
167 // - that the residual loop will not be executed at all
168 bool MustTakeMainLoop = false;
169 bool MayTakeMainLoop = true;
170 bool MustTakeResidualLoop = false;
171 bool MayTakeResidualLoop = true;
172
173 Value *LoopUnits = Len;
174 Value *ResidualUnits = nullptr;
175 if (MainLoopStep != 1) {
176 if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
177 uint64_t TotalUnits = CLen->getZExtValue();
178 uint64_t LoopEndCount = alignDown(TotalUnits, MainLoopStep);
179 uint64_t ResidualCount = TotalUnits - LoopEndCount;
180 LoopUnits = ConstantInt::get(LenType, LoopEndCount);
181 ResidualUnits = ConstantInt::get(LenType, ResidualCount);
182 MustTakeMainLoop = LoopEndCount > 0;
183 MayTakeMainLoop = MustTakeMainLoop;
184 MustTakeResidualLoop = ResidualCount > 0;
185 MayTakeResidualLoop = MustTakeResidualLoop;
186 // TODO: This could also use known bits to check if a non-constant loop
187 // count is guaranteed to be a multiple of MainLoopStep, in which case we
188 // could omit the residual loop. It's unclear if that is worthwhile.
189 } else {
190 ResidualUnits = getRuntimeLoopRemainder(PreLoopBuilder, Len,
191 CIMainLoopStep, MainLoopStep);
192 LoopUnits = getRuntimeLoopUnits(PreLoopBuilder, Len, CIMainLoopStep,
193 MainLoopStep, ResidualUnits);
194 }
195 } else if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
196 MustTakeMainLoop = CLen->getZExtValue() > 0;
197 MayTakeMainLoop = MustTakeMainLoop;
198 }
199
200 // The case where both loops are omitted (i.e., the length is known zero) is
201 // already handled at the beginning of this function.
202 assert((MayTakeMainLoop || MayTakeResidualLoop) &&
203 "At least one of the loops must be generated");
204
205 BasicBlock *MainLoopBB = nullptr;
206 CondBrInst *MainLoopBr = nullptr;
207
208 // Construct the main loop unless we statically known that it is not taken.
209 if (MayTakeMainLoop) {
210 MainLoopBB = BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-main-body",
211 ParentFunc, PostLoopBB);
212 IRBuilder<> LoopBuilder(MainLoopBB);
213 LoopBuilder.SetCurrentDebugLocation(DbgLoc);
214
215 PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index");
216 LEI.MainLoopIndex = LoopIndex;
217 LoopIndex->addIncoming(ConstantInt::get(LenType, 0U), PreLoopBB);
218
219 Value *NewIndex = LoopBuilder.CreateAdd(
220 LoopIndex, ConstantInt::get(LenType, MainLoopStep));
221 LoopIndex->addIncoming(NewIndex, MainLoopBB);
222
223 // One argument of the addition is a loop-variant PHI, so it must be an
224 // Instruction (i.e., it cannot be a Constant).
225 LEI.MainLoopIP = cast<Instruction>(NewIndex);
226
227 // Stay in the MainLoop until we have handled all the LoopUnits. The False
228 // target is adjusted below if a residual is generated.
229 MainLoopBr = LoopBuilder.CreateCondBr(
230 LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), MainLoopBB, PostLoopBB);
231
232 if (ExpectedUnits.has_value()) {
233 uint64_t BackedgeTakenCount = ExpectedUnits.value() / MainLoopStep;
234 if (BackedgeTakenCount > 0)
235 BackedgeTakenCount -= 1; // The last iteration goes to the False target.
236 MDBuilder MDB(ParentFunc->getContext());
237 setFittedBranchWeights(*MainLoopBr, {BackedgeTakenCount, 1},
238 /*IsExpected=*/false);
239 } else {
241 }
242 }
243
244 // Construct the residual loop if it is requested from the caller unless we
245 // statically know that it won't be taken.
246 bool ResidualLoopRequested =
247 ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep;
248 BasicBlock *ResidualLoopBB = nullptr;
249 BasicBlock *ResidualCondBB = nullptr;
250 if (ResidualLoopRequested && MayTakeResidualLoop) {
251 ResidualLoopBB =
252 BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-body",
253 PreLoopBB->getParent(), PostLoopBB);
254
255 // The residual loop body is either reached from the ResidualCondBB (which
256 // checks if the residual loop needs to be executed), from the main loop
257 // body if we know statically that the residual must be executed, or from
258 // the pre-loop BB (conditionally or unconditionally) if the main loop is
259 // omitted.
260 BasicBlock *PredOfResLoopBody = PreLoopBB;
261 if (MainLoopBB) {
262 // If it's statically known that the residual must be executed, we don't
263 // need to create a preheader BB.
264 if (MustTakeResidualLoop) {
265 MainLoopBr->setSuccessor(1, ResidualLoopBB);
266 PredOfResLoopBody = MainLoopBB;
267 } else {
268 // Construct a preheader BB to check if the residual loop is executed.
269 ResidualCondBB =
270 BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-cond",
271 PreLoopBB->getParent(), ResidualLoopBB);
272
273 // Determine if we need to branch to the residual loop or bypass it.
274 IRBuilder<> RCBuilder(ResidualCondBB);
275 RCBuilder.SetCurrentDebugLocation(DbgLoc);
276 auto *BR =
277 RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero),
278 ResidualLoopBB, PostLoopBB);
279 if (ExpectedUnits.has_value()) {
280 MDBuilder MDB(ParentFunc->getContext());
281 BR->setMetadata(LLVMContext::MD_prof,
283 } else {
285 }
286
287 MainLoopBr->setSuccessor(1, ResidualCondBB);
288 PredOfResLoopBody = ResidualCondBB;
289 }
290 }
291
292 IRBuilder<> ResBuilder(ResidualLoopBB);
293 ResBuilder.SetCurrentDebugLocation(DbgLoc);
294 PHINode *ResidualIndex =
295 ResBuilder.CreatePHI(LenType, 2, "residual-loop-index");
296 ResidualIndex->addIncoming(Zero, PredOfResLoopBody);
297
298 // Add the offset at the end of the main loop to the loop counter of the
299 // residual loop to get the proper index. If the main loop was omitted, we
300 // can also omit the addition.
301 if (MainLoopBB)
302 LEI.ResidualLoopIndex = ResBuilder.CreateAdd(LoopUnits, ResidualIndex);
303 else
304 LEI.ResidualLoopIndex = ResidualIndex;
305
306 Value *ResNewIndex = ResBuilder.CreateAdd(
307 ResidualIndex, ConstantInt::get(LenType, ResidualLoopStep));
308 ResidualIndex->addIncoming(ResNewIndex, ResidualLoopBB);
309
310 // One argument of the addition is a loop-variant PHI, so it must be an
311 // Instruction (i.e., it cannot be a Constant).
312 LEI.ResidualLoopIP = cast<Instruction>(ResNewIndex);
313
314 // Stay in the residual loop until all ResidualUnits are handled.
315 CondBrInst *BR = ResBuilder.CreateCondBr(
316 ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResidualLoopBB,
317 PostLoopBB);
318
319 if (ExpectedUnits.has_value()) {
320 uint64_t BackedgeTakenCount =
321 (ExpectedUnits.value() % MainLoopStep) / ResidualLoopStep;
322 if (BackedgeTakenCount > 0)
323 BackedgeTakenCount -= 1; // The last iteration goes to the False target.
324 MDBuilder MDB(ParentFunc->getContext());
325 setFittedBranchWeights(*BR, {BackedgeTakenCount, 1},
326 /*IsExpected=*/false);
327 } else {
329 }
330 }
331
332 // Create the branch in the pre-loop block.
333 if (MustTakeMainLoop) {
334 // Go unconditionally to the main loop if it's statically known that it must
335 // be executed.
336 assert(MainLoopBB);
337 PreLoopBuilder.CreateBr(MainLoopBB);
338 } else if (!MainLoopBB && ResidualLoopBB) {
339 if (MustTakeResidualLoop) {
340 // If the main loop is omitted and the residual loop is statically known
341 // to be executed, go there unconditionally.
342 PreLoopBuilder.CreateBr(ResidualLoopBB);
343 } else {
344 // If the main loop is omitted and we don't know if the residual loop is
345 // executed, go there if necessary. The PreLoopBB takes the role of the
346 // preheader for the residual loop in this case.
347 auto *BR = PreLoopBuilder.CreateCondBr(
348 PreLoopBuilder.CreateICmpNE(ResidualUnits, Zero), ResidualLoopBB,
349 PostLoopBB);
350 if (ExpectedUnits.has_value()) {
351 MDBuilder MDB(ParentFunc->getContext());
352 BR->setMetadata(LLVMContext::MD_prof, MDB.createLikelyBranchWeights());
353 } else {
355 }
356 }
357 } else {
358 // Otherwise, go conditionally to the main loop or its successor.
359 // If there is no residual loop, the successor is the post-loop BB.
360 BasicBlock *FalseBB = PostLoopBB;
361 if (ResidualCondBB) {
362 // If we constructed a pre-header for the residual loop, that is the
363 // successor.
364 FalseBB = ResidualCondBB;
365 } else if (ResidualLoopBB) {
366 // If there is a residual loop but the preheader is omitted (because the
367 // residual loop is statically known to be executed), the successor
368 // is the residual loop body.
369 assert(MustTakeResidualLoop);
370 FalseBB = ResidualLoopBB;
371 }
372
373 auto *BR = PreLoopBuilder.CreateCondBr(
374 PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), MainLoopBB, FalseBB);
375
376 if (ExpectedUnits.has_value()) {
377 MDBuilder MDB(ParentFunc->getContext());
378 BR->setMetadata(LLVMContext::MD_prof, MDB.createLikelyBranchWeights());
379 } else {
381 }
382 }
383 // Delete the unconditional branch inserted by splitBasicBlock.
384 PreLoopBB->getTerminator()->eraseFromParent();
385
386 return LEI;
387}
388
390 Value *DstAddr, ConstantInt *CopyLen,
391 Align SrcAlign, Align DstAlign,
392 bool SrcIsVolatile, bool DstIsVolatile,
393 bool CanOverlap,
395 std::optional<uint32_t> AtomicElementSize,
396 std::optional<uint64_t> AverageTripCount) {
397 // No need to expand zero length copies.
398 if (CopyLen->isZero())
399 return;
400
401 BasicBlock *PreLoopBB = InsertBefore->getParent();
402 Function *ParentFunc = PreLoopBB->getParent();
403 LLVMContext &Ctx = PreLoopBB->getContext();
404 const DataLayout &DL = ParentFunc->getDataLayout();
405 MDBuilder MDB(Ctx);
406 MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
407 StringRef Name = "MemCopyAliasScope";
408 MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
409
410 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
411 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
412
413 Type *TypeOfCopyLen = CopyLen->getType();
414 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
415 Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
416 assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
417 "Atomic memcpy lowering is not supported for vector operand type");
418
419 Type *Int8Type = Type::getInt8Ty(Ctx);
420 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
421 assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
422 assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
423 "Atomic memcpy lowering is not supported for selected operand size");
424
425 uint64_t LoopEndCount =
426 alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());
427
428 // Skip the loop expansion entirely if the loop would never be taken.
429 if (LoopEndCount != 0) {
430 LoopExpansionInfo LEI =
431 insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, 0,
432 "static-memcpy", AverageTripCount);
433 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
434 "Main loop should be generated for non-zero loop count");
435
436 // Fill MainLoopBB
437 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
438 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
439 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
440
441 // If we used LoopOpType as GEP element type, we would iterate over the
442 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
443 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
444 // byte offsets computed from the TypeStoreSize.
445 Value *SrcGEP =
446 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
447 LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
448 LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
449 if (!CanOverlap) {
450 // Set alias scope for loads.
451 Load->setMetadata(LLVMContext::MD_alias_scope,
452 MDNode::get(Ctx, NewScope));
453 }
454 Value *DstGEP =
455 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
456 StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
457 Load, DstGEP, PartDstAlign, DstIsVolatile);
458 if (!CanOverlap) {
459 // Indicate that stores don't overlap loads.
460 Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
461 }
462 if (AtomicElementSize) {
463 Load->setAtomic(AtomicOrdering::Unordered);
464 Store->setAtomic(AtomicOrdering::Unordered);
465 }
466 assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
467 "No residual loop was requested");
468 }
469
470 // Copy the remaining bytes with straight-line code.
471 uint64_t BytesCopied = LoopEndCount;
472 uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
473 if (RemainingBytes == 0)
474 return;
475
476 IRBuilder<> RBuilder(InsertBefore);
477 SmallVector<Type *, 5> RemainingOps;
478 TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
479 SrcAS, DstAS, SrcAlign, DstAlign,
480 AtomicElementSize);
481
482 for (auto *OpTy : RemainingOps) {
483 Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
484 Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
485
486 TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
487 assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
488 "Atomic memcpy lowering is not supported for selected operand size");
489
490 Value *SrcGEP = RBuilder.CreateInBoundsGEP(
491 Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
492 LoadInst *Load =
493 RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
494 if (!CanOverlap) {
495 // Set alias scope for loads.
496 Load->setMetadata(LLVMContext::MD_alias_scope,
497 MDNode::get(Ctx, NewScope));
498 }
499 Value *DstGEP = RBuilder.CreateInBoundsGEP(
500 Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
501 StoreInst *Store =
502 RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
503 if (!CanOverlap) {
504 // Indicate that stores don't overlap loads.
505 Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
506 }
507 if (AtomicElementSize) {
508 Load->setAtomic(AtomicOrdering::Unordered);
509 Store->setAtomic(AtomicOrdering::Unordered);
510 }
511 BytesCopied += OperandSize;
512 }
513 assert(BytesCopied == CopyLen->getZExtValue() &&
514 "Bytes copied should match size in the call!");
515}
516
518 Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
519 Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
520 bool CanOverlap, const TargetTransformInfo &TTI,
521 std::optional<uint32_t> AtomicElementSize,
522 std::optional<uint64_t> AverageTripCount) {
523 BasicBlock *PreLoopBB = InsertBefore->getParent();
524 Function *ParentFunc = PreLoopBB->getParent();
525 const DataLayout &DL = ParentFunc->getDataLayout();
526 LLVMContext &Ctx = PreLoopBB->getContext();
527 MDBuilder MDB(Ctx);
528 MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
529 StringRef Name = "MemCopyAliasScope";
530 MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
531
532 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
533 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
534
535 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
536 Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
537 assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
538 "Atomic memcpy lowering is not supported for vector operand type");
539 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
540 assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
541 "Atomic memcpy lowering is not supported for selected operand size");
542
543 Type *Int8Type = Type::getInt8Ty(Ctx);
544
545 Type *ResidualLoopOpType = AtomicElementSize
546 ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
547 : Int8Type;
548 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
549 assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
550 "Store size is expected to match type size");
551
552 LoopExpansionInfo LEI =
553 insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize,
554 "dynamic-memcpy", AverageTripCount);
555 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
556 "Main loop should be generated for unknown size copy");
557
558 // Fill MainLoopBB
559 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
560 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
561 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
562
563 // If we used LoopOpType as GEP element type, we would iterate over the
564 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
565 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
566 // offsets computed from the TypeStoreSize.
567 Value *SrcGEP =
568 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
569 LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
570 LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
571 if (!CanOverlap) {
572 // Set alias scope for loads.
573 Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
574 }
575 Value *DstGEP =
576 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
577 StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
578 Load, DstGEP, PartDstAlign, DstIsVolatile);
579 if (!CanOverlap) {
580 // Indicate that stores don't overlap loads.
581 Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
582 }
583 if (AtomicElementSize) {
586 }
587
588 // Fill ResidualLoopBB.
589 if (!LEI.ResidualLoopIP)
590 return;
591
592 Align ResSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
593 Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
594
595 IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
596 Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
597 LEI.ResidualLoopIndex);
598 LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad(
599 ResidualLoopOpType, ResSrcGEP, ResSrcAlign, SrcIsVolatile);
600 if (!CanOverlap) {
601 // Set alias scope for loads.
602 ResLoad->setMetadata(LLVMContext::MD_alias_scope,
603 MDNode::get(Ctx, NewScope));
604 }
605 Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
606 LEI.ResidualLoopIndex);
607 StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore(
608 ResLoad, ResDstGEP, ResDstAlign, DstIsVolatile);
609 if (!CanOverlap) {
610 // Indicate that stores don't overlap loads.
611 ResStore->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
612 }
613 if (AtomicElementSize) {
616 }
617}
618
619// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
620// addresspacecast to obtain a pair of pointers in the same addressspace. The
621// caller needs to ensure that addrspacecasting is possible.
622// No-op if the pointers are in the same address space.
623static std::pair<Value *, Value *>
625 const TargetTransformInfo &TTI) {
626 Value *ResAddr1 = Addr1;
627 Value *ResAddr2 = Addr2;
628
629 unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
630 unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
631 if (AS1 != AS2) {
632 if (TTI.isValidAddrSpaceCast(AS2, AS1))
633 ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
634 else if (TTI.isValidAddrSpaceCast(AS1, AS2))
635 ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
636 else
637 llvm_unreachable("Can only lower memmove between address spaces if they "
638 "support addrspacecast");
639 }
640 return {ResAddr1, ResAddr2};
641}
642
643// Lower memmove to IR. memmove is required to correctly copy overlapping memory
644// regions; therefore, it has to check the relative positions of the source and
645// destination pointers and choose the copy direction accordingly.
646//
647// The code below is an IR rendition of this C function:
648//
649// void* memmove(void* dst, const void* src, size_t n) {
650// unsigned char* d = dst;
651// const unsigned char* s = src;
652// if (s < d) {
653// // copy backwards
654// while (n--) {
655// d[n] = s[n];
656// }
657// } else {
658// // copy forward
659// for (size_t i = 0; i < n; ++i) {
660// d[i] = s[i];
661// }
662// }
663// return dst;
664// }
665//
666// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
667// used for the memory accesses in the loops. Then, additional loops with
668// byte-wise accesses are added for the remaining bytes.
670 Value *SrcAddr, Value *DstAddr,
671 Value *CopyLen, Align SrcAlign,
672 Align DstAlign, bool SrcIsVolatile,
673 bool DstIsVolatile,
674 const TargetTransformInfo &TTI) {
675 Type *TypeOfCopyLen = CopyLen->getType();
676 BasicBlock *OrigBB = InsertBefore->getParent();
677 Function *F = OrigBB->getParent();
678 const DataLayout &DL = F->getDataLayout();
679 LLVMContext &Ctx = OrigBB->getContext();
680 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
681 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
682
683 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
684 SrcAlign, DstAlign);
685 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
686 Type *Int8Type = Type::getInt8Ty(Ctx);
687 bool LoopOpIsInt8 = LoopOpType == Int8Type;
688
689 // If the memory accesses are wider than one byte, residual loops with
690 // i8-accesses are required to move remaining bytes.
691 bool RequiresResidual = !LoopOpIsInt8;
692
693 Type *ResidualLoopOpType = Int8Type;
694 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
695
696 // Calculate the loop trip count and remaining bytes to copy after the loop.
697 IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
698 ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
699 ConstantInt *CIResidualLoopOpSize =
700 ConstantInt::get(ILengthType, ResidualLoopOpSize);
701 ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
702
703 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
704 IRBuilder<> PLBuilder(InsertBefore);
705 PLBuilder.SetCurrentDebugLocation(DbgLoc);
706
707 Value *RuntimeLoopBytes = CopyLen;
708 Value *RuntimeLoopRemainder = nullptr;
709 Value *SkipResidualCondition = nullptr;
710 if (RequiresResidual) {
711 RuntimeLoopRemainder =
712 getRuntimeLoopRemainder(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
713 RuntimeLoopBytes = getRuntimeLoopUnits(PLBuilder, CopyLen, CILoopOpSize,
714 LoopOpSize, RuntimeLoopRemainder);
715 SkipResidualCondition =
716 PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
717 }
718 Value *SkipMainCondition =
719 PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");
720
721 // Create the a comparison of src and dst, based on which we jump to either
722 // the forward-copy part of the function (if src >= dst) or the backwards-copy
723 // part (if src < dst).
724 // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
725 // structure. Its block terminators (unconditional branches) are replaced by
726 // the appropriate conditional branches when the loop is built.
727 // If the pointers are in different address spaces, they need to be converted
728 // to a compatible one. Cases where memory ranges in the different address
729 // spaces cannot overlap are lowered as memcpy and not handled here.
730 auto [CmpSrcAddr, CmpDstAddr] =
731 tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
732 Value *PtrCompare =
733 PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
734 Instruction *ThenTerm, *ElseTerm;
735 SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
736 &ThenTerm, &ElseTerm);
737
738 // If the LoopOpSize is greater than 1, each part of the function consists of
739 // four blocks:
740 // memmove_copy_backwards:
741 // skip the residual loop when 0 iterations are required
742 // memmove_bwd_residual_loop:
743 // copy the last few bytes individually so that the remaining length is
744 // a multiple of the LoopOpSize
745 // memmove_bwd_middle: skip the main loop when 0 iterations are required
746 // memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
747 // memmove_copy_forward: skip the main loop when 0 iterations are required
748 // memmove_fwd_main_loop: the actual forward loop BB with wide accesses
749 // memmove_fwd_middle: skip the residual loop when 0 iterations are required
750 // memmove_fwd_residual_loop: copy the last few bytes individually
751 //
752 // The main and residual loop are switched between copying forward and
753 // backward so that the residual loop always operates on the end of the moved
754 // range. This is based on the assumption that buffers whose start is aligned
755 // with the LoopOpSize are more common than buffers whose end is.
756 //
757 // If the LoopOpSize is 1, each part of the function consists of two blocks:
758 // memmove_copy_backwards: skip the loop when 0 iterations are required
759 // memmove_bwd_main_loop: the actual backwards loop BB
760 // memmove_copy_forward: skip the loop when 0 iterations are required
761 // memmove_fwd_main_loop: the actual forward loop BB
762 BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
763 CopyBackwardsBB->setName("memmove_copy_backwards");
764 BasicBlock *CopyForwardBB = ElseTerm->getParent();
765 CopyForwardBB->setName("memmove_copy_forward");
766 BasicBlock *ExitBB = InsertBefore->getParent();
767 ExitBB->setName("memmove_done");
768
769 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
770 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
771
772 // Accesses in the residual loops do not share the same alignment as those in
773 // the main loops.
774 Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
775 Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
776
777 // Copying backwards.
778 {
779 BasicBlock *MainLoopBB = BasicBlock::Create(
780 F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);
781
782 // The predecessor of the memmove_bwd_main_loop. Updated in the
783 // following if a residual loop is emitted first.
784 BasicBlock *PredBB = CopyBackwardsBB;
785
786 if (RequiresResidual) {
787 // backwards residual loop
788 BasicBlock *ResidualLoopBB = BasicBlock::Create(
789 F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
790 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
791 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
792 PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
793 Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
794 ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
795 // If we used LoopOpType as GEP element type, we would iterate over the
796 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
797 // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
798 // use byte offsets computed from the TypeStoreSize.
799 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
800 ResidualIndex);
801 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
802 ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
803 "element");
804 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
805 ResidualIndex);
806 ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
807 ResidualDstAlign, DstIsVolatile);
808
809 // After the residual loop, go to an intermediate block.
810 BasicBlock *IntermediateBB = BasicBlock::Create(
811 F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
812 // Later code expects a terminator in the PredBB.
813 IRBuilder<> IntermediateBuilder(IntermediateBB);
814 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
815 IntermediateBuilder.CreateUnreachable();
816 ResidualLoopBuilder.CreateCondBr(
817 ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
818 IntermediateBB, ResidualLoopBB);
819
820 ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
821 ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
822
823 // How to get to the residual:
824 CondBrInst *BrInst =
825 CondBrInst::Create(SkipResidualCondition, IntermediateBB,
826 ResidualLoopBB, ThenTerm->getIterator());
827 BrInst->setDebugLoc(DbgLoc);
828 ThenTerm->eraseFromParent();
829
830 PredBB = IntermediateBB;
831 }
832
833 // main loop
834 IRBuilder<> MainLoopBuilder(MainLoopBB);
835 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
836 PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
837 Value *MainIndex =
838 MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
839 Value *LoadGEP =
840 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
841 Value *Element = MainLoopBuilder.CreateAlignedLoad(
842 LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
843 Value *StoreGEP =
844 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
845 MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
846 DstIsVolatile);
847 MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
848 ExitBB, MainLoopBB);
849 MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
850 MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);
851
852 // How to get to the main loop:
853 Instruction *PredBBTerm = PredBB->getTerminator();
855 SkipMainCondition, ExitBB, MainLoopBB, PredBBTerm->getIterator());
856 BrInst->setDebugLoc(DbgLoc);
857 PredBBTerm->eraseFromParent();
858 }
859
860 // Copying forward.
861 // main loop
862 {
863 BasicBlock *MainLoopBB =
864 BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
865 IRBuilder<> MainLoopBuilder(MainLoopBB);
866 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
867 PHINode *MainLoopPhi =
868 MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
869 Value *LoadGEP =
870 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
871 Value *Element = MainLoopBuilder.CreateAlignedLoad(
872 LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
873 Value *StoreGEP =
874 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
875 MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
876 DstIsVolatile);
877 Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
878 MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
879 MainLoopPhi->addIncoming(Zero, CopyForwardBB);
880
881 Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
882 BasicBlock *SuccessorBB = ExitBB;
883 if (RequiresResidual)
884 SuccessorBB =
885 BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);
886
887 // leaving or staying in the main loop
888 MainLoopBuilder.CreateCondBr(
889 MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopBytes), SuccessorBB,
890 MainLoopBB);
891
892 // getting in or skipping the main loop
893 CondBrInst *BrInst =
894 CondBrInst::Create(SkipMainCondition, SuccessorBB, MainLoopBB,
895 CopyFwdBBTerm->getIterator());
896 BrInst->setDebugLoc(DbgLoc);
897 CopyFwdBBTerm->eraseFromParent();
898
899 if (RequiresResidual) {
900 BasicBlock *IntermediateBB = SuccessorBB;
901 IRBuilder<> IntermediateBuilder(IntermediateBB);
902 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
903 BasicBlock *ResidualLoopBB = BasicBlock::Create(
904 F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
905 IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
906 ResidualLoopBB);
907
908 // Residual loop
909 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
910 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
911 PHINode *ResidualLoopPhi =
912 ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
913 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
914 ResidualLoopPhi);
915 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
916 ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
917 "element");
918 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
919 ResidualLoopPhi);
920 ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
921 ResidualDstAlign, DstIsVolatile);
922 Value *ResidualIndex =
923 ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, CIResidualLoopOpSize);
924 ResidualLoopBuilder.CreateCondBr(
925 ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
926 ResidualLoopBB);
927 ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
928 ResidualLoopPhi->addIncoming(RuntimeLoopBytes, IntermediateBB);
929 }
930 }
931}
932
933// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
934// compile time, obsolete loops and branches are omitted, and the residual code
935// is straight-line code instead of a loop.
936static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
937 Value *SrcAddr, Value *DstAddr,
938 ConstantInt *CopyLen, Align SrcAlign,
939 Align DstAlign, bool SrcIsVolatile,
940 bool DstIsVolatile,
941 const TargetTransformInfo &TTI) {
942 // No need to expand zero length moves.
943 if (CopyLen->isZero())
944 return;
945
946 Type *TypeOfCopyLen = CopyLen->getType();
947 BasicBlock *OrigBB = InsertBefore->getParent();
948 Function *F = OrigBB->getParent();
949 const DataLayout &DL = F->getDataLayout();
950 LLVMContext &Ctx = OrigBB->getContext();
951 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
952 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
953
954 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
955 SrcAlign, DstAlign);
956 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
957 assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
958 Type *Int8Type = Type::getInt8Ty(Ctx);
959
960 // Calculate the loop trip count and remaining bytes to copy after the loop.
961 uint64_t BytesCopiedInLoop =
962 alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());
963 uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
964
965 IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
966 ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
967 ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
968 ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
969
970 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
971 IRBuilder<> PLBuilder(InsertBefore);
972 PLBuilder.SetCurrentDebugLocation(DbgLoc);
973
974 auto [CmpSrcAddr, CmpDstAddr] =
975 tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
976 Value *PtrCompare =
977 PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
978 Instruction *ThenTerm, *ElseTerm;
979 SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
980 &ThenTerm, &ElseTerm);
981
982 BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
983 BasicBlock *CopyForwardBB = ElseTerm->getParent();
984 BasicBlock *ExitBB = InsertBefore->getParent();
985 ExitBB->setName("memmove_done");
986
987 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
988 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
989
990 // Helper function to generate a load/store pair of a given type in the
991 // residual. Used in the forward and backward branches.
992 auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
993 uint64_t &BytesCopied) {
994 Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
995 Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
996
997 TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
998
999 // If we used LoopOpType as GEP element type, we would iterate over the
1000 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
1001 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
1002 // byte offsets computed from the TypeStoreSize.
1003 Value *SrcGEP = Builder.CreateInBoundsGEP(
1004 Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
1005 LoadInst *Load =
1006 Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
1007 Value *DstGEP = Builder.CreateInBoundsGEP(
1008 Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
1009 Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
1010 BytesCopied += OperandSize;
1011 };
1012
1013 // Copying backwards.
1014 if (RemainingBytes != 0) {
1015 CopyBackwardsBB->setName("memmove_bwd_residual");
1016 uint64_t BytesCopied = BytesCopiedInLoop;
1017
1018 // Residual code is required to move the remaining bytes. We need the same
1019 // instructions as in the forward case, only in reverse. So we generate code
1020 // the same way, except that we change the IRBuilder insert point for each
1021 // load/store pair so that each one is inserted before the previous one
1022 // instead of after it.
1023 IRBuilder<> BwdResBuilder(CopyBackwardsBB,
1024 CopyBackwardsBB->getFirstNonPHIIt());
1025 BwdResBuilder.SetCurrentDebugLocation(DbgLoc);
1026 SmallVector<Type *, 5> RemainingOps;
1027 TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
1028 SrcAS, DstAS, PartSrcAlign,
1029 PartDstAlign);
1030 for (auto *OpTy : RemainingOps) {
1031 // reverse the order of the emitted operations
1032 BwdResBuilder.SetInsertPoint(CopyBackwardsBB,
1033 CopyBackwardsBB->getFirstNonPHIIt());
1034 GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
1035 }
1036 }
1037 if (BytesCopiedInLoop != 0) {
1038 BasicBlock *LoopBB = CopyBackwardsBB;
1039 BasicBlock *PredBB = OrigBB;
1040 if (RemainingBytes != 0) {
1041 // if we introduce residual code, it needs its separate BB
1042 LoopBB = CopyBackwardsBB->splitBasicBlock(
1043 CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
1044 PredBB = CopyBackwardsBB;
1045 } else {
1046 CopyBackwardsBB->setName("memmove_bwd_loop");
1047 }
1048 IRBuilder<> LoopBuilder(LoopBB->getTerminator());
1049 LoopBuilder.SetCurrentDebugLocation(DbgLoc);
1050 PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
1051 Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
1052 Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
1053 Value *Element = LoopBuilder.CreateAlignedLoad(
1054 LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
1055 Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, Index);
1056 LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
1057 DstIsVolatile);
1058
1059 // Replace the unconditional branch introduced by
1060 // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
1061 Instruction *UncondTerm = LoopBB->getTerminator();
1062 LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
1063 LoopBB);
1064 UncondTerm->eraseFromParent();
1065
1066 LoopPhi->addIncoming(Index, LoopBB);
1067 LoopPhi->addIncoming(LoopBound, PredBB);
1068 }
1069
1070 // Copying forward.
1071 BasicBlock *FwdResidualBB = CopyForwardBB;
1072 if (BytesCopiedInLoop != 0) {
1073 CopyForwardBB->setName("memmove_fwd_loop");
1074 BasicBlock *LoopBB = CopyForwardBB;
1075 BasicBlock *SuccBB = ExitBB;
1076 if (RemainingBytes != 0) {
1077 // if we introduce residual code, it needs its separate BB
1078 SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
1079 "memmove_fwd_residual");
1080 FwdResidualBB = SuccBB;
1081 }
1082 IRBuilder<> LoopBuilder(LoopBB->getTerminator());
1083 LoopBuilder.SetCurrentDebugLocation(DbgLoc);
1084 PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
1085 Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
1086 Value *Element = LoopBuilder.CreateAlignedLoad(
1087 LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
1088 Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopPhi);
1089 LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
1090 DstIsVolatile);
1091 Value *Index = LoopBuilder.CreateAdd(LoopPhi, CILoopOpSize);
1092 LoopPhi->addIncoming(Index, LoopBB);
1093 LoopPhi->addIncoming(Zero, OrigBB);
1094
1095 // Replace the unconditional branch to turn LoopBB into a loop.
1096 Instruction *UncondTerm = LoopBB->getTerminator();
1097 LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, LoopBound), SuccBB,
1098 LoopBB);
1099 UncondTerm->eraseFromParent();
1100 }
1101
1102 if (RemainingBytes != 0) {
1103 uint64_t BytesCopied = BytesCopiedInLoop;
1104
1105 // Residual code is required to move the remaining bytes. In the forward
1106 // case, we emit it in the normal order.
1107 IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
1108 FwdResBuilder.SetCurrentDebugLocation(DbgLoc);
1109 SmallVector<Type *, 5> RemainingOps;
1110 TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
1111 SrcAS, DstAS, PartSrcAlign,
1112 PartDstAlign);
1113 for (auto *OpTy : RemainingOps)
1114 GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
1115 }
1116}
1117
1118/// Create a Value of \p DstType that consists of a sequence of copies of
1119/// \p SetValue, using bitcasts and a vector splat.
1121 Value *SetValue, Type *DstType) {
1122 TypeSize DstSize = DL.getTypeStoreSize(DstType);
1123 Type *SetValueType = SetValue->getType();
1124 TypeSize SetValueSize = DL.getTypeStoreSize(SetValueType);
1125 assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
1126 "Store size and alloc size of SetValue's type must match");
1127 assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
1128 "DstType size must be a multiple of SetValue size");
1129
1130 Value *Result = SetValue;
1131 if (DstSize != SetValueSize) {
1132 if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
1133 // If the type cannot be put into a vector, bitcast to iN first.
1134 LLVMContext &Ctx = SetValue->getContext();
1135 Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8),
1136 "setvalue.toint");
1137 }
1138 // Form a sufficiently large vector consisting of SetValue, repeated.
1139 Result =
1140 B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat");
1141 }
1142
1143 // The value has the right size, but we might have to bitcast it to the right
1144 // type.
1145 Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast");
1146 return Result;
1147}
1148
1149static void
1151 ConstantInt *Len, Value *SetValue, Align DstAlign,
1152 bool IsVolatile, const TargetTransformInfo *TTI,
1153 std::optional<uint64_t> AverageTripCount) {
1154 // No need to expand zero length memsets.
1155 if (Len->isZero())
1156 return;
1157
1158 BasicBlock *PreLoopBB = InsertBefore->getParent();
1159 Function *ParentFunc = PreLoopBB->getParent();
1160 const DataLayout &DL = ParentFunc->getDataLayout();
1161 LLVMContext &Ctx = PreLoopBB->getContext();
1162
1163 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
1164
1165 Type *TypeOfLen = Len->getType();
1166 Type *Int8Type = Type::getInt8Ty(Ctx);
1167 assert(SetValue->getType() == Int8Type && "Can only set bytes");
1168
1169 Type *LoopOpType = Int8Type;
1170 if (TTI) {
1171 // Use the same memory access type as for a memcpy with the same Dst and Src
1172 // alignment and address space.
1173 LoopOpType = TTI->getMemcpyLoopLoweringType(
1174 Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
1175 }
1176 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
1177 assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
1178
1179 uint64_t LoopEndCount =
1180 alignDown(Len->getZExtValue(), LoopOpSize.getFixedValue());
1181
1182 if (LoopEndCount != 0) {
1183 Value *SplatSetValue = nullptr;
1184 {
1185 IRBuilder<> PreLoopBuilder(InsertBefore);
1186 SplatSetValue =
1187 createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
1188 }
1189
1190 // Don't generate a residual loop, the remaining bytes are set with
1191 // straight-line code.
1192 LoopExpansionInfo LEI = insertLoopExpansion(
1193 InsertBefore, Len, LoopOpSize, 0, "static-memset", AverageTripCount);
1194 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
1195 "Main loop should be generated for non-zero loop count");
1196
1197 // Fill MainLoopBB
1198 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
1199 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
1200
1201 Value *DstGEP =
1202 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
1203
1204 MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
1205 IsVolatile);
1206
1207 assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
1208 "No residual loop was requested");
1209 }
1210
1211 uint64_t BytesSet = LoopEndCount;
1212 uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
1213 if (RemainingBytes == 0)
1214 return;
1215
1216 IRBuilder<> RBuilder(InsertBefore);
1217
1218 assert(TTI && "there cannot be a residual loop without TTI");
1219 SmallVector<Type *, 5> RemainingOps;
1220 TTI->getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
1221 DstAS, DstAS, DstAlign, DstAlign,
1222 std::nullopt);
1223
1224 Type *PreviousOpTy = nullptr;
1225 Value *SplatSetValue = nullptr;
1226 for (auto *OpTy : RemainingOps) {
1227 TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
1228 assert(OperandSize.isFixed() &&
1229 "Operand types cannot be scalable vector types");
1230 Align PartDstAlign(commonAlignment(DstAlign, BytesSet));
1231
1232 // Avoid recomputing the splat SetValue if it's the same as for the last
1233 // iteration.
1234 if (OpTy != PreviousOpTy)
1235 SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);
1236
1237 Value *DstGEP = RBuilder.CreateInBoundsGEP(
1238 Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
1239 RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
1240 IsVolatile);
1241 BytesSet += OperandSize;
1242 PreviousOpTy = OpTy;
1243 }
1244 assert(BytesSet == Len->getZExtValue() &&
1245 "Bytes set should match size in the call!");
1246}
1247
1248static void
1250 Value *Len, Value *SetValue, Align DstAlign,
1251 bool IsVolatile, const TargetTransformInfo *TTI,
1252 std::optional<uint64_t> AverageTripCount) {
1253 BasicBlock *PreLoopBB = InsertBefore->getParent();
1254 Function *ParentFunc = PreLoopBB->getParent();
1255 const DataLayout &DL = ParentFunc->getDataLayout();
1256 LLVMContext &Ctx = PreLoopBB->getContext();
1257
1258 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
1259
1260 Type *Int8Type = Type::getInt8Ty(Ctx);
1261 assert(SetValue->getType() == Int8Type && "Can only set bytes");
1262
1263 Type *LoopOpType = Int8Type;
1264 if (TTI) {
1265 LoopOpType = TTI->getMemcpyLoopLoweringType(
1266 Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
1267 }
1268 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
1269 assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
1270
1271 Type *ResidualLoopOpType = Int8Type;
1272 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
1273
1274 Value *SplatSetValue = SetValue;
1275 {
1276 IRBuilder<> PreLoopBuilder(InsertBefore);
1277 SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
1278 }
1279
1280 LoopExpansionInfo LEI =
1281 insertLoopExpansion(InsertBefore, Len, LoopOpSize, ResidualLoopOpSize,
1282 "dynamic-memset", AverageTripCount);
1283 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
1284 "Main loop should be generated for unknown size memset");
1285
1286 // Fill MainLoopBB
1287 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
1288 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
1289
1290 Value *DstGEP =
1291 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
1292 MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
1293 IsVolatile);
1294
1295 // Fill ResidualLoopBB
1296 if (!LEI.ResidualLoopIP)
1297 return;
1298
1299 Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
1300
1301 IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
1302
1303 Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
1304 LEI.ResidualLoopIndex);
1305 ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
1306 IsVolatile);
1307}
1308
1309static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
1310 Value *Len, Value *SetValue, Align DstAlign,
1311 bool IsVolatile,
1312 const TargetTransformInfo *TTI,
1313 std::optional<uint64_t> AverageTripCount) {
1314 // No need to expand zero length memset.pattern.
1315 if (auto *CLen = dyn_cast<ConstantInt>(Len))
1316 if (CLen->isZero())
1317 return;
1318
1319 BasicBlock *PreLoopBB = InsertBefore->getParent();
1320 Function *ParentFunc = PreLoopBB->getParent();
1321 const DataLayout &DL = ParentFunc->getDataLayout();
1322 LLVMContext &Ctx = PreLoopBB->getContext();
1323
1324 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
1325
1326 Type *PreferredLoopOpType = SetValue->getType();
1327 if (TTI) {
1328 PreferredLoopOpType = TTI->getMemcpyLoopLoweringType(
1329 Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
1330 }
1331 TypeSize PreferredLoopOpStoreSize = DL.getTypeStoreSize(PreferredLoopOpType);
1332 assert(PreferredLoopOpStoreSize.isFixed() &&
1333 "PreferredLoopOpType cannot be a scalable vector type");
1334
1335 TypeSize PreferredLoopOpAllocSize = DL.getTypeAllocSize(PreferredLoopOpType);
1336
1337 Type *OriginalType = SetValue->getType();
1338 TypeSize OriginalTypeStoreSize = DL.getTypeStoreSize(OriginalType);
1339 TypeSize OriginalTypeAllocSize = DL.getTypeAllocSize(OriginalType);
1340
1341 // The semantics of memset.pattern restrict what vectorization we can do: It
1342 // has to behave like a series of stores of the SetValue type at offsets that
1343 // are spaced by the alloc size of the SetValue type. If store and alloc size
1344 // of the SetValue type don't match, the bytes that aren't covered by these
1345 // stores must not be overwritten. We therefore only vectorize memset.pattern
1346 // if the store and alloc sizes of the SetValue are equal and properly divide
1347 // the size of the preferred lowering type (and only if store and alloc size
1348 // for the preferred lowering type are also equal).
1349
1350 unsigned MainLoopStep = 1;
1351 Type *MainLoopType = OriginalType;
1352 TypeSize MainLoopAllocSize = OriginalTypeAllocSize;
1353 unsigned ResidualLoopStep = 0;
1354 Type *ResidualLoopType = nullptr;
1355
1356 if (PreferredLoopOpStoreSize == PreferredLoopOpAllocSize &&
1357 OriginalTypeStoreSize == OriginalTypeAllocSize &&
1358 OriginalTypeStoreSize < PreferredLoopOpStoreSize &&
1359 PreferredLoopOpStoreSize % OriginalTypeStoreSize == 0) {
1360 // Multiple instances of SetValue can be combined to reach the preferred
1361 // loop op size.
1362 MainLoopStep = PreferredLoopOpStoreSize / OriginalTypeStoreSize;
1363 MainLoopType = PreferredLoopOpType;
1364 MainLoopAllocSize = PreferredLoopOpStoreSize;
1365
1366 ResidualLoopStep = 1;
1367 ResidualLoopType = OriginalType;
1368 }
1369
1370 // The step arguments here are in terms of the alloc size of the SetValue, not
1371 // in terms of bytes.
1372 LoopExpansionInfo LEI =
1373 insertLoopExpansion(InsertBefore, Len, MainLoopStep, ResidualLoopStep,
1374 "memset.pattern", AverageTripCount);
1375
1376 Align PartDstAlign(commonAlignment(DstAlign, MainLoopAllocSize));
1377
1378 if (LEI.MainLoopIP) {
1379 // Create the loop-invariant splat value before the loop.
1380 IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
1381 Value *MainLoopSetValue = SetValue;
1382 if (MainLoopType != OriginalType)
1383 MainLoopSetValue =
1384 createMemSetSplat(DL, PreLoopBuilder, SetValue, MainLoopType);
1385
1386 // Fill MainLoopBB
1387 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
1388 Value *DstGEP = MainLoopBuilder.CreateInBoundsGEP(MainLoopType, DstAddr,
1389 LEI.MainLoopIndex);
1390 MainLoopBuilder.CreateAlignedStore(MainLoopSetValue, DstGEP, PartDstAlign,
1391 IsVolatile);
1392 }
1393
1394 if (!LEI.ResidualLoopIP)
1395 return;
1396
1397 // Fill ResidualLoopBB
1398 Align ResDstAlign(
1399 commonAlignment(PartDstAlign, DL.getTypeAllocSize(ResidualLoopType)));
1400
1401 IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
1402 Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(ResidualLoopType, DstAddr,
1403 LEI.ResidualLoopIndex);
1404 ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
1405 IsVolatile);
1406}
1407
1408template <typename T>
1410 if (SE) {
1411 const SCEV *SrcSCEV = SE->getSCEV(Memcpy->getRawSource());
1412 const SCEV *DestSCEV = SE->getSCEV(Memcpy->getRawDest());
1413 if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy))
1414 return false;
1415 }
1416 return true;
1417}
1418
1420 const TargetTransformInfo &TTI,
1421 ScalarEvolution *SE) {
1422 bool CanOverlap = canOverlap(Memcpy, SE);
1423 auto TripCount = getAverageMemOpLoopTripCount(*Memcpy);
1424 if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
1426 /*InsertBefore=*/Memcpy,
1427 /*SrcAddr=*/Memcpy->getRawSource(),
1428 /*DstAddr=*/Memcpy->getRawDest(),
1429 /*CopyLen=*/CI,
1430 /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
1431 /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
1432 /*SrcIsVolatile=*/Memcpy->isVolatile(),
1433 /*DstIsVolatile=*/Memcpy->isVolatile(),
1434 /*CanOverlap=*/CanOverlap,
1435 /*TTI=*/TTI,
1436 /*AtomicElementSize=*/std::nullopt,
1437 /*AverageTripCount=*/TripCount);
1438 } else {
1440 /*InsertBefore=*/Memcpy,
1441 /*SrcAddr=*/Memcpy->getRawSource(),
1442 /*DstAddr=*/Memcpy->getRawDest(),
1443 /*CopyLen=*/Memcpy->getLength(),
1444 /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
1445 /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
1446 /*SrcIsVolatile=*/Memcpy->isVolatile(),
1447 /*DstIsVolatile=*/Memcpy->isVolatile(),
1448 /*CanOverlap=*/CanOverlap,
1449 /*TTI=*/TTI,
1450 /*AtomicElementSize=*/std::nullopt,
1451 /*AverageTripCount=*/TripCount);
1452 }
1453}
1454
1456 const TargetTransformInfo &TTI) {
1457 Value *CopyLen = Memmove->getLength();
1458 Value *SrcAddr = Memmove->getRawSource();
1459 Value *DstAddr = Memmove->getRawDest();
1460 Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
1461 Align DstAlign = Memmove->getDestAlign().valueOrOne();
1462 bool SrcIsVolatile = Memmove->isVolatile();
1463 bool DstIsVolatile = SrcIsVolatile;
1464 IRBuilder<> CastBuilder(Memmove);
1465 CastBuilder.SetCurrentDebugLocation(Memmove->getStableDebugLoc());
1466
1467 unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
1468 unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
1469 if (SrcAS != DstAS) {
1470 if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
1471 // We may not be able to emit a pointer comparison, but we don't have
1472 // to. Expand as memcpy.
1473 auto AverageTripCount = getAverageMemOpLoopTripCount(*Memmove);
1474 if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
1476 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
1477 SrcIsVolatile, DstIsVolatile,
1478 /*CanOverlap=*/false, TTI, std::nullopt, AverageTripCount);
1479 } else {
1481 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign,
1482 DstAlign, SrcIsVolatile, DstIsVolatile,
1483 /*CanOverlap=*/false, TTI, std::nullopt, AverageTripCount);
1484 }
1485
1486 return true;
1487 }
1488
1489 if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
1490 TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
1491 // We don't know generically if it's legal to introduce an
1492 // addrspacecast. We need to know either if it's legal to insert an
1493 // addrspacecast, or if the address spaces cannot alias.
1494 LLVM_DEBUG(
1495 dbgs() << "Do not know how to expand memmove between different "
1496 "address spaces\n");
1497 return false;
1498 }
1499 }
1500
1501 if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
1503 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
1504 SrcIsVolatile, DstIsVolatile, TTI);
1505 } else {
1507 /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
1508 SrcIsVolatile, DstIsVolatile, TTI);
1509 }
1510 return true;
1511}
1512
1514 const TargetTransformInfo *TTI) {
1515 auto AverageTripCount = getAverageMemOpLoopTripCount(*Memset);
1516 if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) {
1518 /*InsertBefore=*/Memset,
1519 /*DstAddr=*/Memset->getRawDest(),
1520 /*Len=*/CI,
1521 /*SetValue=*/Memset->getValue(),
1522 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1523 /*IsVolatile=*/Memset->isVolatile(),
1524 /*TTI=*/TTI,
1525 /*AverageTripCount=*/AverageTripCount);
1526 } else {
1528 /*InsertBefore=*/Memset,
1529 /*DstAddr=*/Memset->getRawDest(),
1530 /*Len=*/Memset->getLength(),
1531 /*SetValue=*/Memset->getValue(),
1532 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1533 /*IsVolatile=*/Memset->isVolatile(),
1534 /*TTI=*/TTI,
1535 /*AverageTripCount=*/AverageTripCount);
1536 }
1537}
1538
1540 const TargetTransformInfo &TTI) {
1541 expandMemSetAsLoop(MemSet, &TTI);
1542}
1543
1545 const TargetTransformInfo *TTI) {
1547 /*InsertBefore=*/Memset,
1548 /*DstAddr=*/Memset->getRawDest(),
1549 /*Len=*/Memset->getLength(),
1550 /*SetValue=*/Memset->getValue(),
1551 /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
1552 /*IsVolatile=*/Memset->isVolatile(),
1553 /*TTI=*/TTI,
1554 /*AverageTripCount=*/getAverageMemOpLoopTripCount(*Memset));
1555}
1556
1561
1563 const TargetTransformInfo &TTI,
1564 ScalarEvolution *SE) {
1565 assert(AtomicMemcpy->isAtomic());
1566 if (ConstantInt *CI = dyn_cast<ConstantInt>(AtomicMemcpy->getLength())) {
1568 /*InsertBefore=*/AtomicMemcpy,
1569 /*SrcAddr=*/AtomicMemcpy->getRawSource(),
1570 /*DstAddr=*/AtomicMemcpy->getRawDest(),
1571 /*CopyLen=*/CI,
1572 /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
1573 /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
1574 /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
1575 /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
1576 /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
1577 /*TTI=*/TTI,
1578 /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
1579 } else {
1581 /*InsertBefore=*/AtomicMemcpy,
1582 /*SrcAddr=*/AtomicMemcpy->getRawSource(),
1583 /*DstAddr=*/AtomicMemcpy->getRawDest(),
1584 /*CopyLen=*/AtomicMemcpy->getLength(),
1585 /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
1586 /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
1587 /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
1588 /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
1589 /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
1590 /*TargetTransformInfo=*/TTI,
1591 /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
1592 }
1593}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF)
Definition Execution.cpp:41
#define DEBUG_TYPE
static Value * createMemSetSplat(const DataLayout &DL, IRBuilderBase &B, Value *SetValue, Type *DstType)
Create a Value of DstType that consists of a sequence of copies of SetValue, using bitcasts and a vec...
static std::pair< Value *, Value * > tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2, const TargetTransformInfo &TTI)
static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr, Value *Len, Value *SetValue, Align DstAlign, bool IsVolatile, const TargetTransformInfo *TTI, std::optional< uint64_t > AverageTripCount)
static bool canOverlap(MemTransferBase< T > *Memcpy, ScalarEvolution *SE)
static void createMemMoveLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI)
static void createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr, Value *Len, Value *SetValue, Align DstAlign, bool IsVolatile, const TargetTransformInfo *TTI, std::optional< uint64_t > AverageTripCount)
static Value * getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len, Value *OpSize, unsigned OpSizeVal)
static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr, ConstantInt *Len, Value *SetValue, Align DstAlign, bool IsVolatile, const TargetTransformInfo *TTI, std::optional< uint64_t > AverageTripCount)
static Value * getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize, unsigned OpSizeVal, Value *RTLoopRemainder=nullptr)
static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore, Value *Len, unsigned MainLoopStep, unsigned ResidualLoopStep, StringRef BBNamePrefix, std::optional< uint64_t > ExpectedUnits)
Insert the control flow and loop counters for a memcpy/memset loop expansion.
static void createMemMoveLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define P(N)
This file contains the declarations for profiling metadata utility functions.
#define LLVM_DEBUG(...)
Definition Debug.h:119
This pass exposes codegen information to IR-level passes.
This class represents any memcpy intrinsic i.e.
uint32_t getElementSizeInBytes() const
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
@ ICMP_NE
not equal
Definition InstrTypes.h:762
Conditional Branch instruction.
static CondBrInst * Create(Value *Cond, BasicBlock *IfTrue, BasicBlock *IfFalse, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2380
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1923
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1216
UnreachableInst * CreateUnreachable()
Definition IRBuilder.h:1358
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:221
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2008
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2368
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1210
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2529
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2364
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1439
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1422
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1942
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
MDNode * createAnonymousAliasScope(MDNode *Domain, StringRef Name=StringRef())
Return metadata appropriate for an alias scope root node.
Definition MDBuilder.h:195
LLVM_ABI MDNode * createLikelyBranchWeights()
Return metadata containing two branch weights, with significant bias towards true destination.
Definition MDBuilder.cpp:43
MDNode * createAnonymousAliasScopeDomain(StringRef Name=StringRef())
Return metadata appropriate for an alias scope domain node.
Definition MDBuilder.h:188
Metadata node.
Definition Metadata.h:1069
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getRawDest() const
MaybeAlign getDestAlign() const
This is the common base class for memset/memcpy/memmove.
bool isVolatile() const
This class wraps the llvm.memmove intrinsic.
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
This class wraps the llvm.experimental.memset.pattern intrinsic.
Common base class for all memory transfer intrinsics.
Value * getRawSource() const
Return the arguments to the instruction.
MaybeAlign getSourceAlign() const
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI bool isKnownPredicateAt(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS, const Instruction *CtxI)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:394
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI, std::optional< uint32_t > AtomicCpySize=std::nullopt, std::optional< uint64_t > AverageTripCount=std::nullopt)
Emit a loop implementing the semantics of an llvm.memcpy whose size is a compile time constant.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruct...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI)
Expand MemMove as a loop.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI SmallVector< InstrProfValueData, 4 > getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst and returns them if Inst is annotated with value profile dat...
TargetTransformInfo TTI
LLVM_ABI void expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemCpy, const TargetTransformInfo &TTI, ScalarEvolution *SE)
Expand AtomicMemCpy as a loop. AtomicMemCpy is not deleted.
LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet, const TargetTransformInfo *TTI=nullptr)
Expand MemSet as a loop.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet, const TargetTransformInfo *TTI=nullptr)
Expand MemSetPattern as a loop.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI, ScalarEvolution *SE=nullptr)
Expand MemCpy as a loop. MemCpy is not deleted.
LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef< uint64_t > Weights, bool IsExpected, bool ElideAllZero=false)
Variant of setBranchWeights where the Weights will be fit first to uint32_t by shifting right.
LLVM_ABI void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI, std::optional< unsigned > AtomicSize=std::nullopt, std::optional< uint64_t > AverageTripCount=std::nullopt)
Emit a loop implementing the semantics of llvm.memcpy where the size is not a compile-time constant.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130