LLVM  15.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1 //===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// This file implements the OpenMPIRBuilder class, which is used as a
11 /// convenient way to create LLVM instructions for OpenMP directives.
12 ///
13 //===----------------------------------------------------------------------===//
14 
16 #include "llvm/ADT/SmallSet.h"
17 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/IR/CFG.h"
25 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/GlobalVariable.h"
28 #include "llvm/IR/IRBuilder.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/IR/PassManager.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/MC/TargetRegistry.h"
40 
41 #include <cstdint>
42 
43 #define DEBUG_TYPE "openmp-ir-builder"
44 
45 using namespace llvm;
46 using namespace omp;
47 
48 static cl::opt<bool>
49  OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
50  cl::desc("Use optimistic attributes describing "
51  "'as-if' properties of runtime calls."),
52  cl::init(false));
53 
55  "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
56  cl::desc("Factor for the unroll threshold to account for code "
57  "simplifications still taking place"),
58  cl::init(1.5));
59 
60 #ifndef NDEBUG
61 /// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
62 /// at position IP1 may change the meaning of IP2 or vice-versa. This is because
63 /// an InsertPoint stores the instruction before something is inserted. For
64 /// instance, if both point to the same instruction, two IRBuilders alternating
65 /// creating instruction will cause the instructions to be interleaved.
68  if (!IP1.isSet() || !IP2.isSet())
69  return false;
70  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
71 }
72 
74  // Valid ordered/unordered and base algorithm combinations.
75  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
117  break;
118  default:
119  return false;
120  }
121 
122  // Must not set both monotonicity modifiers at the same time.
123  OMPScheduleType MonotonicityFlags =
125  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
126  return false;
127 
128  return true;
129 }
130 #endif
131 
132 /// Determine which scheduling algorithm to use, determined from schedule clause
133 /// arguments.
134 static OMPScheduleType
135 getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
136  bool HasSimdModifier) {
137  // Currently, the default schedule it static.
138  switch (ClauseKind) {
139  case OMP_SCHEDULE_Default:
140  case OMP_SCHEDULE_Static:
141  return HasChunks ? OMPScheduleType::BaseStaticChunked
143  case OMP_SCHEDULE_Dynamic:
145  case OMP_SCHEDULE_Guided:
146  return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
148  case OMP_SCHEDULE_Auto:
150  case OMP_SCHEDULE_Runtime:
151  return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
153  }
154  llvm_unreachable("unhandled schedule clause argument");
155 }
156 
157 /// Adds ordering modifier flags to schedule type.
158 static OMPScheduleType
160  bool HasOrderedClause) {
161  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
163  "Must not have ordering nor monotonicity flags already set");
164 
165  OMPScheduleType OrderingModifier = HasOrderedClause
168  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
169 
170  // Unsupported combinations
171  if (OrderingScheduleType ==
174  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
177 
178  return OrderingScheduleType;
179 }
180 
181 /// Adds monotonicity modifier flags to schedule type.
182 static OMPScheduleType
184  bool HasSimdModifier, bool HasMonotonic,
185  bool HasNonmonotonic, bool HasOrderedClause) {
186  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
188  "Must not have monotonicity flags already set");
189  assert((!HasMonotonic || !HasNonmonotonic) &&
190  "Monotonic and Nonmonotonic are contradicting each other");
191 
192  if (HasMonotonic) {
193  return ScheduleType | OMPScheduleType::ModifierMonotonic;
194  } else if (HasNonmonotonic) {
195  return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
196  } else {
197  // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
198  // If the static schedule kind is specified or if the ordered clause is
199  // specified, and if the nonmonotonic modifier is not specified, the
200  // effect is as if the monotonic modifier is specified. Otherwise, unless
201  // the monotonic modifier is specified, the effect is as if the
202  // nonmonotonic modifier is specified.
203  OMPScheduleType BaseScheduleType =
204  ScheduleType & ~OMPScheduleType::ModifierMask;
205  if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
206  (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
207  HasOrderedClause) {
208  // The monotonic is used by default in openmp runtime library, so no need
209  // to set it.
210  return ScheduleType;
211  } else {
212  return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
213  }
214  }
215 }
216 
217 /// Determine the schedule type using schedule and ordering clause arguments.
218 static OMPScheduleType
219 computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
220  bool HasSimdModifier, bool HasMonotonicModifier,
221  bool HasNonmonotonicModifier, bool HasOrderedClause) {
222  OMPScheduleType BaseSchedule =
223  getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
224  OMPScheduleType OrderedSchedule =
225  getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
227  OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
228  HasNonmonotonicModifier, HasOrderedClause);
229 
231  return Result;
232 }
233 
234 /// Make \p Source branch to \p Target.
235 ///
236 /// Handles two situations:
237 /// * \p Source already has an unconditional branch.
238 /// * \p Source is a degenerate block (no terminator because the BB is
239 /// the current head of the IR construction).
241  if (Instruction *Term = Source->getTerminator()) {
242  auto *Br = cast<BranchInst>(Term);
243  assert(!Br->isConditional() &&
244  "BB's terminator must be an unconditional branch (or degenerate)");
245  BasicBlock *Succ = Br->getSuccessor(0);
246  Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
247  Br->setSuccessor(0, Target);
248  return;
249  }
250 
251  auto *NewBr = BranchInst::Create(Target, Source);
252  NewBr->setDebugLoc(DL);
253 }
254 
256  bool CreateBranch) {
257  assert(New->getFirstInsertionPt() == New->begin() &&
258  "Target BB must not have PHI nodes");
259 
260  // Move instructions to new block.
261  BasicBlock *Old = IP.getBlock();
262  New->getInstList().splice(New->begin(), Old->getInstList(), IP.getPoint(),
263  Old->end());
264 
265  if (CreateBranch)
266  BranchInst::Create(New, Old);
267 }
268 
269 void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
270  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
271  BasicBlock *Old = Builder.GetInsertBlock();
272 
273  spliceBB(Builder.saveIP(), New, CreateBranch);
274  if (CreateBranch)
275  Builder.SetInsertPoint(Old->getTerminator());
276  else
277  Builder.SetInsertPoint(Old);
278 
279  // SetInsertPoint also updates the Builder's debug location, but we want to
280  // keep the one the Builder was configured to use.
281  Builder.SetCurrentDebugLocation(DebugLoc);
282 }
283 
285  llvm::Twine Name) {
286  BasicBlock *Old = IP.getBlock();
288  Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
289  Old->getParent(), Old->getNextNode());
290  spliceBB(IP, New, CreateBranch);
291  New->replaceSuccessorsPhiUsesWith(Old, New);
292  return New;
293 }
294 
296  llvm::Twine Name) {
297  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
298  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
299  if (CreateBranch)
300  Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
301  else
302  Builder.SetInsertPoint(Builder.GetInsertBlock());
303  // SetInsertPoint also updates the Builder's debug location, but we want to
304  // keep the one the Builder was configured to use.
305  Builder.SetCurrentDebugLocation(DebugLoc);
306  return New;
307 }
308 
310  llvm::Twine Name) {
311  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
312  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
313  if (CreateBranch)
314  Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
315  else
316  Builder.SetInsertPoint(Builder.GetInsertBlock());
317  // SetInsertPoint also updates the Builder's debug location, but we want to
318  // keep the one the Builder was configured to use.
319  Builder.SetCurrentDebugLocation(DebugLoc);
320  return New;
321 }
322 
324  llvm::Twine Suffix) {
325  BasicBlock *Old = Builder.GetInsertBlock();
326  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
327 }
328 
330  LLVMContext &Ctx = Fn.getContext();
331 
332  // Get the function's current attributes.
333  auto Attrs = Fn.getAttributes();
334  auto FnAttrs = Attrs.getFnAttrs();
335  auto RetAttrs = Attrs.getRetAttrs();
337  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
338  ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
339 
340 #define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
341 #include "llvm/Frontend/OpenMP/OMPKinds.def"
342 
343  // Add attributes to the function declaration.
344  switch (FnID) {
345 #define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
346  case Enum: \
347  FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
348  RetAttrs = RetAttrs.addAttributes(Ctx, RetAttrSet); \
349  for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
350  ArgAttrs[ArgNo] = \
351  ArgAttrs[ArgNo].addAttributes(Ctx, ArgAttrSets[ArgNo]); \
352  Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
353  break;
354 #include "llvm/Frontend/OpenMP/OMPKinds.def"
355  default:
356  // Attributes are optional.
357  break;
358  }
359 }
360 
363  FunctionType *FnTy = nullptr;
364  Function *Fn = nullptr;
365 
366  // Try to find the declation in the module first.
367  switch (FnID) {
368 #define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
369  case Enum: \
370  FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
371  IsVarArg); \
372  Fn = M.getFunction(Str); \
373  break;
374 #include "llvm/Frontend/OpenMP/OMPKinds.def"
375  }
376 
377  if (!Fn) {
378  // Create a new declaration if we need one.
379  switch (FnID) {
380 #define OMP_RTL(Enum, Str, ...) \
381  case Enum: \
382  Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
383  break;
384 #include "llvm/Frontend/OpenMP/OMPKinds.def"
385  }
386 
387  // Add information if the runtime function takes a callback function
388  if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
389  if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
390  LLVMContext &Ctx = Fn->getContext();
391  MDBuilder MDB(Ctx);
392  // Annotate the callback behavior of the runtime function:
393  // - The callback callee is argument number 2 (microtask).
394  // - The first two arguments of the callback callee are unknown (-1).
395  // - All variadic arguments to the runtime function are passed to the
396  // callback callee.
397  Fn->addMetadata(
398  LLVMContext::MD_callback,
400  2, {-1, -1}, /* VarArgsArePassed */ true)}));
401  }
402  }
403 
404  LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
405  << " with type " << *Fn->getFunctionType() << "\n");
406  addAttributes(FnID, *Fn);
407 
408  } else {
409  LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
410  << " with type " << *Fn->getFunctionType() << "\n");
411  }
412 
413  assert(Fn && "Failed to create OpenMP runtime function");
414 
415  // Cast the function to the expected type if necessary
417  return {FnTy, C};
418 }
419 
421  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
422  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
423  assert(Fn && "Failed to create OpenMP runtime function pointer");
424  return Fn;
425 }
426 
427 void OpenMPIRBuilder::initialize() { initializeTypes(M); }
428 
430  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
432  SmallVector<OutlineInfo, 16> DeferredOutlines;
433  for (OutlineInfo &OI : OutlineInfos) {
434  // Skip functions that have not finalized yet; may happen with nested
435  // function generation.
436  if (Fn && OI.getFunction() != Fn) {
437  DeferredOutlines.push_back(OI);
438  continue;
439  }
440 
441  ParallelRegionBlockSet.clear();
442  Blocks.clear();
443  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
444 
445  Function *OuterFn = OI.getFunction();
446  CodeExtractorAnalysisCache CEAC(*OuterFn);
447  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
448  /* AggregateArgs */ true,
449  /* BlockFrequencyInfo */ nullptr,
450  /* BranchProbabilityInfo */ nullptr,
451  /* AssumptionCache */ nullptr,
452  /* AllowVarArgs */ true,
453  /* AllowAlloca */ true,
454  /* AllocaBlock*/ OI.OuterAllocaBB,
455  /* Suffix */ ".omp_par");
456 
457  LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
458  LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
459  << " Exit: " << OI.ExitBB->getName() << "\n");
460  assert(Extractor.isEligible() &&
461  "Expected OpenMP outlining to be possible!");
462 
463  for (auto *V : OI.ExcludeArgsFromAggregate)
464  Extractor.excludeArgFromAggregate(V);
465 
466  Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
467 
468  LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
469  LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
470  assert(OutlinedFn->getReturnType()->isVoidTy() &&
471  "OpenMP outlined functions should not return a value!");
472 
473  // For compability with the clang CG we move the outlined function after the
474  // one with the parallel region.
475  OutlinedFn->removeFromParent();
476  M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
477 
478  // Remove the artificial entry introduced by the extractor right away, we
479  // made our own entry block after all.
480  {
481  BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
482  assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
483  assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
484  // Move instructions from the to-be-deleted ArtificialEntry to the entry
485  // basic block of the parallel region. CodeExtractor generates
486  // instructions to unwrap the aggregate argument and may sink
487  // allocas/bitcasts for values that are solely used in the outlined region
488  // and do not escape.
489  assert(!ArtificialEntry.empty() &&
490  "Expected instructions to add in the outlined region entry");
491  for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
492  End = ArtificialEntry.rend();
493  It != End;) {
494  Instruction &I = *It;
495  It++;
496 
497  if (I.isTerminator())
498  continue;
499 
500  I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
501  }
502 
503  OI.EntryBB->moveBefore(&ArtificialEntry);
504  ArtificialEntry.eraseFromParent();
505  }
506  assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
507  assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
508 
509  // Run a user callback, e.g. to add attributes.
510  if (OI.PostOutlineCB)
511  OI.PostOutlineCB(*OutlinedFn);
512  }
513 
514  // Remove work items that have been completed.
515  OutlineInfos = std::move(DeferredOutlines);
516 }
517 
519  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
520 }
521 
523  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
524  auto *GV =
525  new GlobalVariable(M, I32Ty,
526  /* isConstant = */ true, GlobalValue::WeakODRLinkage,
527  ConstantInt::get(I32Ty, Value), Name);
528  GV->setVisibility(GlobalValue::HiddenVisibility);
529 
530  return GV;
531 }
532 
534  uint32_t SrcLocStrSize,
535  IdentFlag LocFlags,
536  unsigned Reserve2Flags) {
537  // Enable "C-mode".
538  LocFlags |= OMP_IDENT_FLAG_KMPC;
539 
540  Constant *&Ident =
541  IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
542  if (!Ident) {
543  Constant *I32Null = ConstantInt::getNullValue(Int32);
544  Constant *IdentData[] = {I32Null,
545  ConstantInt::get(Int32, uint32_t(LocFlags)),
546  ConstantInt::get(Int32, Reserve2Flags),
547  ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
548  Constant *Initializer =
549  ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
550 
551  // Look for existing encoding of the location + flags, not needed but
552  // minimizes the difference to the existing solution while we transition.
553  for (GlobalVariable &GV : M.getGlobalList())
554  if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
555  if (GV.getInitializer() == Initializer)
556  Ident = &GV;
557 
558  if (!Ident) {
559  auto *GV = new GlobalVariable(
560  M, OpenMPIRBuilder::Ident,
561  /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
563  M.getDataLayout().getDefaultGlobalsAddressSpace());
564  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
565  GV->setAlignment(Align(8));
566  Ident = GV;
567  }
568  }
569 
570  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
571 }
572 
574  uint32_t &SrcLocStrSize) {
575  SrcLocStrSize = LocStr.size();
576  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
577  if (!SrcLocStr) {
578  Constant *Initializer =
579  ConstantDataArray::getString(M.getContext(), LocStr);
580 
581  // Look for existing encoding of the location, not needed but minimizes the
582  // difference to the existing solution while we transition.
583  for (GlobalVariable &GV : M.getGlobalList())
584  if (GV.isConstant() && GV.hasInitializer() &&
585  GV.getInitializer() == Initializer)
586  return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
587 
588  SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
589  /* AddressSpace */ 0, &M);
590  }
591  return SrcLocStr;
592 }
593 
595  StringRef FileName,
596  unsigned Line, unsigned Column,
597  uint32_t &SrcLocStrSize) {
598  SmallString<128> Buffer;
599  Buffer.push_back(';');
600  Buffer.append(FileName);
601  Buffer.push_back(';');
602  Buffer.append(FunctionName);
603  Buffer.push_back(';');
604  Buffer.append(std::to_string(Line));
605  Buffer.push_back(';');
606  Buffer.append(std::to_string(Column));
607  Buffer.push_back(';');
608  Buffer.push_back(';');
609  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
610 }
611 
612 Constant *
614  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
615  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
616 }
617 
619  uint32_t &SrcLocStrSize,
620  Function *F) {
621  DILocation *DIL = DL.get();
622  if (!DIL)
623  return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
624  StringRef FileName = M.getName();
625  if (DIFile *DIF = DIL->getFile())
626  if (Optional<StringRef> Source = DIF->getSource())
627  FileName = *Source;
628  StringRef Function = DIL->getScope()->getSubprogram()->getName();
629  if (Function.empty() && F)
630  Function = F->getName();
631  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
632  DIL->getColumn(), SrcLocStrSize);
633 }
634 
636  uint32_t &SrcLocStrSize) {
637  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
638  Loc.IP.getBlock()->getParent());
639 }
640 
642  return Builder.CreateCall(
643  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
644  "omp_global_thread_num");
645 }
646 
649  bool ForceSimpleCall, bool CheckCancelFlag) {
650  if (!updateToLocation(Loc))
651  return Loc.IP;
652  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
653 }
654 
657  bool ForceSimpleCall, bool CheckCancelFlag) {
658  // Build call __kmpc_cancel_barrier(loc, thread_id) or
659  // __kmpc_barrier(loc, thread_id);
660 
661  IdentFlag BarrierLocFlags;
662  switch (Kind) {
663  case OMPD_for:
664  BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
665  break;
666  case OMPD_sections:
667  BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
668  break;
669  case OMPD_single:
670  BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
671  break;
672  case OMPD_barrier:
673  BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
674  break;
675  default:
676  BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
677  break;
678  }
679 
680  uint32_t SrcLocStrSize;
681  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
682  Value *Args[] = {
683  getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
684  getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
685 
686  // If we are in a cancellable parallel region, barriers are cancellation
687  // points.
688  // TODO: Check why we would force simple calls or to ignore the cancel flag.
689  bool UseCancelBarrier =
690  !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
691 
692  Value *Result =
693  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
694  UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
695  : OMPRTL___kmpc_barrier),
696  Args);
697 
698  if (UseCancelBarrier && CheckCancelFlag)
699  emitCancelationCheckImpl(Result, OMPD_parallel);
700 
701  return Builder.saveIP();
702 }
703 
706  Value *IfCondition,
707  omp::Directive CanceledDirective) {
708  if (!updateToLocation(Loc))
709  return Loc.IP;
710 
711  // LLVM utilities like blocks with terminators.
712  auto *UI = Builder.CreateUnreachable();
713 
714  Instruction *ThenTI = UI, *ElseTI = nullptr;
715  if (IfCondition)
716  SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
717  Builder.SetInsertPoint(ThenTI);
718 
719  Value *CancelKind = nullptr;
720  switch (CanceledDirective) {
721 #define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
722  case DirectiveEnum: \
723  CancelKind = Builder.getInt32(Value); \
724  break;
725 #include "llvm/Frontend/OpenMP/OMPKinds.def"
726  default:
727  llvm_unreachable("Unknown cancel kind!");
728  }
729 
730  uint32_t SrcLocStrSize;
731  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
732  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
733  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
734  Value *Result = Builder.CreateCall(
735  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
736  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
737  if (CanceledDirective == OMPD_parallel) {
739  Builder.restoreIP(IP);
740  createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
741  omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
742  /* CheckCancelFlag */ false);
743  }
744  };
745 
746  // The actual cancel logic is shared with others, e.g., cancel_barriers.
747  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);
748 
749  // Update the insertion point and remove the terminator we introduced.
750  Builder.SetInsertPoint(UI->getParent());
751  UI->eraseFromParent();
752 
753  return Builder.saveIP();
754 }
755 
757  uint64_t Size, int32_t Flags,
759  Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
760  Type *Int32Ty = Type::getInt32Ty(M.getContext());
761  Type *SizeTy = M.getDataLayout().getIntPtrType(M.getContext());
762 
763  Constant *AddrName = ConstantDataArray::getString(M.getContext(), Name);
764 
765  // Create the constant string used to look up the symbol in the device.
766  auto *Str =
767  new llvm::GlobalVariable(M, AddrName->getType(), /*isConstant=*/true,
769  ".omp_offloading.entry_name");
770  Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
771 
772  // Construct the offloading entry.
773  Constant *EntryData[] = {
776  ConstantInt::get(SizeTy, Size),
777  ConstantInt::get(Int32Ty, Flags),
779  };
780  Constant *EntryInitializer =
781  ConstantStruct::get(OpenMPIRBuilder::OffloadEntry, EntryData);
782 
783  auto *Entry = new GlobalVariable(
784  M, OpenMPIRBuilder::OffloadEntry,
785  /* isConstant = */ true, GlobalValue::WeakAnyLinkage, EntryInitializer,
786  ".omp_offloading.entry." + Name, nullptr, GlobalValue::NotThreadLocal,
787  M.getDataLayout().getDefaultGlobalsAddressSpace());
788 
789  // The entry has to be created in the section the linker expects it to be.
790  Entry->setSection(SectionName);
791  Entry->setAlignment(Align(1));
792 }
793 
795  omp::Directive CanceledDirective,
796  FinalizeCallbackTy ExitCB) {
797  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
798  "Unexpected cancellation!");
799 
800  // For a cancel barrier we create two new blocks.
801  BasicBlock *BB = Builder.GetInsertBlock();
802  BasicBlock *NonCancellationBlock;
803  if (Builder.GetInsertPoint() == BB->end()) {
804  // TODO: This branch will not be needed once we moved to the
805  // OpenMPIRBuilder codegen completely.
806  NonCancellationBlock = BasicBlock::Create(
807  BB->getContext(), BB->getName() + ".cont", BB->getParent());
808  } else {
809  NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
810  BB->getTerminator()->eraseFromParent();
811  Builder.SetInsertPoint(BB);
812  }
813  BasicBlock *CancellationBlock = BasicBlock::Create(
814  BB->getContext(), BB->getName() + ".cncl", BB->getParent());
815 
816  // Jump to them based on the return value.
817  Value *Cmp = Builder.CreateIsNull(CancelFlag);
818  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
819  /* TODO weight */ nullptr, nullptr);
820 
821  // From the cancellation block we finalize all variables and go to the
822  // post finalization block that is known to the FiniCB callback.
823  Builder.SetInsertPoint(CancellationBlock);
824  if (ExitCB)
825  ExitCB(Builder.saveIP());
826  auto &FI = FinalizationStack.back();
827  FI.FiniCB(Builder.saveIP());
828 
829  // The continuation block is where code generation continues.
830  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
831 }
832 
834  const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
835  BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
836  FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
837  omp::ProcBindKind ProcBind, bool IsCancellable) {
838  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
839 
840  if (!updateToLocation(Loc))
841  return Loc.IP;
842 
843  uint32_t SrcLocStrSize;
844  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
845  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
846  Value *ThreadID = getOrCreateThreadID(Ident);
847 
848  if (NumThreads) {
849  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
850  Value *Args[] = {
851  Ident, ThreadID,
852  Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
853  Builder.CreateCall(
854  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
855  }
856 
857  if (ProcBind != OMP_PROC_BIND_default) {
858  // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
859  Value *Args[] = {
860  Ident, ThreadID,
861  ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
862  Builder.CreateCall(
863  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
864  }
865 
866  BasicBlock *InsertBB = Builder.GetInsertBlock();
867  Function *OuterFn = InsertBB->getParent();
868 
869  // Save the outer alloca block because the insertion iterator may get
870  // invalidated and we still need this later.
871  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
872 
873  // Vector to remember instructions we used only during the modeling but which
874  // we want to delete at the end.
875  SmallVector<Instruction *, 4> ToBeDeleted;
876 
877  // Change the location to the outer alloca insertion point to create and
878  // initialize the allocas we pass into the parallel region.
879  Builder.restoreIP(OuterAllocaIP);
880  AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
881  AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr");
882 
883  // If there is an if condition we actually use the TIDAddr and ZeroAddr in the
884  // program, otherwise we only need them for modeling purposes to get the
885  // associated arguments in the outlined function. In the former case,
886  // initialize the allocas properly, in the latter case, delete them later.
887  if (IfCondition) {
888  Builder.CreateStore(Constant::getNullValue(Int32), TIDAddr);
889  Builder.CreateStore(Constant::getNullValue(Int32), ZeroAddr);
890  } else {
891  ToBeDeleted.push_back(TIDAddr);
892  ToBeDeleted.push_back(ZeroAddr);
893  }
894 
895  // Create an artificial insertion point that will also ensure the blocks we
896  // are about to split are not degenerated.
897  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
898 
899  Instruction *ThenTI = UI, *ElseTI = nullptr;
900  if (IfCondition)
901  SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
902 
903  BasicBlock *ThenBB = ThenTI->getParent();
904  BasicBlock *PRegEntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.par.entry");
905  BasicBlock *PRegBodyBB =
906  PRegEntryBB->splitBasicBlock(ThenTI, "omp.par.region");
907  BasicBlock *PRegPreFiniBB =
908  PRegBodyBB->splitBasicBlock(ThenTI, "omp.par.pre_finalize");
909  BasicBlock *PRegExitBB =
910  PRegPreFiniBB->splitBasicBlock(ThenTI, "omp.par.exit");
911 
912  auto FiniCBWrapper = [&](InsertPointTy IP) {
913  // Hide "open-ended" blocks from the given FiniCB by setting the right jump
914  // target to the region exit block.
915  if (IP.getBlock()->end() == IP.getPoint()) {
917  Builder.restoreIP(IP);
918  Instruction *I = Builder.CreateBr(PRegExitBB);
919  IP = InsertPointTy(I->getParent(), I->getIterator());
920  }
921  assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
922  IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
923  "Unexpected insertion point for finalization call!");
924  return FiniCB(IP);
925  };
926 
927  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
928 
929  // Generate the privatization allocas in the block that will become the entry
930  // of the outlined function.
931  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
932  InsertPointTy InnerAllocaIP = Builder.saveIP();
933 
934  AllocaInst *PrivTIDAddr =
935  Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
936  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
937 
938  // Add some fake uses for OpenMP provided arguments.
939  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
940  Instruction *ZeroAddrUse =
941  Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
942  ToBeDeleted.push_back(ZeroAddrUse);
943 
944  // ThenBB
945  // |
946  // V
947  // PRegionEntryBB <- Privatization allocas are placed here.
948  // |
949  // V
950  // PRegionBodyBB <- BodeGen is invoked here.
951  // |
952  // V
953  // PRegPreFiniBB <- The block we will start finalization from.
954  // |
955  // V
956  // PRegionExitBB <- A common exit to simplify block collection.
957  //
958 
959  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
960 
961  // Let the caller create the body.
962  assert(BodyGenCB && "Expected body generation callback!");
963  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
964  BodyGenCB(InnerAllocaIP, CodeGenIP);
965 
966  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
967 
968  FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
969  if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
970  if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
971  llvm::LLVMContext &Ctx = F->getContext();
972  MDBuilder MDB(Ctx);
973  // Annotate the callback behavior of the __kmpc_fork_call:
974  // - The callback callee is argument number 2 (microtask).
975  // - The first two arguments of the callback callee are unknown (-1).
976  // - All variadic arguments to the __kmpc_fork_call are passed to the
977  // callback callee.
978  F->addMetadata(
979  llvm::LLVMContext::MD_callback,
981  Ctx, {MDB.createCallbackEncoding(2, {-1, -1},
982  /* VarArgsArePassed */ true)}));
983  }
984  }
985 
986  OutlineInfo OI;
987  OI.PostOutlineCB = [=](Function &OutlinedFn) {
988  // Add some known attributes.
989  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
990  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
991  OutlinedFn.addFnAttr(Attribute::NoUnwind);
992  OutlinedFn.addFnAttr(Attribute::NoRecurse);
993 
994  assert(OutlinedFn.arg_size() >= 2 &&
995  "Expected at least tid and bounded tid as arguments");
996  unsigned NumCapturedVars =
997  OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
998 
999  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1000  CI->getParent()->setName("omp_parallel");
1001  Builder.SetInsertPoint(CI);
1002 
1003  // Build call __kmpc_fork_call(Ident, n, microtask, var1, .., varn);
1004  Value *ForkCallArgs[] = {
1005  Ident, Builder.getInt32(NumCapturedVars),
1006  Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)};
1007 
1008  SmallVector<Value *, 16> RealArgs;
1009  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1010  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1011 
1012  Builder.CreateCall(RTLFn, RealArgs);
1013 
1014  LLVM_DEBUG(dbgs() << "With fork_call placed: "
1015  << *Builder.GetInsertBlock()->getParent() << "\n");
1016 
1017  InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());
1018 
1019  // Initialize the local TID stack location with the argument value.
1020  Builder.SetInsertPoint(PrivTID);
1021  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1022  Builder.CreateStore(Builder.CreateLoad(Int32, OutlinedAI), PrivTIDAddr);
1023 
1024  // If no "if" clause was present we do not need the call created during
1025  // outlining, otherwise we reuse it in the serialized parallel region.
1026  if (!ElseTI) {
1027  CI->eraseFromParent();
1028  } else {
1029 
1030  // If an "if" clause was present we are now generating the serialized
1031  // version into the "else" branch.
1032  Builder.SetInsertPoint(ElseTI);
1033 
1034  // Build calls __kmpc_serialized_parallel(&Ident, GTid);
1035  Value *SerializedParallelCallArgs[] = {Ident, ThreadID};
1036  Builder.CreateCall(
1037  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_serialized_parallel),
1038  SerializedParallelCallArgs);
1039 
1040  // OutlinedFn(&GTid, &zero, CapturedStruct);
1041  CI->removeFromParent();
1042  Builder.Insert(CI);
1043 
1044  // __kmpc_end_serialized_parallel(&Ident, GTid);
1045  Value *EndArgs[] = {Ident, ThreadID};
1046  Builder.CreateCall(
1047  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_serialized_parallel),
1048  EndArgs);
1049 
1050  LLVM_DEBUG(dbgs() << "With serialized parallel region: "
1051  << *Builder.GetInsertBlock()->getParent() << "\n");
1052  }
1053 
1054  for (Instruction *I : ToBeDeleted)
1055  I->eraseFromParent();
1056  };
1057 
1058  // Adjust the finalization stack, verify the adjustment, and call the
1059  // finalize function a last time to finalize values between the pre-fini
1060  // block and the exit block if we left the parallel "the normal way".
1061  auto FiniInfo = FinalizationStack.pop_back_val();
1062  (void)FiniInfo;
1063  assert(FiniInfo.DK == OMPD_parallel &&
1064  "Unexpected finalization stack state!");
1065 
1066  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1067 
1068  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1069  FiniCB(PreFiniIP);
1070 
1071  OI.OuterAllocaBB = OuterAllocaBlock;
1072  OI.EntryBB = PRegEntryBB;
1073  OI.ExitBB = PRegExitBB;
1074 
1075  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1077  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1078 
1079  // Ensure a single exit node for the outlined region by creating one.
1080  // We might have multiple incoming edges to the exit now due to finalizations,
1081  // e.g., cancel calls that cause the control flow to leave the region.
1082  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1083  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1084  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1085  Blocks.push_back(PRegOutlinedExitBB);
1086 
1087  CodeExtractorAnalysisCache CEAC(*OuterFn);
1088  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1089  /* AggregateArgs */ false,
1090  /* BlockFrequencyInfo */ nullptr,
1091  /* BranchProbabilityInfo */ nullptr,
1092  /* AssumptionCache */ nullptr,
1093  /* AllowVarArgs */ true,
1094  /* AllowAlloca */ true,
1095  /* AllocationBlock */ OuterAllocaBlock,
1096  /* Suffix */ ".omp_par");
1097 
1098  // Find inputs to, outputs from the code region.
1099  BasicBlock *CommonExit = nullptr;
1100  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1101  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1102  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
1103 
1104  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1105 
1106  FunctionCallee TIDRTLFn =
1107  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1108 
1109  auto PrivHelper = [&](Value &V) {
1110  if (&V == TIDAddr || &V == ZeroAddr) {
1111  OI.ExcludeArgsFromAggregate.push_back(&V);
1112  return;
1113  }
1114 
1116  for (Use &U : V.uses())
1117  if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1118  if (ParallelRegionBlockSet.count(UserI->getParent()))
1119  Uses.insert(&U);
1120 
1121  // __kmpc_fork_call expects extra arguments as pointers. If the input
1122  // already has a pointer type, everything is fine. Otherwise, store the
1123  // value onto stack and load it back inside the to-be-outlined region. This
1124  // will ensure only the pointer will be passed to the function.
1125  // FIXME: if there are more than 15 trailing arguments, they must be
1126  // additionally packed in a struct.
1127  Value *Inner = &V;
1128  if (!V.getType()->isPointerTy()) {
1130  LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1131 
1132  Builder.restoreIP(OuterAllocaIP);
1133  Value *Ptr =
1134  Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1135 
1136  // Store to stack at end of the block that currently branches to the entry
1137  // block of the to-be-outlined region.
1138  Builder.SetInsertPoint(InsertBB,
1139  InsertBB->getTerminator()->getIterator());
1140  Builder.CreateStore(&V, Ptr);
1141 
1142  // Load back next to allocations in the to-be-outlined region.
1143  Builder.restoreIP(InnerAllocaIP);
1144  Inner = Builder.CreateLoad(V.getType(), Ptr);
1145  }
1146 
1147  Value *ReplacementValue = nullptr;
1148  CallInst *CI = dyn_cast<CallInst>(&V);
1149  if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1150  ReplacementValue = PrivTID;
1151  } else {
1152  Builder.restoreIP(
1153  PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1154  assert(ReplacementValue &&
1155  "Expected copy/create callback to set replacement value!");
1156  if (ReplacementValue == &V)
1157  return;
1158  }
1159 
1160  for (Use *UPtr : Uses)
1161  UPtr->set(ReplacementValue);
1162  };
1163 
1164  // Reset the inner alloca insertion as it will be used for loading the values
1165  // wrapped into pointers before passing them into the to-be-outlined region.
1166  // Configure it to insert immediately after the fake use of zero address so
1167  // that they are available in the generated body and so that the
1168  // OpenMP-related values (thread ID and zero address pointers) remain leading
1169  // in the argument list.
1170  InnerAllocaIP = IRBuilder<>::InsertPoint(
1171  ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1172 
1173  // Reset the outer alloca insertion point to the entry of the relevant block
1174  // in case it was invalidated.
1175  OuterAllocaIP = IRBuilder<>::InsertPoint(
1176  OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1177 
1178  for (Value *Input : Inputs) {
1179  LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1180  PrivHelper(*Input);
1181  }
1182  LLVM_DEBUG({
1183  for (Value *Output : Outputs)
1184  LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1185  });
1186  assert(Outputs.empty() &&
1187  "OpenMP outlining should not produce live-out values!");
1188 
1189  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1190  LLVM_DEBUG({
1191  for (auto *BB : Blocks)
1192  dbgs() << " PBR: " << BB->getName() << "\n";
1193  });
1194 
1195  // Register the outlined info.
1196  addOutlineInfo(std::move(OI));
1197 
1198  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1199  UI->eraseFromParent();
1200 
1201  return AfterIP;
1202 }
1203 
1205  // Build call void __kmpc_flush(ident_t *loc)
1206  uint32_t SrcLocStrSize;
1207  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1208  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1209 
1210  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1211 }
1212 
1214  if (!updateToLocation(Loc))
1215  return;
1216  emitFlush(Loc);
1217 }
1218 
1220  // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1221  // global_tid);
1222  uint32_t SrcLocStrSize;
1223  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1224  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1225  Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1226 
1227  // Ignore return result until untied tasks are supported.
1228  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1229  Args);
1230 }
1231 
1233  if (!updateToLocation(Loc))
1234  return;
1235  emitTaskwaitImpl(Loc);
1236 }
1237 
1239  // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1240  uint32_t SrcLocStrSize;
1241  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1242  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1243  Constant *I32Null = ConstantInt::getNullValue(Int32);
1244  Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1245 
1246  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1247  Args);
1248 }
1249 
1251  if (!updateToLocation(Loc))
1252  return;
1253  emitTaskyieldImpl(Loc);
1254 }
1255 
1258  InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1259  bool Tied, Value *Final) {
1260  if (!updateToLocation(Loc))
1261  return InsertPointTy();
1262 
1263  // The current basic block is split into four basic blocks. After outlining,
1264  // they will be mapped as follows:
1265  // ```
1266  // def current_fn() {
1267  // current_basic_block:
1268  // br label %task.exit
1269  // task.exit:
1270  // ; instructions after task
1271  // }
1272  // def outlined_fn() {
1273  // task.alloca:
1274  // br label %task.body
1275  // task.body:
1276  // ret void
1277  // }
1278  // ```
1279  BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1280  BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1281  BasicBlock *TaskAllocaBB =
1282  splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1283 
1284  OutlineInfo OI;
1285  OI.EntryBB = TaskAllocaBB;
1286  OI.OuterAllocaBB = AllocaIP.getBlock();
1287  OI.ExitBB = TaskExitBB;
1288  OI.PostOutlineCB = [this, &Loc, Tied, Final](Function &OutlinedFn) {
1289  // The input IR here looks like the following-
1290  // ```
1291  // func @current_fn() {
1292  // outlined_fn(%args)
1293  // }
1294  // func @outlined_fn(%args) { ... }
1295  // ```
1296  //
1297  // This is changed to the following-
1298  //
1299  // ```
1300  // func @current_fn() {
1301  // runtime_call(..., wrapper_fn, ...)
1302  // }
1303  // func @wrapper_fn(..., %args) {
1304  // outlined_fn(%args)
1305  // }
1306  // func @outlined_fn(%args) { ... }
1307  // ```
1308 
1309  // The stale call instruction will be replaced with a new call instruction
1310  // for runtime call with a wrapper function.
1311  assert(OutlinedFn.getNumUses() == 1 &&
1312  "there must be a single user for the outlined function");
1313  CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1314 
1315  // HasTaskData is true if any variables are captured in the outlined region,
1316  // false otherwise.
1317  bool HasTaskData = StaleCI->arg_size() > 0;
1318  Builder.SetInsertPoint(StaleCI);
1319 
1320  // Gather the arguments for emitting the runtime call for
1321  // @__kmpc_omp_task_alloc
1322  Function *TaskAllocFn =
1323  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1324 
1325  // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1326  // call.
1327  uint32_t SrcLocStrSize;
1328  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1329  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1330  Value *ThreadID = getOrCreateThreadID(Ident);
1331 
1332  // Argument - `flags`
1333  // Task is tied iff (Flags & 1) == 1.
1334  // Task is untied iff (Flags & 1) == 0.
1335  // Task is final iff (Flags & 2) == 2.
1336  // Task is not final iff (Flags & 2) == 0.
1337  // TODO: Handle the other flags.
1338  Value *Flags = Builder.getInt32(Tied);
1339  if (Final) {
1340  Value *FinalFlag =
1341  Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1342  Flags = Builder.CreateOr(FinalFlag, Flags);
1343  }
1344 
1345  // Argument - `sizeof_kmp_task_t` (TaskSize)
1346  // Tasksize refers to the size in bytes of kmp_task_t data structure
1347  // including private vars accessed in task.
1348  Value *TaskSize = Builder.getInt64(0);
1349  if (HasTaskData) {
1350  AllocaInst *ArgStructAlloca =
1351  dyn_cast<AllocaInst>(StaleCI->getArgOperand(0));
1352  assert(ArgStructAlloca &&
1353  "Unable to find the alloca instruction corresponding to arguments "
1354  "for extracted function");
1355  StructType *ArgStructType =
1356  dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1357  assert(ArgStructType && "Unable to find struct type corresponding to "
1358  "arguments for extracted function");
1359  TaskSize =
1360  Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1361  }
1362 
1363  // TODO: Argument - sizeof_shareds
1364 
1365  // Argument - task_entry (the wrapper function)
1366  // If the outlined function has some captured variables (i.e. HasTaskData is
1367  // true), then the wrapper function will have an additional argument (the
1368  // struct containing captured variables). Otherwise, no such argument will
1369  // be present.
1370  SmallVector<Type *> WrapperArgTys{Builder.getInt32Ty()};
1371  if (HasTaskData)
1372  WrapperArgTys.push_back(OutlinedFn.getArg(0)->getType());
1373  FunctionCallee WrapperFuncVal = M.getOrInsertFunction(
1374  (Twine(OutlinedFn.getName()) + ".wrapper").str(),
1375  FunctionType::get(Builder.getInt32Ty(), WrapperArgTys, false));
1376  Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee());
1377  PointerType *WrapperFuncBitcastType =
1378  FunctionType::get(Builder.getInt32Ty(),
1379  {Builder.getInt32Ty(), Builder.getInt8PtrTy()}, false)
1380  ->getPointerTo();
1381  Value *WrapperFuncBitcast =
1382  ConstantExpr::getBitCast(WrapperFunc, WrapperFuncBitcastType);
1383 
1384  // Emit the @__kmpc_omp_task_alloc runtime call
1385  // The runtime call returns a pointer to an area where the task captured
1386  // variables must be copied before the task is run (NewTaskData)
1387  CallInst *NewTaskData = Builder.CreateCall(
1388  TaskAllocFn,
1389  {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1390  /*sizeof_task=*/TaskSize, /*sizeof_shared=*/Builder.getInt64(0),
1391  /*task_func=*/WrapperFuncBitcast});
1392 
1393  // Copy the arguments for outlined function
1394  if (HasTaskData) {
1395  Value *TaskData = StaleCI->getArgOperand(0);
1396  Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1397  Builder.CreateMemCpy(NewTaskData, Alignment, TaskData, Alignment,
1398  TaskSize);
1399  }
1400 
1401  // Emit the @__kmpc_omp_task runtime call to spawn the task
1402  Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
1403  Builder.CreateCall(TaskFn, {Ident, ThreadID, NewTaskData});
1404 
1405  StaleCI->eraseFromParent();
1406 
1407  // Emit the body for wrapper function
1408  BasicBlock *WrapperEntryBB =
1409  BasicBlock::Create(M.getContext(), "", WrapperFunc);
1410  Builder.SetInsertPoint(WrapperEntryBB);
1411  if (HasTaskData)
1412  Builder.CreateCall(&OutlinedFn, {WrapperFunc->getArg(1)});
1413  else
1414  Builder.CreateCall(&OutlinedFn);
1415  Builder.CreateRet(Builder.getInt32(0));
1416  };
1417 
1418  addOutlineInfo(std::move(OI));
1419 
1420  InsertPointTy TaskAllocaIP =
1421  InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1422  InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1423  BodyGenCB(TaskAllocaIP, TaskBodyIP);
1424  Builder.SetInsertPoint(TaskExitBB);
1425 
1426  return Builder.saveIP();
1427 }
1428 
1430  const LocationDescription &Loc, InsertPointTy AllocaIP,
1432  FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1433  assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1434 
1435  if (!updateToLocation(Loc))
1436  return Loc.IP;
1437 
1438  auto FiniCBWrapper = [&](InsertPointTy IP) {
1439  if (IP.getBlock()->end() != IP.getPoint())
1440  return FiniCB(IP);
1441  // This must be done otherwise any nested constructs using FinalizeOMPRegion
1442  // will fail because that function requires the Finalization Basic Block to
1443  // have a terminator, which is already removed by EmitOMPRegionBody.
1444  // IP is currently at cancelation block.
1445  // We need to backtrack to the condition block to fetch
1446  // the exit block and create a branch from cancelation
1447  // to exit block.
1449  Builder.restoreIP(IP);
1450  auto *CaseBB = IP.getBlock()->getSinglePredecessor();
1451  auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
1452  auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
1453  Instruction *I = Builder.CreateBr(ExitBB);
1454  IP = InsertPointTy(I->getParent(), I->getIterator());
1455  return FiniCB(IP);
1456  };
1457 
1458  FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
1459 
1460  // Each section is emitted as a switch case
1461  // Each finalization callback is handled from clang.EmitOMPSectionDirective()
1462  // -> OMP.createSection() which generates the IR for each section
1463  // Iterate through all sections and emit a switch construct:
1464  // switch (IV) {
1465  // case 0:
1466  // <SectionStmt[0]>;
1467  // break;
1468  // ...
1469  // case <NumSection> - 1:
1470  // <SectionStmt[<NumSection> - 1]>;
1471  // break;
1472  // }
1473  // ...
1474  // section_loop.after:
1475  // <FiniCB>;
1476  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
1477  Builder.restoreIP(CodeGenIP);
1478  BasicBlock *Continue =
1479  splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
1480  Function *CurFn = Continue->getParent();
1481  SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
1482 
1483  unsigned CaseNumber = 0;
1484  for (auto SectionCB : SectionCBs) {
1485  BasicBlock *CaseBB = BasicBlock::Create(
1486  M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
1487  SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
1488  Builder.SetInsertPoint(CaseBB);
1489  BranchInst *CaseEndBr = Builder.CreateBr(Continue);
1490  SectionCB(InsertPointTy(),
1491  {CaseEndBr->getParent(), CaseEndBr->getIterator()});
1492  CaseNumber++;
1493  }
1494  // remove the existing terminator from body BB since there can be no
1495  // terminators after switch/case
1496  };
1497  // Loop body ends here
1498  // LowerBound, UpperBound, and STride for createCanonicalLoop
1499  Type *I32Ty = Type::getInt32Ty(M.getContext());
1500  Value *LB = ConstantInt::get(I32Ty, 0);
1501  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
1502  Value *ST = ConstantInt::get(I32Ty, 1);
1503  llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
1504  Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
1505  InsertPointTy AfterIP =
1506  applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
1507 
1508  // Apply the finalization callback in LoopAfterBB
1509  auto FiniInfo = FinalizationStack.pop_back_val();
1510  assert(FiniInfo.DK == OMPD_sections &&
1511  "Unexpected finalization stack state!");
1512  if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
1513  Builder.restoreIP(AfterIP);
1514  BasicBlock *FiniBB =
1515  splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
1516  CB(Builder.saveIP());
1517  AfterIP = {FiniBB, FiniBB->begin()};
1518  }
1519 
1520  return AfterIP;
1521 }
1522 
1525  BodyGenCallbackTy BodyGenCB,
1526  FinalizeCallbackTy FiniCB) {
1527  if (!updateToLocation(Loc))
1528  return Loc.IP;
1529 
1530  auto FiniCBWrapper = [&](InsertPointTy IP) {
1531  if (IP.getBlock()->end() != IP.getPoint())
1532  return FiniCB(IP);
1533  // This must be done otherwise any nested constructs using FinalizeOMPRegion
1534  // will fail because that function requires the Finalization Basic Block to
1535  // have a terminator, which is already removed by EmitOMPRegionBody.
1536  // IP is currently at cancelation block.
1537  // We need to backtrack to the condition block to fetch
1538  // the exit block and create a branch from cancelation
1539  // to exit block.
1541  Builder.restoreIP(IP);
1542  auto *CaseBB = Loc.IP.getBlock();
1543  auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
1544  auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
1545  Instruction *I = Builder.CreateBr(ExitBB);
1546  IP = InsertPointTy(I->getParent(), I->getIterator());
1547  return FiniCB(IP);
1548  };
1549 
1550  Directive OMPD = Directive::OMPD_sections;
1551  // Since we are using Finalization Callback here, HasFinalize
1552  // and IsCancellable have to be true
1553  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
1554  /*Conditional*/ false, /*hasFinalize*/ true,
1555  /*IsCancellable*/ true);
1556 }
1557 
1558 /// Create a function with a unique name and a "void (i8*, i8*)" signature in
1559 /// the given module and return it.
1561  Type *VoidTy = Type::getVoidTy(M.getContext());
1562  Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
1563  auto *FuncTy =
1564  FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
1566  M.getDataLayout().getDefaultGlobalsAddressSpace(),
1567  ".omp.reduction.func", &M);
1568 }
1569 
1571  const LocationDescription &Loc, InsertPointTy AllocaIP,
1572  ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
1573  for (const ReductionInfo &RI : ReductionInfos) {
1574  (void)RI;
1575  assert(RI.Variable && "expected non-null variable");
1576  assert(RI.PrivateVariable && "expected non-null private variable");
1577  assert(RI.ReductionGen && "expected non-null reduction generator callback");
1578  assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
1579  "expected variables and their private equivalents to have the same "
1580  "type");
1581  assert(RI.Variable->getType()->isPointerTy() &&
1582  "expected variables to be pointers");
1583  }
1584 
1585  if (!updateToLocation(Loc))
1586  return InsertPointTy();
1587 
1588  BasicBlock *InsertBlock = Loc.IP.getBlock();
1589  BasicBlock *ContinuationBlock =
1590  InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
1591  InsertBlock->getTerminator()->eraseFromParent();
1592 
1593  // Create and populate array of type-erased pointers to private reduction
1594  // values.
1595  unsigned NumReductions = ReductionInfos.size();
1596  Type *RedArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumReductions);
1597  Builder.restoreIP(AllocaIP);
1598  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
1599 
1600  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
1601 
1602  for (auto En : enumerate(ReductionInfos)) {
1603  unsigned Index = En.index();
1604  const ReductionInfo &RI = En.value();
1605  Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
1606  RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
1607  Value *Casted =
1608  Builder.CreateBitCast(RI.PrivateVariable, Builder.getInt8PtrTy(),
1609  "private.red.var." + Twine(Index) + ".casted");
1610  Builder.CreateStore(Casted, RedArrayElemPtr);
1611  }
1612 
1613  // Emit a call to the runtime function that orchestrates the reduction.
1614  // Declare the reduction function in the process.
1615  Function *Func = Builder.GetInsertBlock()->getParent();
1616  Module *Module = Func->getParent();
1617  Value *RedArrayPtr =
1618  Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr");
1619  uint32_t SrcLocStrSize;
1620  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1621  bool CanGenerateAtomic =
1622  llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
1623  return RI.AtomicReductionGen;
1624  });
1625  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
1626  CanGenerateAtomic
1627  ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
1628  : IdentFlag(0));
1629  Value *ThreadId = getOrCreateThreadID(Ident);
1630  Constant *NumVariables = Builder.getInt32(NumReductions);
1631  const DataLayout &DL = Module->getDataLayout();
1632  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
1633  Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
1634  Function *ReductionFunc = getFreshReductionFunc(*Module);
1635  Value *Lock = getOMPCriticalRegionLock(".reduction");
1636  Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
1637  IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
1638  : RuntimeFunction::OMPRTL___kmpc_reduce);
1639  CallInst *ReduceCall =
1640  Builder.CreateCall(ReduceFunc,
1641  {Ident, ThreadId, NumVariables, RedArraySize,
1642  RedArrayPtr, ReductionFunc, Lock},
1643  "reduce");
1644 
1645  // Create final reduction entry blocks for the atomic and non-atomic case.
1646  // Emit IR that dispatches control flow to one of the blocks based on the
1647  // reduction supporting the atomic mode.
1648  BasicBlock *NonAtomicRedBlock =
1649  BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
1650  BasicBlock *AtomicRedBlock =
1651  BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
1652  SwitchInst *Switch =
1653  Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
1654  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
1655  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
1656 
1657  // Populate the non-atomic reduction using the elementwise reduction function.
1658  // This loads the elements from the global and private variables and reduces
1659  // them before storing back the result to the global variable.
1660  Builder.SetInsertPoint(NonAtomicRedBlock);
1661  for (auto En : enumerate(ReductionInfos)) {
1662  const ReductionInfo &RI = En.value();
1663  Type *ValueType = RI.ElementType;
1664  Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
1665  "red.value." + Twine(En.index()));
1666  Value *PrivateRedValue =
1667  Builder.CreateLoad(ValueType, RI.PrivateVariable,
1668  "red.private.value." + Twine(En.index()));
1669  Value *Reduced;
1670  Builder.restoreIP(
1671  RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
1672  if (!Builder.GetInsertBlock())
1673  return InsertPointTy();
1674  Builder.CreateStore(Reduced, RI.Variable);
1675  }
1676  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
1677  IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
1678  : RuntimeFunction::OMPRTL___kmpc_end_reduce);
1679  Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
1680  Builder.CreateBr(ContinuationBlock);
1681 
1682  // Populate the atomic reduction using the atomic elementwise reduction
1683  // function. There are no loads/stores here because they will be happening
1684  // inside the atomic elementwise reduction.
1685  Builder.SetInsertPoint(AtomicRedBlock);
1686  if (CanGenerateAtomic) {
1687  for (const ReductionInfo &RI : ReductionInfos) {
1688  Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
1689  RI.Variable, RI.PrivateVariable));
1690  if (!Builder.GetInsertBlock())
1691  return InsertPointTy();
1692  }
1693  Builder.CreateBr(ContinuationBlock);
1694  } else {
1695  Builder.CreateUnreachable();
1696  }
1697 
1698  // Populate the outlined reduction function using the elementwise reduction
1699  // function. Partial values are extracted from the type-erased array of
1700  // pointers to private variables.
1701  BasicBlock *ReductionFuncBlock =
1702  BasicBlock::Create(Module->getContext(), "", ReductionFunc);
1703  Builder.SetInsertPoint(ReductionFuncBlock);
1704  Value *LHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(0),
1705  RedArrayTy->getPointerTo());
1706  Value *RHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(1),
1707  RedArrayTy->getPointerTo());
1708  for (auto En : enumerate(ReductionInfos)) {
1709  const ReductionInfo &RI = En.value();
1710  Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
1711  RedArrayTy, LHSArrayPtr, 0, En.index());
1712  Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr);
1713  Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
1714  Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
1715  Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
1716  RedArrayTy, RHSArrayPtr, 0, En.index());
1717  Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr);
1718  Value *RHSPtr =
1719  Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
1720  Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
1721  Value *Reduced;
1722  Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
1723  if (!Builder.GetInsertBlock())
1724  return InsertPointTy();
1725  Builder.CreateStore(Reduced, LHSPtr);
1726  }
1727  Builder.CreateRetVoid();
1728 
1729  Builder.SetInsertPoint(ContinuationBlock);
1730  return Builder.saveIP();
1731 }
1732 
1735  BodyGenCallbackTy BodyGenCB,
1736  FinalizeCallbackTy FiniCB) {
1737 
1738  if (!updateToLocation(Loc))
1739  return Loc.IP;
1740 
1741  Directive OMPD = Directive::OMPD_master;
1742  uint32_t SrcLocStrSize;
1743  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1744  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1745  Value *ThreadId = getOrCreateThreadID(Ident);
1746  Value *Args[] = {Ident, ThreadId};
1747 
1748  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
1749  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
1750 
1751  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
1752  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
1753 
1754  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
1755  /*Conditional*/ true, /*hasFinalize*/ true);
1756 }
1757 
1760  BodyGenCallbackTy BodyGenCB,
1761  FinalizeCallbackTy FiniCB, Value *Filter) {
1762  if (!updateToLocation(Loc))
1763  return Loc.IP;
1764 
1765  Directive OMPD = Directive::OMPD_masked;
1766  uint32_t SrcLocStrSize;
1767  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1768  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1769  Value *ThreadId = getOrCreateThreadID(Ident);
1770  Value *Args[] = {Ident, ThreadId, Filter};
1771  Value *ArgsEnd[] = {Ident, ThreadId};
1772 
1773  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
1774  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
1775 
1776  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
1777  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
1778 
1779  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
1780  /*Conditional*/ true, /*hasFinalize*/ true);
1781 }
1782 
1784  DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
1785  BasicBlock *PostInsertBefore, const Twine &Name) {
1786  Module *M = F->getParent();
1787  LLVMContext &Ctx = M->getContext();
1788  Type *IndVarTy = TripCount->getType();
1789 
1790  // Create the basic block structure.
1791  BasicBlock *Preheader =
1792  BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
1793  BasicBlock *Header =
1794  BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
1795  BasicBlock *Cond =
1796  BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
1797  BasicBlock *Body =
1798  BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
1799  BasicBlock *Latch =
1800  BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
1801  BasicBlock *Exit =
1802  BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
1803  BasicBlock *After =
1804  BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
1805 
1806  // Use specified DebugLoc for new instructions.
1807  Builder.SetCurrentDebugLocation(DL);
1808 
1809  Builder.SetInsertPoint(Preheader);
1810  Builder.CreateBr(Header);
1811 
1812  Builder.SetInsertPoint(Header);
1813  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
1814  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
1815  Builder.CreateBr(Cond);
1816 
1817  Builder.SetInsertPoint(Cond);
1818  Value *Cmp =
1819  Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
1820  Builder.CreateCondBr(Cmp, Body, Exit);
1821 
1822  Builder.SetInsertPoint(Body);
1823  Builder.CreateBr(Latch);
1824 
1825  Builder.SetInsertPoint(Latch);
1826  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
1827  "omp_" + Name + ".next", /*HasNUW=*/true);
1828  Builder.CreateBr(Header);
1829  IndVarPHI->addIncoming(Next, Latch);
1830 
1831  Builder.SetInsertPoint(Exit);
1832  Builder.CreateBr(After);
1833 
1834  // Remember and return the canonical control flow.
1835  LoopInfos.emplace_front();
1836  CanonicalLoopInfo *CL = &LoopInfos.front();
1837 
1838  CL->Header = Header;
1839  CL->Cond = Cond;
1840  CL->Latch = Latch;
1841  CL->Exit = Exit;
1842 
1843 #ifndef NDEBUG
1844  CL->assertOK();
1845 #endif
1846  return CL;
1847 }
1848 
1851  LoopBodyGenCallbackTy BodyGenCB,
1852  Value *TripCount, const Twine &Name) {
1853  BasicBlock *BB = Loc.IP.getBlock();
1854  BasicBlock *NextBB = BB->getNextNode();
1855 
1856  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
1857  NextBB, NextBB, Name);
1858  BasicBlock *After = CL->getAfter();
1859 
1860  // If location is not set, don't connect the loop.
1861  if (updateToLocation(Loc)) {
1862  // Split the loop at the insertion point: Branch to the preheader and move
1863  // every following instruction to after the loop (the After BB). Also, the
1864  // new successor is the loop's after block.
1865  spliceBB(Builder, After, /*CreateBranch=*/false);
1866  Builder.CreateBr(CL->getPreheader());
1867  }
1868 
1869  // Emit the body content. We do it after connecting the loop to the CFG to
1870  // avoid that the callback encounters degenerate BBs.
1871  BodyGenCB(CL->getBodyIP(), CL->getIndVar());
1872 
1873 #ifndef NDEBUG
1874  CL->assertOK();
1875 #endif
1876  return CL;
1877 }
1878 
1880  const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
1881  Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
1882  InsertPointTy ComputeIP, const Twine &Name) {
1883 
1884  // Consider the following difficulties (assuming 8-bit signed integers):
1885  // * Adding \p Step to the loop counter which passes \p Stop may overflow:
1886  // DO I = 1, 100, 50
1887  /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
1888  // DO I = 100, 0, -128
1889 
1890  // Start, Stop and Step must be of the same integer type.
1891  auto *IndVarTy = cast<IntegerType>(Start->getType());
1892  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
1893  assert(IndVarTy == Step->getType() && "Step type mismatch");
1894 
1895  LocationDescription ComputeLoc =
1896  ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
1897  updateToLocation(ComputeLoc);
1898 
1899  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
1900  ConstantInt *One = ConstantInt::get(IndVarTy, 1);
1901 
1902  // Like Step, but always positive.
1903  Value *Incr = Step;
1904 
1905  // Distance between Start and Stop; always positive.
1906  Value *Span;
1907 
1908  // Condition whether there are no iterations are executed at all, e.g. because
1909  // UB < LB.
1910  Value *ZeroCmp;
1911 
1912  if (IsSigned) {
1913  // Ensure that increment is positive. If not, negate and invert LB and UB.
1914  Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
1915  Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
1916  Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
1917  Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
1918  Span = Builder.CreateSub(UB, LB, "", false, true);
1919  ZeroCmp = Builder.CreateICmp(
1920  InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
1921  } else {
1922  Span = Builder.CreateSub(Stop, Start, "", true);
1923  ZeroCmp = Builder.CreateICmp(
1924  InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
1925  }
1926 
1927  Value *CountIfLooping;
1928  if (InclusiveStop) {
1929  CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
1930  } else {
1931  // Avoid incrementing past stop since it could overflow.
1932  Value *CountIfTwo = Builder.CreateAdd(
1933  Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
1934  Value *OneCmp = Builder.CreateICmp(
1935  InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Span, Incr);
1936  CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
1937  }
1938  Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
1939  "omp_" + Name + ".tripcount");
1940 
1941  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
1942  Builder.restoreIP(CodeGenIP);
1943  Value *Span = Builder.CreateMul(IV, Step);
1944  Value *IndVar = Builder.CreateAdd(Span, Start);
1945  BodyGenCB(Builder.saveIP(), IndVar);
1946  };
1947  LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
1948  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
1949 }
1950 
1951 // Returns an LLVM function to call for initializing loop bounds using OpenMP
1952 // static scheduling depending on `type`. Only i32 and i64 are supported by the
1953 // runtime. Always interpret integers as unsigned similarly to
1954 // CanonicalLoopInfo.
1956  OpenMPIRBuilder &OMPBuilder) {
1957  unsigned Bitwidth = Ty->getIntegerBitWidth();
1958  if (Bitwidth == 32)
1959  return OMPBuilder.getOrCreateRuntimeFunction(
1960  M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
1961  if (Bitwidth == 64)
1962  return OMPBuilder.getOrCreateRuntimeFunction(
1963  M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
1964  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
1965 }
1966 
1968 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
1969  InsertPointTy AllocaIP,
1970  bool NeedsBarrier) {
1971  assert(CLI->isValid() && "Requires a valid canonical loop");
1972  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
1973  "Require dedicated allocate IP");
1974 
1975  // Set up the source location value for OpenMP runtime.
1976  Builder.restoreIP(CLI->getPreheaderIP());
1977  Builder.SetCurrentDebugLocation(DL);
1978 
1979  uint32_t SrcLocStrSize;
1980  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
1981  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1982 
1983  // Declare useful OpenMP runtime functions.
1984  Value *IV = CLI->getIndVar();
1985  Type *IVTy = IV->getType();
1986  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
1987  FunctionCallee StaticFini =
1988  getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
1989 
1990  // Allocate space for computed loop bounds as expected by the "init" function.
1991  Builder.restoreIP(AllocaIP);
1992  Type *I32Type = Type::getInt32Ty(M.getContext());
1993  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
1994  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
1995  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
1996  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
1997 
1998  // At the end of the preheader, prepare for calling the "init" function by
1999  // storing the current loop bounds into the allocated space. A canonical loop
2000  // always iterates from 0 to trip-count with step 1. Note that "init" expects
2001  // and produces an inclusive upper bound.
2002  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2003  Constant *Zero = ConstantInt::get(IVTy, 0);
2004  Constant *One = ConstantInt::get(IVTy, 1);
2005  Builder.CreateStore(Zero, PLowerBound);
2006  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
2007  Builder.CreateStore(UpperBound, PUpperBound);
2008  Builder.CreateStore(One, PStride);
2009 
2010  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2011 
2012  Constant *SchedulingType = ConstantInt::get(
2013  I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
2014 
2015  // Call the "init" function and update the trip count of the loop with the
2016  // value it produced.
2017  Builder.CreateCall(StaticInit,
2018  {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
2019  PUpperBound, PStride, One, Zero});
2020  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
2021  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
2022  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
2023  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
2024  CLI->setTripCount(TripCount);
2025 
2026  // Update all uses of the induction variable except the one in the condition
2027  // block that compares it with the actual upper bound, and the increment in
2028  // the latch block.
2029 
2030  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
2031  Builder.SetInsertPoint(CLI->getBody(),
2032  CLI->getBody()->getFirstInsertionPt());
2033  Builder.SetCurrentDebugLocation(DL);
2034  return Builder.CreateAdd(OldIV, LowerBound);
2035  });
2036 
2037  // In the "exit" block, call the "fini" function.
2038  Builder.SetInsertPoint(CLI->getExit(),
2039  CLI->getExit()->getTerminator()->getIterator());
2040  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2041 
2042  // Add the barrier if requested.
2043  if (NeedsBarrier)
2044  createBarrier(LocationDescription(Builder.saveIP(), DL),
2045  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
2046  /* CheckCancelFlag */ false);
2047 
2048  InsertPointTy AfterIP = CLI->getAfterIP();
2049  CLI->invalidate();
2050 
2051  return AfterIP;
2052 }
2053 
2054 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
2055  DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2056  bool NeedsBarrier, Value *ChunkSize) {
2057  assert(CLI->isValid() && "Requires a valid canonical loop");
2058  assert(ChunkSize && "Chunk size is required");
2059 
2060  LLVMContext &Ctx = CLI->getFunction()->getContext();
2061  Value *IV = CLI->getIndVar();
2062  Value *OrigTripCount = CLI->getTripCount();
2063  Type *IVTy = IV->getType();
2064  assert(IVTy->getIntegerBitWidth() <= 64 &&
2065  "Max supported tripcount bitwidth is 64 bits");
2066  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
2067  : Type::getInt64Ty(Ctx);
2068  Type *I32Type = Type::getInt32Ty(M.getContext());
2069  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
2070  Constant *One = ConstantInt::get(InternalIVTy, 1);
2071 
2072  // Declare useful OpenMP runtime functions.
2073  FunctionCallee StaticInit =
2074  getKmpcForStaticInitForType(InternalIVTy, M, *this);
2075  FunctionCallee StaticFini =
2076  getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2077 
2078  // Allocate space for computed loop bounds as expected by the "init" function.
2079  Builder.restoreIP(AllocaIP);
2080  Builder.SetCurrentDebugLocation(DL);
2081  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2082  Value *PLowerBound =
2083  Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
2084  Value *PUpperBound =
2085  Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
2086  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
2087 
2088  // Set up the source location value for the OpenMP runtime.
2089  Builder.restoreIP(CLI->getPreheaderIP());
2090  Builder.SetCurrentDebugLocation(DL);
2091 
2092  // TODO: Detect overflow in ubsan or max-out with current tripcount.
2093  Value *CastedChunkSize =
2094  Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
2095  Value *CastedTripCount =
2096  Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
2097 
2098  Constant *SchedulingType = ConstantInt::get(
2099  I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
2100  Builder.CreateStore(Zero, PLowerBound);
2101  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
2102  Builder.CreateStore(OrigUpperBound, PUpperBound);
2103  Builder.CreateStore(One, PStride);
2104 
2105  // Call the "init" function and update the trip count of the loop with the
2106  // value it produced.
2107  uint32_t SrcLocStrSize;
2108  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2109  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2110  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2111  Builder.CreateCall(StaticInit,
2112  {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
2113  /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
2114  /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
2115  /*pstride=*/PStride, /*incr=*/One,
2116  /*chunk=*/CastedChunkSize});
2117 
2118  // Load values written by the "init" function.
2119  Value *FirstChunkStart =
2120  Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
2121  Value *FirstChunkStop =
2122  Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
2123  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
2124  Value *ChunkRange =
2125  Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
2126  Value *NextChunkStride =
2127  Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
2128 
2129  // Create outer "dispatch" loop for enumerating the chunks.
2130  BasicBlock *DispatchEnter = splitBB(Builder, true);
2131  Value *DispatchCounter;
2132  CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
2133  {Builder.saveIP(), DL},
2134  [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
2135  FirstChunkStart, CastedTripCount, NextChunkStride,
2136  /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
2137  "dispatch");
2138 
2139  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
2140  // not have to preserve the canonical invariant.
2141  BasicBlock *DispatchBody = DispatchCLI->getBody();
2142  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
2143  BasicBlock *DispatchExit = DispatchCLI->getExit();
2144  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
2145  DispatchCLI->invalidate();
2146 
2147  // Rewire the original loop to become the chunk loop inside the dispatch loop.
2148  redirectTo(DispatchAfter, CLI->getAfter(), DL);
2149  redirectTo(CLI->getExit(), DispatchLatch, DL);
2150  redirectTo(DispatchBody, DispatchEnter, DL);
2151 
2152  // Prepare the prolog of the chunk loop.
2153  Builder.restoreIP(CLI->getPreheaderIP());
2154  Builder.SetCurrentDebugLocation(DL);
2155 
2156  // Compute the number of iterations of the chunk loop.
2157  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2158  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
2159  Value *IsLastChunk =
2160  Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
2161  Value *CountUntilOrigTripCount =
2162  Builder.CreateSub(CastedTripCount, DispatchCounter);
2163  Value *ChunkTripCount = Builder.CreateSelect(
2164  IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
2165  Value *BackcastedChunkTC =
2166  Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
2167  CLI->setTripCount(BackcastedChunkTC);
2168 
2169  // Update all uses of the induction variable except the one in the condition
2170  // block that compares it with the actual upper bound, and the increment in
2171  // the latch block.
2172  Value *BackcastedDispatchCounter =
2173  Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
2174  CLI->mapIndVar([&](Instruction *) -> Value * {
2175  Builder.restoreIP(CLI->getBodyIP());
2176  return Builder.CreateAdd(IV, BackcastedDispatchCounter);
2177  });
2178 
2179  // In the "exit" block, call the "fini" function.
2180  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
2181  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2182 
2183  // Add the barrier if requested.
2184  if (NeedsBarrier)
2185  createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
2186  /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
2187 
2188 #ifndef NDEBUG
2189  // Even though we currently do not support applying additional methods to it,
2190  // the chunk loop should remain a canonical loop.
2191  CLI->assertOK();
2192 #endif
2193 
2194  return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
2195 }
2196 
2198  DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2199  bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind,
2200  llvm::Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier,
2201  bool HasNonmonotonicModifier, bool HasOrderedClause) {
2202  OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
2203  SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
2204  HasNonmonotonicModifier, HasOrderedClause);
2205 
2206  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
2208  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
2210  assert(!ChunkSize && "No chunk size with static-chunked schedule");
2211  if (IsOrdered)
2212  return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2213  NeedsBarrier, ChunkSize);
2214  // FIXME: Monotonicity ignored?
2215  return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
2216 
2218  if (IsOrdered)
2219  return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2220  NeedsBarrier, ChunkSize);
2221  // FIXME: Monotonicity ignored?
2222  return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
2223  ChunkSize);
2224 
2232  assert(!ChunkSize &&
2233  "schedule type does not support user-defined chunk sizes");
2240  return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2241  NeedsBarrier, ChunkSize);
2242 
2243  default:
2244  llvm_unreachable("Unknown/unimplemented schedule kind");
2245  }
2246 }
2247 
2248 /// Returns an LLVM function to call for initializing loop bounds using OpenMP
2249 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
2250 /// the runtime. Always interpret integers as unsigned similarly to
2251 /// CanonicalLoopInfo.
2252 static FunctionCallee
2254  unsigned Bitwidth = Ty->getIntegerBitWidth();
2255  if (Bitwidth == 32)
2256  return OMPBuilder.getOrCreateRuntimeFunction(
2257  M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
2258  if (Bitwidth == 64)
2259  return OMPBuilder.getOrCreateRuntimeFunction(
2260  M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
2261  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2262 }
2263 
2264 /// Returns an LLVM function to call for updating the next loop using OpenMP
2265 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
2266 /// the runtime. Always interpret integers as unsigned similarly to
2267 /// CanonicalLoopInfo.
2268 static FunctionCallee
2270  unsigned Bitwidth = Ty->getIntegerBitWidth();
2271  if (Bitwidth == 32)
2272  return OMPBuilder.getOrCreateRuntimeFunction(
2273  M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
2274  if (Bitwidth == 64)
2275  return OMPBuilder.getOrCreateRuntimeFunction(
2276  M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
2277  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2278 }
2279 
2280 /// Returns an LLVM function to call for finalizing the dynamic loop using
2281 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
2282 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
2283 static FunctionCallee
2285  unsigned Bitwidth = Ty->getIntegerBitWidth();
2286  if (Bitwidth == 32)
2287  return OMPBuilder.getOrCreateRuntimeFunction(
2288  M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
2289  if (Bitwidth == 64)
2290  return OMPBuilder.getOrCreateRuntimeFunction(
2291  M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
2292  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2293 }
2294 
2295 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
2296  DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2297  OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
2298  assert(CLI->isValid() && "Requires a valid canonical loop");
2299  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
2300  "Require dedicated allocate IP");
2302  "Require valid schedule type");
2303 
2304  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
2306 
2307  // Set up the source location value for OpenMP runtime.
2308  Builder.SetCurrentDebugLocation(DL);
2309 
2310  uint32_t SrcLocStrSize;
2311  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2312  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2313 
2314  // Declare useful OpenMP runtime functions.
2315  Value *IV = CLI->getIndVar();
2316  Type *IVTy = IV->getType();
2317  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
2318  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
2319 
2320  // Allocate space for computed loop bounds as expected by the "init" function.
2321  Builder.restoreIP(AllocaIP);
2322  Type *I32Type = Type::getInt32Ty(M.getContext());
2323  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2324  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
2325  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
2326  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
2327 
2328  // At the end of the preheader, prepare for calling the "init" function by
2329  // storing the current loop bounds into the allocated space. A canonical loop
2330  // always iterates from 0 to trip-count with step 1. Note that "init" expects
2331  // and produces an inclusive upper bound.
2332  BasicBlock *PreHeader = CLI->getPreheader();
2333  Builder.SetInsertPoint(PreHeader->getTerminator());
2334  Constant *One = ConstantInt::get(IVTy, 1);
2335  Builder.CreateStore(One, PLowerBound);
2336  Value *UpperBound = CLI->getTripCount();
2337  Builder.CreateStore(UpperBound, PUpperBound);
2338  Builder.CreateStore(One, PStride);
2339 
2340  BasicBlock *Header = CLI->getHeader();
2341  BasicBlock *Exit = CLI->getExit();
2342  BasicBlock *Cond = CLI->getCond();
2343  BasicBlock *Latch = CLI->getLatch();
2344  InsertPointTy AfterIP = CLI->getAfterIP();
2345 
2346  // The CLI will be "broken" in the code below, as the loop is no longer
2347  // a valid canonical loop.
2348 
2349  if (!Chunk)
2350  Chunk = One;
2351 
2352  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2353 
2354  Constant *SchedulingType =
2355  ConstantInt::get(I32Type, static_cast<int>(SchedType));
2356 
2357  // Call the "init" function.
2358  Builder.CreateCall(DynamicInit,
2359  {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
2360  UpperBound, /* step */ One, Chunk});
2361 
2362  // An outer loop around the existing one.
2363  BasicBlock *OuterCond = BasicBlock::Create(
2364  PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
2365  PreHeader->getParent());
2366  // This needs to be 32-bit always, so can't use the IVTy Zero above.
2367  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
2368  Value *Res =
2369  Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
2370  PLowerBound, PUpperBound, PStride});
2371  Constant *Zero32 = ConstantInt::get(I32Type, 0);
2372  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
2373  Value *LowerBound =
2374  Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
2375  Builder.CreateCondBr(MoreWork, Header, Exit);
2376 
2377  // Change PHI-node in loop header to use outer cond rather than preheader,
2378  // and set IV to the LowerBound.
2379  Instruction *Phi = &Header->front();
2380  auto *PI = cast<PHINode>(Phi);
2381  PI->setIncomingBlock(0, OuterCond);
2382  PI->setIncomingValue(0, LowerBound);
2383 
2384  // Then set the pre-header to jump to the OuterCond
2385  Instruction *Term = PreHeader->getTerminator();
2386  auto *Br = cast<BranchInst>(Term);
2387  Br->setSuccessor(0, OuterCond);
2388 
2389  // Modify the inner condition:
2390  // * Use the UpperBound returned from the DynamicNext call.
2391  // * jump to the loop outer loop when done with one of the inner loops.
2392  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
2393  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
2394  Instruction *Comp = &*Builder.GetInsertPoint();
2395  auto *CI = cast<CmpInst>(Comp);
2396  CI->setOperand(1, UpperBound);
2397  // Redirect the inner exit to branch to outer condition.
2398  Instruction *Branch = &Cond->back();
2399  auto *BI = cast<BranchInst>(Branch);
2400  assert(BI->getSuccessor(1) == Exit);
2401  BI->setSuccessor(1, OuterCond);
2402 
2403  // Call the "fini" function if "ordered" is present in wsloop directive.
2404  if (Ordered) {
2405  Builder.SetInsertPoint(&Latch->back());
2406  FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
2407  Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
2408  }
2409 
2410  // Add the barrier if requested.
2411  if (NeedsBarrier) {
2412  Builder.SetInsertPoint(&Exit->back());
2413  createBarrier(LocationDescription(Builder.saveIP(), DL),
2414  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
2415  /* CheckCancelFlag */ false);
2416  }
2417 
2418  CLI->invalidate();
2419  return AfterIP;
2420 }
2421 
2422 /// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
2423 /// after this \p OldTarget will be orphaned.
2424 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
2425  BasicBlock *NewTarget, DebugLoc DL) {
2426  for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
2427  redirectTo(Pred, NewTarget, DL);
2428 }
2429 
2430 /// Determine which blocks in \p BBs are reachable from outside and remove the
2431 /// ones that are not reachable from the function.
2433  SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
2434  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
2435  for (Use &U : BB->uses()) {
2436  auto *UseInst = dyn_cast<Instruction>(U.getUser());
2437  if (!UseInst)
2438  continue;
2439  if (BBsToErase.count(UseInst->getParent()))
2440  continue;
2441  return true;
2442  }
2443  return false;
2444  };
2445 
2446  while (true) {
2447  bool Changed = false;
2448  for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
2449  if (HasRemainingUses(BB)) {
2450  BBsToErase.erase(BB);
2451  Changed = true;
2452  }
2453  }
2454  if (!Changed)
2455  break;
2456  }
2457 
2458  SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
2459  DeleteDeadBlocks(BBVec);
2460 }
2461 
2464  InsertPointTy ComputeIP) {
2465  assert(Loops.size() >= 1 && "At least one loop required");
2466  size_t NumLoops = Loops.size();
2467 
2468  // Nothing to do if there is already just one loop.
2469  if (NumLoops == 1)
2470  return Loops.front();
2471 
2472  CanonicalLoopInfo *Outermost = Loops.front();
2473  CanonicalLoopInfo *Innermost = Loops.back();
2474  BasicBlock *OrigPreheader = Outermost->getPreheader();
2475  BasicBlock *OrigAfter = Outermost->getAfter();
2476  Function *F = OrigPreheader->getParent();
2477 
2478  // Loop control blocks that may become orphaned later.
2479  SmallVector<BasicBlock *, 12> OldControlBBs;
2480  OldControlBBs.reserve(6 * Loops.size());
2481  for (CanonicalLoopInfo *Loop : Loops)
2482  Loop->collectControlBlocks(OldControlBBs);
2483 
2484  // Setup the IRBuilder for inserting the trip count computation.
2485  Builder.SetCurrentDebugLocation(DL);
2486  if (ComputeIP.isSet())
2487  Builder.restoreIP(ComputeIP);
2488  else
2489  Builder.restoreIP(Outermost->getPreheaderIP());
2490 
2491  // Derive the collapsed' loop trip count.
2492  // TODO: Find common/largest indvar type.
2493  Value *CollapsedTripCount = nullptr;
2494  for (CanonicalLoopInfo *L : Loops) {
2495  assert(L->isValid() &&
2496  "All loops to collapse must be valid canonical loops");
2497  Value *OrigTripCount = L->getTripCount();
2498  if (!CollapsedTripCount) {
2499  CollapsedTripCount = OrigTripCount;
2500  continue;
2501  }
2502 
2503  // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
2504  CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
2505  {}, /*HasNUW=*/true);
2506  }
2507 
2508  // Create the collapsed loop control flow.
2509  CanonicalLoopInfo *Result =
2510  createLoopSkeleton(DL, CollapsedTripCount, F,
2511  OrigPreheader->getNextNode(), OrigAfter, "collapsed");
2512 
2513  // Build the collapsed loop body code.
2514  // Start with deriving the input loop induction variables from the collapsed
2515  // one, using a divmod scheme. To preserve the original loops' order, the
2516  // innermost loop use the least significant bits.
2517  Builder.restoreIP(Result->getBodyIP());
2518 
2519  Value *Leftover = Result->getIndVar();
2520  SmallVector<Value *> NewIndVars;
2521  NewIndVars.resize(NumLoops);
2522  for (int i = NumLoops - 1; i >= 1; --i) {
2523  Value *OrigTripCount = Loops[i]->getTripCount();
2524 
2525  Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
2526  NewIndVars[i] = NewIndVar;
2527 
2528  Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
2529  }
2530  // Outermost loop gets all the remaining bits.
2531  NewIndVars[0] = Leftover;
2532 
2533  // Construct the loop body control flow.
2534  // We progressively construct the branch structure following in direction of
2535  // the control flow, from the leading in-between code, the loop nest body, the
2536  // trailing in-between code, and rejoining the collapsed loop's latch.
2537  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
2538  // the ContinueBlock is set, continue with that block. If ContinuePred, use
2539  // its predecessors as sources.
2540  BasicBlock *ContinueBlock = Result->getBody();
2541  BasicBlock *ContinuePred = nullptr;
2542  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
2543  BasicBlock *NextSrc) {
2544  if (ContinueBlock)
2545  redirectTo(ContinueBlock, Dest, DL);
2546  else
2547  redirectAllPredecessorsTo(ContinuePred, Dest, DL);
2548 
2549  ContinueBlock = nullptr;
2550  ContinuePred = NextSrc;
2551  };
2552 
2553  // The code before the nested loop of each level.
2554  // Because we are sinking it into the nest, it will be executed more often
2555  // that the original loop. More sophisticated schemes could keep track of what
2556  // the in-between code is and instantiate it only once per thread.
2557  for (size_t i = 0; i < NumLoops - 1; ++i)
2558  ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
2559 
2560  // Connect the loop nest body.
2561  ContinueWith(Innermost->getBody(), Innermost->getLatch());
2562 
2563  // The code after the nested loop at each level.
2564  for (size_t i = NumLoops - 1; i > 0; --i)
2565  ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
2566 
2567  // Connect the finished loop to the collapsed loop latch.
2568  ContinueWith(Result->getLatch(), nullptr);
2569 
2570  // Replace the input loops with the new collapsed loop.
2571  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
2572  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
2573 
2574  // Replace the input loop indvars with the derived ones.
2575  for (size_t i = 0; i < NumLoops; ++i)
2576  Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
2577 
2578  // Remove unused parts of the input loops.
2579  removeUnusedBlocksFromParent(OldControlBBs);
2580 
2581  for (CanonicalLoopInfo *L : Loops)
2582  L->invalidate();
2583 
2584 #ifndef NDEBUG
2585  Result->assertOK();
2586 #endif
2587  return Result;
2588 }
2589 
2590 std::vector<CanonicalLoopInfo *>
2592  ArrayRef<Value *> TileSizes) {
2593  assert(TileSizes.size() == Loops.size() &&
2594  "Must pass as many tile sizes as there are loops");
2595  int NumLoops = Loops.size();
2596  assert(NumLoops >= 1 && "At least one loop to tile required");
2597 
2598  CanonicalLoopInfo *OutermostLoop = Loops.front();
2599  CanonicalLoopInfo *InnermostLoop = Loops.back();
2600  Function *F = OutermostLoop->getBody()->getParent();
2601  BasicBlock *InnerEnter = InnermostLoop->getBody();
2602  BasicBlock *InnerLatch = InnermostLoop->getLatch();
2603 
2604  // Loop control blocks that may become orphaned later.
2605  SmallVector<BasicBlock *, 12> OldControlBBs;
2606  OldControlBBs.reserve(6 * Loops.size());
2607  for (CanonicalLoopInfo *Loop : Loops)
2608  Loop->collectControlBlocks(OldControlBBs);
2609 
2610  // Collect original trip counts and induction variable to be accessible by
2611  // index. Also, the structure of the original loops is not preserved during
2612  // the construction of the tiled loops, so do it before we scavenge the BBs of
2613  // any original CanonicalLoopInfo.
2614  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
2615  for (CanonicalLoopInfo *L : Loops) {
2616  assert(L->isValid() && "All input loops must be valid canonical loops");
2617  OrigTripCounts.push_back(L->getTripCount());
2618  OrigIndVars.push_back(L->getIndVar());
2619  }
2620 
2621  // Collect the code between loop headers. These may contain SSA definitions
2622  // that are used in the loop nest body. To be usable with in the innermost
2623  // body, these BasicBlocks will be sunk into the loop nest body. That is,
2624  // these instructions may be executed more often than before the tiling.
2625  // TODO: It would be sufficient to only sink them into body of the
2626  // corresponding tile loop.
2628  for (int i = 0; i < NumLoops - 1; ++i) {
2629  CanonicalLoopInfo *Surrounding = Loops[i];
2630  CanonicalLoopInfo *Nested = Loops[i + 1];
2631 
2632  BasicBlock *EnterBB = Surrounding->getBody();
2633  BasicBlock *ExitBB = Nested->getHeader();
2634  InbetweenCode.emplace_back(EnterBB, ExitBB);
2635  }
2636 
2637  // Compute the trip counts of the floor loops.
2638  Builder.SetCurrentDebugLocation(DL);
2639  Builder.restoreIP(OutermostLoop->getPreheaderIP());
2640  SmallVector<Value *, 4> FloorCount, FloorRems;
2641  for (int i = 0; i < NumLoops; ++i) {
2642  Value *TileSize = TileSizes[i];
2643  Value *OrigTripCount = OrigTripCounts[i];
2644  Type *IVType = OrigTripCount->getType();
2645 
2646  Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
2647  Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
2648 
2649  // 0 if tripcount divides the tilesize, 1 otherwise.
2650  // 1 means we need an additional iteration for a partial tile.
2651  //
2652  // Unfortunately we cannot just use the roundup-formula
2653  // (tripcount + tilesize - 1)/tilesize
2654  // because the summation might overflow. We do not want introduce undefined
2655  // behavior when the untiled loop nest did not.
2656  Value *FloorTripOverflow =
2657  Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
2658 
2659  FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
2660  FloorTripCount =
2661  Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
2662  "omp_floor" + Twine(i) + ".tripcount", true);
2663 
2664  // Remember some values for later use.
2665  FloorCount.push_back(FloorTripCount);
2666  FloorRems.push_back(FloorTripRem);
2667  }
2668 
2669  // Generate the new loop nest, from the outermost to the innermost.
2670  std::vector<CanonicalLoopInfo *> Result;
2671  Result.reserve(NumLoops * 2);
2672 
2673  // The basic block of the surrounding loop that enters the nest generated
2674  // loop.
2675  BasicBlock *Enter = OutermostLoop->getPreheader();
2676 
2677  // The basic block of the surrounding loop where the inner code should
2678  // continue.
2679  BasicBlock *Continue = OutermostLoop->getAfter();
2680 
2681  // Where the next loop basic block should be inserted.
2682  BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
2683 
2684  auto EmbeddNewLoop =
2685  [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
2686  Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
2687  CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
2688  DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
2689  redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
2690  redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
2691 
2692  // Setup the position where the next embedded loop connects to this loop.
2693  Enter = EmbeddedLoop->getBody();
2694  Continue = EmbeddedLoop->getLatch();
2695  OutroInsertBefore = EmbeddedLoop->getLatch();
2696  return EmbeddedLoop;
2697  };
2698 
2699  auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
2700  const Twine &NameBase) {
2701  for (auto P : enumerate(TripCounts)) {
2702  CanonicalLoopInfo *EmbeddedLoop =
2703  EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
2704  Result.push_back(EmbeddedLoop);
2705  }
2706  };
2707 
2708  EmbeddNewLoops(FloorCount, "floor");
2709 
2710  // Within the innermost floor loop, emit the code that computes the tile
2711  // sizes.
2712  Builder.SetInsertPoint(Enter->getTerminator());
2713  SmallVector<Value *, 4> TileCounts;
2714  for (int i = 0; i < NumLoops; ++i) {
2715  CanonicalLoopInfo *FloorLoop = Result[i];
2716  Value *TileSize = TileSizes[i];
2717 
2718  Value *FloorIsEpilogue =
2719  Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
2720  Value *TileTripCount =
2721  Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
2722 
2723  TileCounts.push_back(TileTripCount);
2724  }
2725 
2726  // Create the tile loops.
2727  EmbeddNewLoops(TileCounts, "tile");
2728 
2729  // Insert the inbetween code into the body.
2730  BasicBlock *BodyEnter = Enter;
2731  BasicBlock *BodyEntered = nullptr;
2732  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
2733  BasicBlock *EnterBB = P.first;
2734  BasicBlock *ExitBB = P.second;
2735 
2736  if (BodyEnter)
2737  redirectTo(BodyEnter, EnterBB, DL);
2738  else
2739  redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
2740 
2741  BodyEnter = nullptr;
2742  BodyEntered = ExitBB;
2743  }
2744 
2745  // Append the original loop nest body into the generated loop nest body.
2746  if (BodyEnter)
2747  redirectTo(BodyEnter, InnerEnter, DL);
2748  else
2749  redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
2750  redirectAllPredecessorsTo(InnerLatch, Continue, DL);
2751 
2752  // Replace the original induction variable with an induction variable computed
2753  // from the tile and floor induction variables.
2754  Builder.restoreIP(Result.back()->getBodyIP());
2755  for (int i = 0; i < NumLoops; ++i) {
2756  CanonicalLoopInfo *FloorLoop = Result[i];
2757  CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
2758  Value *OrigIndVar = OrigIndVars[i];
2759  Value *Size = TileSizes[i];
2760 
2761  Value *Scale =
2762  Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
2763  Value *Shift =
2764  Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
2765  OrigIndVar->replaceAllUsesWith(Shift);
2766  }
2767 
2768  // Remove unused parts of the original loops.
2769  removeUnusedBlocksFromParent(OldControlBBs);
2770 
2771  for (CanonicalLoopInfo *L : Loops)
2772  L->invalidate();
2773 
2774 #ifndef NDEBUG
2775  for (CanonicalLoopInfo *GenL : Result)
2776  GenL->assertOK();
2777 #endif
2778  return Result;
2779 }
2780 
2781 /// Attach loop metadata \p Properties to the loop described by \p Loop. If the
2782 /// loop already has metadata, the loop properties are appended.
2784  ArrayRef<Metadata *> Properties) {
2785  assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
2786 
2787  // Nothing to do if no property to attach.
2788  if (Properties.empty())
2789  return;
2790 
2791  LLVMContext &Ctx = Loop->getFunction()->getContext();
2792  SmallVector<Metadata *> NewLoopProperties;
2793  NewLoopProperties.push_back(nullptr);
2794 
2795  // If the loop already has metadata, prepend it to the new metadata.
2796  BasicBlock *Latch = Loop->getLatch();
2797  assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
2798  MDNode *Existing = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop);
2799  if (Existing)
2800  append_range(NewLoopProperties, drop_begin(Existing->operands(), 1));
2801 
2802  append_range(NewLoopProperties, Properties);
2803  MDNode *LoopID = MDNode::getDistinct(Ctx, NewLoopProperties);
2804  LoopID->replaceOperandWith(0, LoopID);
2805 
2806  Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
2807 }
2808 
2809 /// Attach llvm.access.group metadata to the memref instructions of \p Block
2810 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
2811  LoopInfo &LI) {
2812  for (Instruction &I : *Block) {
2813  if (I.mayReadOrWriteMemory()) {
2814  // TODO: This instruction may already have access group from
2815  // other pragmas e.g. #pragma clang loop vectorize. Append
2816  // so that the existing metadata is not overwritten.
2817  I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
2818  }
2819  }
2820 }
2821 
2823  LLVMContext &Ctx = Builder.getContext();
2825  Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
2826  MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
2827 }
2828 
2830  LLVMContext &Ctx = Builder.getContext();
2832  Loop, {
2833  MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
2834  });
2835 }
2836 
2838  LLVMContext &Ctx = Builder.getContext();
2839 
2840  Function *F = CanonicalLoop->getFunction();
2841 
2843  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
2844  FAM.registerPass([]() { return LoopAnalysis(); });
2845  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
2846 
2847  LoopAnalysis LIA;
2848  LoopInfo &&LI = LIA.run(*F, FAM);
2849 
2850  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
2851 
2852  SmallSet<BasicBlock *, 8> Reachable;
2853 
2854  // Get the basic blocks from the loop in which memref instructions
2855  // can be found.
2856  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
2857  // preferably without running any passes.
2858  for (BasicBlock *Block : L->getBlocks()) {
2859  if (Block == CanonicalLoop->getCond() ||
2860  Block == CanonicalLoop->getHeader())
2861  continue;
2862  Reachable.insert(Block);
2863  }
2864 
2865  // Add access group metadata to memory-access instructions.
2866  MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
2867  for (BasicBlock *BB : Reachable)
2868  addSimdMetadata(BB, AccessGroup, LI);
2869 
2870  // Use the above access group metadata to create loop level
2871  // metadata, which should be distinct for each loop.
2872  ConstantAsMetadata *BoolConst =
2874  // TODO: If the loop has existing parallel access metadata, have
2875  // to combine two lists.
2877  CanonicalLoop,
2878  {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"),
2879  AccessGroup}),
2880  MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
2881  BoolConst})});
2882 }
2883 
2884 /// Create the TargetMachine object to query the backend for optimization
2885 /// preferences.
2886 ///
2887 /// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
2888 /// e.g. Clang does not pass it to its CodeGen layer and creates it only when
2889 /// needed for the LLVM pass pipline. We use some default options to avoid
2890 /// having to pass too many settings from the frontend that probably do not
2891 /// matter.
2892 ///
2893 /// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
2894 /// method. If we are going to use TargetMachine for more purposes, especially
2895 /// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
2896 /// might become be worth requiring front-ends to pass on their TargetMachine,
2897 /// or at least cache it between methods. Note that while fontends such as Clang
2898 /// have just a single main TargetMachine per translation unit, "target-cpu" and
2899 /// "target-features" that determine the TargetMachine are per-function and can
2900 /// be overrided using __attribute__((target("OPTIONS"))).
2901 static std::unique_ptr<TargetMachine>
2903  Module *M = F->getParent();
2904 
2905  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
2906  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
2907  const std::string &Triple = M->getTargetTriple();
2908 
2909  std::string Error;
2911  if (!TheTarget)
2912  return {};
2913 
2915  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
2916  Triple, CPU, Features, Options, /*RelocModel=*/None, /*CodeModel=*/None,
2917  OptLevel));
2918 }
2919 
2920 /// Heuristically determine the best-performant unroll factor for \p CLI. This
2921 /// depends on the target processor. We are re-using the same heuristics as the
2922 /// LoopUnrollPass.
2924  Function *F = CLI->getFunction();
2925 
2926  // Assume the user requests the most aggressive unrolling, even if the rest of
2927  // the code is optimized using a lower setting.
2929  std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
2930 
2932  FAM.registerPass([]() { return TargetLibraryAnalysis(); });
2933  FAM.registerPass([]() { return AssumptionAnalysis(); });
2934  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
2935  FAM.registerPass([]() { return LoopAnalysis(); });
2936  FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
2937  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
2938  TargetIRAnalysis TIRA;
2939  if (TM)
2940  TIRA = TargetIRAnalysis(
2941  [&](const Function &F) { return TM->getTargetTransformInfo(F); });
2942  FAM.registerPass([&]() { return TIRA; });
2943 
2944  TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
2946  ScalarEvolution &&SE = SEA.run(*F, FAM);
2948  DominatorTree &&DT = DTA.run(*F, FAM);
2949  LoopAnalysis LIA;
2950  LoopInfo &&LI = LIA.run(*F, FAM);
2951  AssumptionAnalysis ACT;
2952  AssumptionCache &&AC = ACT.run(*F, FAM);
2954 
2955  Loop *L = LI.getLoopFor(CLI->getHeader());
2956  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
2957 
2960  /*BlockFrequencyInfo=*/nullptr,
2961  /*ProfileSummaryInfo=*/nullptr, ORE, OptLevel,
2962  /*UserThreshold=*/None,
2963  /*UserCount=*/None,
2964  /*UserAllowPartial=*/true,
2965  /*UserAllowRuntime=*/true,
2966  /*UserUpperBound=*/None,
2967  /*UserFullUnrollMaxCount=*/None);
2968 
2969  UP.Force = true;
2970 
2971  // Account for additional optimizations taking place before the LoopUnrollPass
2972  // would unroll the loop.
2975 
2976  // Use normal unroll factors even if the rest of the code is optimized for
2977  // size.
2978  UP.OptSizeThreshold = UP.Threshold;
2980 
2981  LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
2982  << " Threshold=" << UP.Threshold << "\n"
2983  << " PartialThreshold=" << UP.PartialThreshold << "\n"
2984  << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
2985  << " PartialOptSizeThreshold="
2986  << UP.PartialOptSizeThreshold << "\n");
2987 
2988  // Disable peeling.
2991  /*UserAllowPeeling=*/false,
2992  /*UserAllowProfileBasedPeeling=*/false,
2993  /*UnrollingSpecficValues=*/false);
2994 
2996  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
2997 
2998  // Assume that reads and writes to stack variables can be eliminated by
2999  // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
3000  // size.
3001  for (BasicBlock *BB : L->blocks()) {
3002  for (Instruction &I : *BB) {
3003  Value *Ptr;
3004  if (auto *Load = dyn_cast<LoadInst>(&I)) {
3005  Ptr = Load->getPointerOperand();
3006  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3007  Ptr = Store->getPointerOperand();
3008  } else
3009  continue;
3010 
3011  Ptr = Ptr->stripPointerCasts();
3012 
3013  if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
3014  if (Alloca->getParent() == &F->getEntryBlock())
3015  EphValues.insert(&I);
3016  }
3017  }
3018  }
3019 
3020  unsigned NumInlineCandidates;
3021  bool NotDuplicatable;
3022  bool Convergent;
3023  InstructionCost LoopSizeIC =
3024  ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
3025  TTI, EphValues, UP.BEInsns);
3026  LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSizeIC << "\n");
3027 
3028  // Loop is not unrollable if the loop contains certain instructions.
3029  if (NotDuplicatable || Convergent || !LoopSizeIC.isValid()) {
3030  LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
3031  return 1;
3032  }
3033  unsigned LoopSize = *LoopSizeIC.getValue();
3034 
3035  // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
3036  // be able to use it.
3037  int TripCount = 0;
3038  int MaxTripCount = 0;
3039  bool MaxOrZero = false;
3040  unsigned TripMultiple = 0;
3041 
3042  bool UseUpperBound = false;
3043  computeUnrollCount(L, TTI, DT, &LI, SE, EphValues, &ORE, TripCount,
3044  MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, PP,
3045  UseUpperBound);
3046  unsigned Factor = UP.Count;
3047  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
3048 
3049  // This function returns 1 to signal to not unroll a loop.
3050  if (Factor == 0)
3051  return 1;
3052  return Factor;
3053 }
3054 
3056  int32_t Factor,
3057  CanonicalLoopInfo **UnrolledCLI) {
3058  assert(Factor >= 0 && "Unroll factor must not be negative");
3059 
3060  Function *F = Loop->getFunction();
3061  LLVMContext &Ctx = F->getContext();
3062 
3063  // If the unrolled loop is not used for another loop-associated directive, it
3064  // is sufficient to add metadata for the LoopUnrollPass.
3065  if (!UnrolledCLI) {
3066  SmallVector<Metadata *, 2> LoopMetadata;
3067  LoopMetadata.push_back(
3068  MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
3069 
3070  if (Factor >= 1) {
3072  ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
3073  LoopMetadata.push_back(MDNode::get(
3074  Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
3075  }
3076 
3077  addLoopMetadata(Loop, LoopMetadata);
3078  return;
3079  }
3080 
3081  // Heuristically determine the unroll factor.
3082  if (Factor == 0)
3084 
3085  // No change required with unroll factor 1.
3086  if (Factor == 1) {
3087  *UnrolledCLI = Loop;
3088  return;
3089  }
3090 
3091  assert(Factor >= 2 &&
3092  "unrolling only makes sense with a factor of 2 or larger");
3093 
3094  Type *IndVarTy = Loop->getIndVarType();
3095 
3096  // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
3097  // unroll the inner loop.
3098  Value *FactorVal =
3099  ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
3100  /*isSigned=*/false));
3101  std::vector<CanonicalLoopInfo *> LoopNest =
3102  tileLoops(DL, {Loop}, {FactorVal});
3103  assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
3104  *UnrolledCLI = LoopNest[0];
3105  CanonicalLoopInfo *InnerLoop = LoopNest[1];
3106 
3107  // LoopUnrollPass can only fully unroll loops with constant trip count.
3108  // Unroll by the unroll factor with a fallback epilog for the remainder
3109  // iterations if necessary.
3111  ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
3113  InnerLoop,
3114  {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3115  MDNode::get(
3116  Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
3117 
3118 #ifndef NDEBUG
3119  (*UnrolledCLI)->assertOK();
3120 #endif
3121 }
3122 
3125  llvm::Value *BufSize, llvm::Value *CpyBuf,
3126  llvm::Value *CpyFn, llvm::Value *DidIt) {
3127  if (!updateToLocation(Loc))
3128  return Loc.IP;
3129 
3130  uint32_t SrcLocStrSize;
3131  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3132  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3133  Value *ThreadId = getOrCreateThreadID(Ident);
3134 
3135  llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
3136 
3137  Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
3138 
3139  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
3140  Builder.CreateCall(Fn, Args);
3141 
3142  return Builder.saveIP();
3143 }
3144 
3146  const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
3147  FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {
3148 
3149  if (!updateToLocation(Loc))
3150  return Loc.IP;
3151 
3152  // If needed (i.e. not null), initialize `DidIt` with 0
3153  if (DidIt) {
3154  Builder.CreateStore(Builder.getInt32(0), DidIt);
3155  }
3156 
3157  Directive OMPD = Directive::OMPD_single;
3158  uint32_t SrcLocStrSize;
3159  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3160  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3161  Value *ThreadId = getOrCreateThreadID(Ident);
3162  Value *Args[] = {Ident, ThreadId};
3163 
3164  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
3165  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3166 
3167  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
3168  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3169 
3170  // generates the following:
3171  // if (__kmpc_single()) {
3172  // .... single region ...
3173  // __kmpc_end_single
3174  // }
3175  // __kmpc_barrier
3176 
3177  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3178  /*Conditional*/ true,
3179  /*hasFinalize*/ true);
3180  if (!IsNowait)
3181  createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3182  omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
3183  /* CheckCancelFlag */ false);
3184  return Builder.saveIP();
3185 }
3186 
3188  const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
3189  FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
3190 
3191  if (!updateToLocation(Loc))
3192  return Loc.IP;
3193 
3194  Directive OMPD = Directive::OMPD_critical;
3195  uint32_t SrcLocStrSize;
3196  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3197  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3198  Value *ThreadId = getOrCreateThreadID(Ident);
3199  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
3200  Value *Args[] = {Ident, ThreadId, LockVar};
3201 
3203  Function *RTFn = nullptr;
3204  if (HintInst) {
3205  // Add Hint to entry Args and create call
3206  EnterArgs.push_back(HintInst);
3207  RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
3208  } else {
3209  RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
3210  }
3211  Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
3212 
3213  Function *ExitRTLFn =
3214  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
3215  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3216 
3217  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3218  /*Conditional*/ false, /*hasFinalize*/ true);
3219 }
3220 
3223  InsertPointTy AllocaIP, unsigned NumLoops,
3224  ArrayRef<llvm::Value *> StoreValues,
3225  const Twine &Name, bool IsDependSource) {
3226  for (size_t I = 0; I < StoreValues.size(); I++)
3227  assert(StoreValues[I]->getType()->isIntegerTy(64) &&
3228  "OpenMP runtime requires depend vec with i64 type");
3229 
3230  if (!updateToLocation(Loc))
3231  return Loc.IP;
3232 
3233  // Allocate space for vector and generate alloc instruction.
3234  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
3235  Builder.restoreIP(AllocaIP);
3236  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
3237  ArgsBase->setAlignment(Align(8));
3238  Builder.restoreIP(Loc.IP);
3239 
3240  // Store the index value with offset in depend vector.
3241  for (unsigned I = 0; I < NumLoops; ++I) {
3242  Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
3243  ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
3244  StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
3245  STInst->setAlignment(Align(8));
3246  }
3247 
3248  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
3249  ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
3250 
3251  uint32_t SrcLocStrSize;
3252  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3253  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3254  Value *ThreadId = getOrCreateThreadID(Ident);
3255  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
3256 
3257  Function *RTLFn = nullptr;
3258  if (IsDependSource)
3259  RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
3260  else
3261  RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
3262  Builder.CreateCall(RTLFn, Args);
3263 
3264  return Builder.saveIP();
3265 }
3266 
3268  const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
3269  FinalizeCallbackTy FiniCB, bool IsThreads) {
3270  if (!updateToLocation(Loc))
3271  return Loc.IP;
3272 
3273  Directive OMPD = Directive::OMPD_ordered;
3274  Instruction *EntryCall = nullptr;
3275  Instruction *ExitCall = nullptr;
3276 
3277  if (IsThreads) {
3278  uint32_t SrcLocStrSize;
3279  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3280  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3281  Value *ThreadId = getOrCreateThreadID(Ident);
3282  Value *Args[] = {Ident, ThreadId};
3283 
3284  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
3285  EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3286 
3287  Function *ExitRTLFn =
3288  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
3289  ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3290  }
3291 
3292  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3293  /*Conditional*/ false, /*hasFinalize*/ true);
3294 }
3295 
3296 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
3297  Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
3298  BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
3299  bool HasFinalize, bool IsCancellable) {
3300 
3301  if (HasFinalize)
3302  FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
3303 
3304  // Create inlined region's entry and body blocks, in preparation
3305  // for conditional creation
3306  BasicBlock *EntryBB = Builder.GetInsertBlock();
3307  Instruction *SplitPos = EntryBB->getTerminator();
3308  if (!isa_and_nonnull<BranchInst>(SplitPos))
3309  SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
3310  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
3311  BasicBlock *FiniBB =
3312  EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
3313 
3314  Builder.SetInsertPoint(EntryBB->getTerminator());
3315  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
3316 
3317  // generate body
3318  BodyGenCB(/* AllocaIP */ InsertPointTy(),
3319  /* CodeGenIP */ Builder.saveIP());
3320 
3321  // emit exit call and do any needed finalization.
3322  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
3323  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
3324  FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
3325  "Unexpected control flow graph state!!");
3326  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
3327  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
3328  "Unexpected Control Flow State!");
3329  MergeBlockIntoPredecessor(FiniBB);
3330 
3331  // If we are skipping the region of a non conditional, remove the exit
3332  // block, and clear the builder's insertion point.
3333  assert(SplitPos->getParent() == ExitBB &&
3334  "Unexpected Insertion point location!");
3335  auto merged = MergeBlockIntoPredecessor(ExitBB);
3336  BasicBlock *ExitPredBB = SplitPos->getParent();
3337  auto InsertBB = merged ? ExitPredBB : ExitBB;
3338  if (!isa_and_nonnull<BranchInst>(SplitPos))
3339  SplitPos->eraseFromParent();
3340  Builder.SetInsertPoint(InsertBB);
3341 
3342  return Builder.saveIP();
3343 }
3344 
3345 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
3346  Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
3347  // if nothing to do, Return current insertion point.
3348  if (!Conditional || !EntryCall)
3349  return Builder.saveIP();
3350 
3351  BasicBlock *EntryBB = Builder.GetInsertBlock();
3352  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
3353  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
3354  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
3355 
3356  // Emit thenBB and set the Builder's insertion point there for
3357  // body generation next. Place the block after the current block.
3358  Function *CurFn = EntryBB->getParent();
3359  CurFn->getBasicBlockList().insertAfter(EntryBB->getIterator(), ThenBB);
3360 
3361  // Move Entry branch to end of ThenBB, and replace with conditional
3362  // branch (If-stmt)
3363  Instruction *EntryBBTI = EntryBB->getTerminator();
3364  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
3365  EntryBBTI->removeFromParent();
3366  Builder.SetInsertPoint(UI);
3367  Builder.Insert(EntryBBTI);
3368  UI->eraseFromParent();
3369  Builder.SetInsertPoint(ThenBB->getTerminator());
3370 
3371  // return an insertion point to ExitBB.
3372  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
3373 }
3374 
3375 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
3376  omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
3377  bool HasFinalize) {
3378 
3379  Builder.restoreIP(FinIP);
3380 
3381  // If there is finalization to do, emit it before the exit call
3382  if (HasFinalize) {
3383  assert(!FinalizationStack.empty() &&
3384  "Unexpected finalization stack state!");
3385 
3386  FinalizationInfo Fi = FinalizationStack.pop_back_val();
3387  assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
3388 
3389  Fi.FiniCB(FinIP);
3390 
3391  BasicBlock *FiniBB = FinIP.getBlock();
3392  Instruction *FiniBBTI = FiniBB->getTerminator();
3393 
3394  // set Builder IP for call creation
3395  Builder.SetInsertPoint(FiniBBTI);
3396  }
3397 
3398  if (!ExitCall)
3399  return Builder.saveIP();
3400 
3401  // place the Exitcall as last instruction before Finalization block terminator
3402  ExitCall->removeFromParent();
3403  Builder.Insert(ExitCall);
3404 
3405  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
3406  ExitCall->getIterator());
3407 }
3408 
3410  InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
3411  llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
3412  if (!IP.isSet())
3413  return IP;
3414 
3416 
3417  // creates the following CFG structure
3418  // OMP_Entry : (MasterAddr != PrivateAddr)?
3419  // F T
3420  // | \
3421  // | copin.not.master
3422  // | /
3423  // v /
3424  // copyin.not.master.end
3425  // |
3426  // v
3427  // OMP.Entry.Next
3428 
3429  BasicBlock *OMP_Entry = IP.getBlock();
3430  Function *CurFn = OMP_Entry->getParent();
3431  BasicBlock *CopyBegin =
3432  BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
3433  BasicBlock *CopyEnd = nullptr;
3434 
3435  // If entry block is terminated, split to preserve the branch to following
3436  // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
3437  if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
3438  CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
3439  "copyin.not.master.end");
3440  OMP_Entry->getTerminator()->eraseFromParent();
3441  } else {
3442  CopyEnd =
3443  BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
3444  }
3445 
3446  Builder.SetInsertPoint(OMP_Entry);
3447  Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
3448  Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
3449  Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
3450  Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
3451 
3452  Builder.SetInsertPoint(CopyBegin);
3453  if (BranchtoEnd)
3454  Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
3455 
3456  return Builder.saveIP();
3457 }
3458 
3460  Value *Size, Value *Allocator,
3461  std::string Name) {
3463  Builder.restoreIP(Loc.IP);
3464 
3465  uint32_t SrcLocStrSize;
3466  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3467  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3468  Value *ThreadId = getOrCreateThreadID(Ident);
3469  Value *Args[] = {ThreadId, Size, Allocator};
3470 
3471  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
3472 
3473  return Builder.CreateCall(Fn, Args, Name);
3474 }
3475 
3478  std::string Name) {
3480  Builder.restoreIP(Loc.IP);
3481 
3482  uint32_t SrcLocStrSize;
3483  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3484  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3485  Value *ThreadId = getOrCreateThreadID(Ident);
3486  Value *Args[] = {ThreadId, Addr, Allocator};
3487  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
3488  return Builder.CreateCall(Fn, Args, Name);
3489 }
3490 
3492  const LocationDescription &Loc, Value *InteropVar,
3493  omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
3494  Value *DependenceAddress, bool HaveNowaitClause) {
3496  Builder.restoreIP(Loc.IP);
3497 
3498  uint32_t SrcLocStrSize;
3499  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3500  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3501  Value *ThreadId = getOrCreateThreadID(Ident);
3502  if (Device == nullptr)
3503  Device = ConstantInt::get(Int32, -1);
3504  Constant *InteropTypeVal = ConstantInt::get(Int64, (int)InteropType);
3505  if (NumDependences == nullptr) {
3506  NumDependences = ConstantInt::get(Int32, 0);
3507  PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
3508  DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
3509  }
3510  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
3511  Value *Args[] = {
3512  Ident, ThreadId, InteropVar, InteropTypeVal,
3513  Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
3514 
3515  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
3516 
3517  return Builder.CreateCall(Fn, Args);
3518 }
3519 
3521  const LocationDescription &Loc, Value *InteropVar, Value *Device,
3522  Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
3524  Builder.restoreIP(Loc.IP);
3525 
3526  uint32_t SrcLocStrSize;
3527  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3528  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3529  Value *ThreadId = getOrCreateThreadID(Ident);
3530  if (Device == nullptr)
3531  Device = ConstantInt::get(Int32, -1);
3532  if (NumDependences == nullptr) {
3533  NumDependences = ConstantInt::get(Int32, 0);
3534  PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
3535  DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
3536  }
3537  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
3538  Value *Args[] = {
3539  Ident, ThreadId, InteropVar, Device,
3540  NumDependences, DependenceAddress, HaveNowaitClauseVal};
3541 
3542  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
3543 
3544  return Builder.CreateCall(Fn, Args);
3545 }
3546 
3548  Value *InteropVar, Value *Device,
3549  Value *NumDependences,
3550  Value *DependenceAddress,
3551  bool HaveNowaitClause) {
3553  Builder.restoreIP(Loc.IP);
3554  uint32_t SrcLocStrSize;
3555  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3556  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3557  Value *ThreadId = getOrCreateThreadID(Ident);
3558  if (Device == nullptr)
3559  Device = ConstantInt::get(Int32, -1);
3560  if (NumDependences == nullptr) {
3561  NumDependences = ConstantInt::get(Int32, 0);
3562  PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
3563  DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
3564  }
3565  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
3566  Value *Args[] = {
3567  Ident, ThreadId, InteropVar, Device,
3568  NumDependences, DependenceAddress, HaveNowaitClauseVal};
3569 
3570  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
3571 
3572  return Builder.CreateCall(Fn, Args);
3573 }
3574 
3576  const LocationDescription &Loc, llvm::Value *Pointer,
3577  llvm::ConstantInt *Size, const llvm::Twine &Name) {
3579  Builder.restoreIP(Loc.IP);
3580 
3581  uint32_t SrcLocStrSize;
3582  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3583  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3584  Value *ThreadId = getOrCreateThreadID(Ident);
3585  Constant *ThreadPrivateCache =
3586  getOrCreateOMPInternalVariable(Int8PtrPtr, Name);
3587  llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
3588 
3589  Function *Fn =
3590  getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
3591 
3592  return Builder.CreateCall(Fn, Args);
3593 }
3594 
3597  bool RequiresFullRuntime) {
3598  if (!updateToLocation(Loc))
3599  return Loc.IP;
3600 
3601  uint32_t SrcLocStrSize;
3602  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3603  Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3604  ConstantInt *IsSPMDVal = ConstantInt::getSigned(
3605  IntegerType::getInt8Ty(Int8->getContext()),
3607  ConstantInt *UseGenericStateMachine =
3608  ConstantInt::getBool(Int32->getContext(), !IsSPMD);
3609  ConstantInt *RequiresFullRuntimeVal =
3610  ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
3611 
3612  Function *Fn = getOrCreateRuntimeFunctionPtr(
3613  omp::RuntimeFunction::OMPRTL___kmpc_target_init);
3614 
3615  CallInst *ThreadKind = Builder.CreateCall(
3616  Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal});
3617 
3618  Value *ExecUserCode = Builder.CreateICmpEQ(
3619  ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
3620  "exec_user_code");
3621 
3622  // ThreadKind = __kmpc_target_init(...)
3623  // if (ThreadKind == -1)
3624  // user_code
3625  // else
3626  // return;
3627 
3628  auto *UI = Builder.CreateUnreachable();
3629  BasicBlock *CheckBB = UI->getParent();
3630  BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
3631 
3632  BasicBlock *WorkerExitBB = BasicBlock::Create(
3633  CheckBB->getContext(), "worker.exit", CheckBB->getParent());
3634  Builder.SetInsertPoint(WorkerExitBB);
3635  Builder.CreateRetVoid();
3636 
3637  auto *CheckBBTI = CheckBB->getTerminator();
3638  Builder.SetInsertPoint(CheckBBTI);
3639  Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
3640 
3641  CheckBBTI->eraseFromParent();
3642  UI->eraseFromParent();
3643 
3644  // Continue in the "user_code" block, see diagram above and in
3645  // openmp/libomptarget/deviceRTLs/common/include/target.h .
3646  return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
3647 }
3648 
3650  bool IsSPMD,
3651  bool RequiresFullRuntime) {
3652  if (!updateToLocation(Loc))
3653  return;
3654 
3655  uint32_t SrcLocStrSize;
3656  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3657  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3658  ConstantInt *IsSPMDVal = ConstantInt::getSigned(
3659  IntegerType::getInt8Ty(Int8->getContext()),
3661  ConstantInt *RequiresFullRuntimeVal =
3662  ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
3663 
3664  Function *Fn = getOrCreateRuntimeFunctionPtr(
3665  omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
3666 
3667  Builder.CreateCall(Fn, {Ident, IsSPMDVal, RequiresFullRuntimeVal});
3668 }
3669 
3670 std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
3671  StringRef FirstSeparator,
3672  StringRef Separator) {
3673  SmallString<128> Buffer;
3674  llvm::raw_svector_ostream OS(Buffer);
3675  StringRef Sep = FirstSeparator;
3676  for (StringRef Part : Parts) {
3677  OS << Sep << Part;
3678  Sep = Separator;
3679  }
3680  return OS.str().str();
3681 }
3682 
3683 Constant *OpenMPIRBuilder::getOrCreateOMPInternalVariable(
3684  llvm::Type *Ty, const llvm::Twine &Name, unsigned AddressSpace) {
3685  // TODO: Replace the twine arg with stringref to get rid of the conversion
3686  // logic. However This is taken from current implementation in clang as is.
3687  // Since this method is used in many places exclusively for OMP internal use
3688  // we will keep it as is for temporarily until we move all users to the
3689  // builder and then, if possible, fix it everywhere in one go.
3690  SmallString<256> Buffer;
3691  llvm::raw_svector_ostream Out(Buffer);
3692  Out << Name;
3693  StringRef RuntimeName = Out.str();
3694  auto &Elem = *InternalVars.try_emplace(RuntimeName, nullptr).first;
3695  if (Elem.second) {
3696  assert(cast<PointerType>(Elem.second->getType())
3697  ->isOpaqueOrPointeeTypeMatches(Ty) &&
3698  "OMP internal variable has different type than requested");
3699  } else {
3700  // TODO: investigate the appropriate linkage type used for the global
3701  // variable for possibly changing that to internal or private, or maybe
3702  // create different versions of the function for different OMP internal
3703  // variables.
3704  Elem.second = new llvm::GlobalVariable(
3705  M, Ty, /*IsConstant*/ false, llvm::GlobalValue::CommonLinkage,
3706  llvm::Constant::getNullValue(Ty), Elem.first(),
3707  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3708  AddressSpace);
3709  }
3710 
3711  return Elem.second;
3712 }
3713 
3714 Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
3715  std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
3716  std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
3717  return getOrCreateOMPInternalVariable(KmpCriticalNameTy, Name);
3718 }
3719 
3722  std::string VarName) {
3723  llvm::Constant *MaptypesArrayInit =
3724  llvm::ConstantDataArray::get(M.getContext(), Mappings);
3725  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
3726  M, MaptypesArrayInit->getType(),
3727  /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
3728  VarName);
3729  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
3730  return MaptypesArrayGlobal;
3731 }
3732 
3734  InsertPointTy AllocaIP,
3735  unsigned NumOperands,
3736  struct MapperAllocas &MapperAllocas) {
3737  if (!updateToLocation(Loc))
3738  return;
3739 
3740  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
3741  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
3742  Builder.restoreIP(AllocaIP);
3743  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI8PtrTy);
3744  AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy);
3745  AllocaInst *ArgSizes = Builder.CreateAlloca(ArrI64Ty);
3746  Builder.restoreIP(Loc.IP);
3747  MapperAllocas.ArgsBase = ArgsBase;
3749  MapperAllocas.ArgSizes = ArgSizes;
3750 }
3751 
3753  Function *MapperFunc, Value *SrcLocInfo,
3754  Value *MaptypesArg, Value *MapnamesArg,
3755  struct MapperAllocas &MapperAllocas,
3756  int64_t DeviceID, unsigned NumOperands) {
3757  if (!updateToLocation(Loc))
3758  return;
3759 
3760  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
3761  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
3762  Value *ArgsBaseGEP =
3763  Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
3764  {Builder.getInt32(0), Builder.getInt32(0)});
3765  Value *ArgsGEP =
3766  Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
3767  {Builder.getInt32(0), Builder.getInt32(0)});
3768  Value *ArgSizesGEP =
3769  Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
3770  {Builder.getInt32(0), Builder.getInt32(0)});
3771  Value *NullPtr = Constant::getNullValue(Int8Ptr->getPointerTo());
3772  Builder.CreateCall(MapperFunc,
3773  {SrcLocInfo, Builder.getInt64(DeviceID),
3774  Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
3775  ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
3776 }
3777 
3778 bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
3779  const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
3782  "Unexpected Atomic Ordering.");
3783 
3784  bool Flush = false;
3786 
3787  switch (AK) {
3788  case Read:
3791  FlushAO = AtomicOrdering::Acquire;
3792  Flush = true;
3793  }
3794  break;
3795  case Write:
3796  case Compare:
3797  case Update:
3800  FlushAO = AtomicOrdering::Release;
3801  Flush = true;
3802  }
3803  break;
3804  case Capture:
3805  switch (AO) {
3807  FlushAO = AtomicOrdering::Acquire;
3808  Flush = true;
3809  break;
3811  FlushAO = AtomicOrdering::Release;
3812  Flush = true;
3813  break;
3817  Flush = true;
3818  break;
3819  default:
3820  // do nothing - leave silently.
3821  break;
3822  }
3823  }
3824 
3825  if (Flush) {
3826  // Currently Flush RT call still doesn't take memory_ordering, so for when
3827  // that happens, this tries to do the resolution of which atomic ordering
3828  // to use with but issue the flush call
3829  // TODO: pass `FlushAO` after memory ordering support is added
3830  (void)FlushAO;
3831  emitFlush(Loc);
3832  }
3833 
3834  // for AO == AtomicOrdering::Monotonic and all other case combinations
3835  // do nothing
3836  return Flush;
3837 }
3838 
3842  AtomicOrdering AO) {
3843  if (!updateToLocation(Loc))
3844  return Loc.IP;
3845 
3846  Type *XTy = X.Var->getType();
3847  assert(XTy->isPointerTy() && "OMP Atomic expects a pointer to target memory");
3848  Type *XElemTy = X.ElemTy;
3849  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
3850  XElemTy->isPointerTy()) &&
3851  "OMP atomic read expected a scalar type");
3852 
3853  Value *XRead = nullptr;
3854 
3855  if (XElemTy->isIntegerTy()) {
3856  LoadInst *XLD =
3857  Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
3858  XLD->setAtomic(AO);
3859  XRead = cast<Value>(XLD);
3860  } else {
3861  // We need to bitcast and perform atomic op as integer
3862  unsigned Addrspace = cast<PointerType>(XTy)->getAddressSpace();
3863  IntegerType *IntCastTy =
3864  IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
3865  Value *XBCast = Builder.CreateBitCast(
3866  X.Var, IntCastTy->getPointerTo(Addrspace), "atomic.src.int.cast");
3867  LoadInst *XLoad =
3868  Builder.CreateLoad(IntCastTy, XBCast, X.IsVolatile, "omp.atomic.load");
3869  XLoad->setAtomic(AO);
3870  if (XElemTy->isFloatingPointTy()) {
3871  XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
3872  } else {
3873  XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
3874  }
3875  }
3876  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
3877  Builder.CreateStore(XRead, V.Var, V.IsVolatile);
3878  return Builder.saveIP();
3879 }
3880 
3883  AtomicOpValue &X, Value *Expr,
3884  AtomicOrdering AO) {
3885  if (!updateToLocation(Loc))
3886  return Loc.IP;
3887 
3888  Type *XTy = X.Var->getType();
3889  assert(XTy->isPointerTy() && "OMP Atomic expects a pointer to target memory");
3890  Type *XElemTy = X.ElemTy;
3891  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
3892  XElemTy->isPointerTy()) &&
3893  "OMP atomic write expected a scalar type");
3894 
3895  if (XElemTy->isIntegerTy()) {
3896  StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
3897  XSt->setAtomic(AO);
3898  } else {
3899  // We need to bitcast and perform atomic op as integers
3900  unsigned Addrspace = cast<PointerType>(XTy)->getAddressSpace();
3901  IntegerType *IntCastTy =
3902  IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
3903  Value *XBCast = Builder.CreateBitCast(
3904  X.Var, IntCastTy->getPointerTo(Addrspace), "atomic.dst.int.cast");
3905  Value *ExprCast =
3906  Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
3907  StoreInst *XSt = Builder.CreateStore(ExprCast, XBCast, X.IsVolatile);
3908  XSt->setAtomic(AO);
3909  }
3910 
3911  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
3912  return Builder.saveIP();
3913 }
3914 
3916  const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
3917  Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3918  AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
3919  assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
3920  if (!updateToLocation(Loc))
3921  return Loc.IP;
3922 
3923  LLVM_DEBUG({
3924  Type *XTy = X.Var->getType();
3925  assert(XTy->isPointerTy() &&
3926  "OMP Atomic expects a pointer to target memory");
3927  Type *XElemTy = X.ElemTy;
3928  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
3929  XElemTy->isPointerTy()) &&
3930  "OMP atomic update expected a scalar type");
3931  assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
3932  (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
3933  "OpenMP atomic does not support LT or GT operations");
3934  });
3935 
3936  emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
3937  X.IsVolatile, IsXBinopExpr);
3938  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
3939  return Builder.saveIP();
3940 }
3941 
3942 Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3943  AtomicRMWInst::BinOp RMWOp) {
3944  switch (RMWOp) {
3945  case AtomicRMWInst::Add:
3946  return Builder.CreateAdd(Src1, Src2);
3947  case AtomicRMWInst::Sub:
3948  return Builder.CreateSub(Src1, Src2);
3949  case AtomicRMWInst::And:
3950  return Builder.CreateAnd(Src1, Src2);
3951  case AtomicRMWInst::Nand:
3952  return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
3953  case AtomicRMWInst::Or:
3954  return Builder.CreateOr(Src1, Src2);
3955  case AtomicRMWInst::Xor:
3956  return Builder.CreateXor(Src1, Src2);
3957  case AtomicRMWInst::Xchg:
3958  case AtomicRMWInst::FAdd:
3959  case AtomicRMWInst::FSub:
3961  case AtomicRMWInst::Max:
3962  case AtomicRMWInst::Min:
3963  case AtomicRMWInst::UMax:
3964  case AtomicRMWInst::UMin:
3965  llvm_unreachable("Unsupported atomic update operation");
3966  }
3967  llvm_unreachable("Unsupported atomic update operation");
3968 }
3969 
3970 std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
3971  InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3973  AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
3974  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
3975  // or a complex datatype.
3976  bool emitRMWOp = false;
3977  switch (RMWOp) {
3978  case AtomicRMWInst::Add:
3979  case AtomicRMWInst::And:
3980  case AtomicRMWInst::Nand:
3981  case AtomicRMWInst::Or:
3982  case AtomicRMWInst::Xor:
3983  case AtomicRMWInst::Xchg:
3984  emitRMWOp = XElemTy;
3985  break;
3986  case AtomicRMWInst::Sub:
3987  emitRMWOp = (IsXBinopExpr && XElemTy);
3988  break;
3989  default:
3990  emitRMWOp = false;
3991  }
3992  emitRMWOp &= XElemTy->isIntegerTy();
3993 
3994  std::pair<Value *, Value *> Res;
3995  if (emitRMWOp) {
3996  Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
3997  // not needed except in case of postfix captures. Generate anyway for
3998  // consistency with the else part. Will be removed with any DCE pass.
3999  // AtomicRMWInst::Xchg does not have a coressponding instruction.
4000  if (RMWOp == AtomicRMWInst::Xchg)
4001  Res.second = Res.first;
4002  else
4003  Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
4004  } else {
4005  unsigned Addrspace = cast<PointerType>(X->getType())->getAddressSpace();
4006  IntegerType *IntCastTy =
4007  IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
4008  Value *XBCast =
4009  Builder.CreateBitCast(X, IntCastTy->getPointerTo(Addrspace));
4010  LoadInst *OldVal =
4011  Builder.CreateLoad(IntCastTy, XBCast, X->getName() + ".atomic.load");
4012  OldVal->setAtomic(AO);
4013  // CurBB
4014  // | /---\
4015  // ContBB |
4016  // | \---/
4017  // ExitBB
4018  BasicBlock *CurBB = Builder.GetInsertBlock();
4019  Instruction *CurBBTI = CurBB->getTerminator();
4020  CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
4021  BasicBlock *ExitBB =
4022  CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
4023  BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
4024  X->getName() + ".atomic.cont");
4025  ContBB->getTerminator()->eraseFromParent();
4026  Builder.restoreIP(AllocaIP);
4027  AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
4028  NewAtomicAddr->setName(X->getName() + "x.new.val");
4029  Builder.SetInsertPoint(ContBB);
4030  llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
4031  PHI->addIncoming(OldVal, CurBB);
4032  IntegerType *NewAtomicCastTy =
4033  IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
4034  bool IsIntTy = XElemTy->isIntegerTy();
4035  Value *NewAtomicIntAddr =
4036  (IsIntTy)
4037  ? NewAtomicAddr
4038  : Builder.CreateBitCast(NewAtomicAddr,
4039  NewAtomicCastTy->getPointerTo(Addrspace));
4040  Value *OldExprVal = PHI;
4041  if (!IsIntTy) {
4042  if (XElemTy->isFloatingPointTy()) {
4043  OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
4044  X->getName() + ".atomic.fltCast");
4045  } else {
4046  OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
4047  X->getName() + ".atomic.ptrCast");
4048  }
4049  }
4050 
4051  Value *Upd = UpdateOp(OldExprVal, Builder);
4052  Builder.CreateStore(Upd, NewAtomicAddr);
4053  LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicIntAddr);
4054  Value *XAddr =
4055  (IsIntTy)
4056  ? X
4057  : Builder.CreateBitCast(X, IntCastTy->getPointerTo(Addrspace));
4060  AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
4061  XAddr, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
4062  Result->setVolatile(VolatileX);
4063  Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
4064  Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
4065  PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
4066  Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
4067 
4068  Res.first = OldExprVal;
4069  Res.second = Upd;
4070 
4071  // set Insertion point in exit block
4072  if (UnreachableInst *ExitTI =
4073  dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
4074  CurBBTI->eraseFromParent();
4075  Builder.SetInsertPoint(ExitBB);
4076  } else {
4077  Builder.SetInsertPoint(ExitTI);
4078  }
4079  }
4080 
4081  return Res;
4082 }
4083 
4085  const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
4086  AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
4088  bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
4089  if (!updateToLocation(Loc))
4090  return Loc.IP;
4091 
4092  LLVM_DEBUG({
4093  Type *XTy = X.Var->getType();
4094  assert(XTy->isPointerTy() &&
4095  "OMP Atomic expects a pointer to target memory");
4096  Type *XElemTy = X.ElemTy;
4097  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
4098  XElemTy->isPointerTy()) &&
4099  "OMP atomic capture expected a scalar type");
4100  assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
4101  "OpenMP atomic does not support LT or GT operations");
4102  });
4103 
4104  // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
4105  // 'x' is simply atomically rewritten with 'expr'.
4106  AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
4107  std::pair<Value *, Value *> Result =
4108  emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
4109  X.IsVolatile, IsXBinopExpr);
4110 
4111  Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
4112  Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
4113 
4114  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
4115  return Builder.saveIP();
4116 }
4117 
4121  omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
4122  bool IsFailOnly) {
4123 
4124  if (!updateToLocation(Loc))
4125  return Loc.IP;
4126 
4127  assert(X.Var->getType()->isPointerTy() &&
4128  "OMP atomic expects a pointer to target memory");
4129  assert((X.ElemTy->isIntegerTy() || X.ElemTy->isPointerTy()) &&
4130  "OMP atomic compare expected a integer scalar type");
4131  // compare capture
4132  if (V.Var) {
4133  assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
4134  assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
4135  }
4136 
4137  if (Op == OMPAtomicCompareOp::EQ) {
4139  AtomicCmpXchgInst *Result =
4140  Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
4141  if (V.Var) {
4142  Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
4143  assert(OldValue->getType() == V.ElemTy &&
4144  "OldValue and V must be of same type");
4145  if (IsPostfixUpdate) {
4146  Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
4147  } else {
4148  Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
4149  if (IsFailOnly) {
4150  // CurBB----
4151  // | |
4152  // v |
4153  // ContBB |
4154  // | |
4155  // v |
4156  // ExitBB <-
4157  //
4158  // where ContBB only contains the store of old value to 'v'.
4159  BasicBlock *CurBB = Builder.GetInsertBlock();
4160  Instruction *CurBBTI = CurBB->getTerminator();
4161  CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
4162  BasicBlock *ExitBB = CurBB->splitBasicBlock(
4163  CurBBTI, X.Var->getName() + ".atomic.exit");
4164  BasicBlock *ContBB = CurBB->splitBasicBlock(
4165  CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
4166  ContBB->getTerminator()->eraseFromParent();
4167  CurBB->getTerminator()->eraseFromParent();
4168 
4169  Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
4170 
4171  Builder.SetInsertPoint(ContBB);
4172  Builder.CreateStore(OldValue, V.Var);
4173  Builder.CreateBr(ExitBB);
4174 
4175  if (UnreachableInst *ExitTI =
4176  dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
4177  CurBBTI->eraseFromParent();
4178  Builder.SetInsertPoint(ExitBB);
4179  } else {
4180  Builder.SetInsertPoint(ExitTI);
4181  }
4182  } else {
4183  Value *CapturedValue =
4184  Builder.CreateSelect(SuccessOrFail, E, OldValue);
4185  Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
4186  }
4187  }
4188  }
4189  // The comparison result has to be stored.
4190  if (R.Var) {
4191  assert(R.Var->getType()->isPointerTy() &&
4192  "r.var must be of pointer type");
4193  assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
4194 
4195  Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
4196  Value *ResultCast = R.IsSigned
4197  ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
4198  : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
4199  Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
4200  }
4201  } else {
4203  "Op should be either max or min at this point");
4204  assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
4205 
4206  // Reverse the ordop as the OpenMP forms are different from LLVM forms.
4207  // Let's take max as example.
4208  // OpenMP form:
4209  // x = x > expr ? expr : x;
4210  // LLVM form:
4211  // *ptr = *ptr > val ? *ptr : val;
4212  // We need to transform to LLVM form.
4213  // x = x <= expr ? x : expr;
4214  AtomicRMWInst::BinOp NewOp;
4215  if (IsXBinopExpr) {
4216  if (X.IsSigned)
4219  else
4222  } else {
4223  if (X.IsSigned)
4226  else
4229  }
4230 
4231  AtomicRMWInst *OldValue =
4232  Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
4233  if (V.Var) {
4234  Value *CapturedValue = nullptr;
4235  if (IsPostfixUpdate) {
4236  CapturedValue = OldValue;
4237  } else {
4238  CmpInst::Predicate Pred;
4239  switch (NewOp) {
4240  case AtomicRMWInst::Max:
4241  Pred = CmpInst::ICMP_SGT;
4242  break;
4243  case AtomicRMWInst::UMax:
4244  Pred = CmpInst::ICMP_UGT;
4245  break;
4246  case AtomicRMWInst::Min:
4247  Pred = CmpInst::ICMP_SLT;
4248  break;
4249  case AtomicRMWInst::UMin:
4250  Pred = CmpInst::ICMP_ULT;
4251  break;
4252  default:
4253  llvm_unreachable("unexpected comparison op");
4254  }
4255  Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
4256  CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
4257  }
4258  Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
4259  }
4260  }
4261 
4262  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
4263 
4264  return Builder.saveIP();
4265 }
4266 
4269  std::string VarName) {
4270  llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
4272  llvm::Type::getInt8Ty(M.getContext())->getPointerTo(), Names.size()),
4273  Names);
4274  auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
4275  M, MapNamesArrayInit->getType(),
4276  /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
4277  VarName);
4278  return MapNamesArrayGlobal;
4279 }
4280 
4281 // Create all simple and struct types exposed by the runtime and remember
4282 // the llvm::PointerTypes of them for easy access later.
4283 void OpenMPIRBuilder::initializeTypes(Module &M) {
4284  LLVMContext &Ctx = M.getContext();
4285  StructType *T;
4286 #define OMP_TYPE(VarName, InitValue) VarName = InitValue;
4287 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
4288  VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
4289  VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
4290 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
4291  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
4292  VarName##Ptr = PointerType::getUnqual(VarName);
4293 #define OMP_STRUCT_TYPE(VarName, StructName, ...) \
4294  T = StructType::getTypeByName(Ctx, StructName); \
4295  if (!T) \
4296  T = StructType::create(Ctx, {__VA_ARGS__}, StructName); \
4297  VarName = T; \
4298  VarName##Ptr = PointerType::getUnqual(T);
4299 #include "llvm/Frontend/OpenMP/OMPKinds.def"
4300 }
4301 
4304  SmallVectorImpl<BasicBlock *> &BlockVector) {
4306  BlockSet.insert(EntryBB);
4307  BlockSet.insert(ExitBB);
4308 
4309  Worklist.push_back(EntryBB);
4310  while (!Worklist.empty()) {
4311  BasicBlock *BB = Worklist.pop_back_val();
4312  BlockVector.push_back(BB);
4313  for (BasicBlock *SuccBB : successors(BB))
4314  if (BlockSet.insert(SuccBB).second)
4315  Worklist.push_back(SuccBB);
4316  }
4317 }
4318 
4319 void CanonicalLoopInfo::collectControlBlocks(
4321  // We only count those BBs as control block for which we do not need to
4322  // reverse the CFG, i.e. not the loop body which can contain arbitrary control
4323  // flow. For consistency, this also means we do not add the Body block, which
4324  // is just the entry to the body code.
4325  BBs.reserve(BBs.size() + 6);
4326  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
4327 }
4328 
4330  assert(isValid() && "Requires a valid canonical loop");
4331  for (BasicBlock *Pred : predecessors(Header)) {
4332  if (Pred != Latch)
4333  return Pred;
4334  }
4335  llvm_unreachable("Missing preheader");
4336 }
4337 
4338 void CanonicalLoopInfo::setTripCount(Value *TripCount) {
4339  assert(isValid() && "Requires a valid canonical loop");
4340 
4341  Instruction *CmpI = &getCond()->front();
4342  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
4343  CmpI->setOperand(1, TripCount);
4344 
4345 #ifndef NDEBUG
4346  assertOK();
4347 #endif
4348 }
4349 
4350 void CanonicalLoopInfo::mapIndVar(
4351  llvm::function_ref<Value *(Instruction *)> Updater) {
4352  assert(isValid() && "Requires a valid canonical loop");
4353 
4354  Instruction *OldIV = getIndVar();
4355 
4356  // Record all uses excluding those introduced by the updater. Uses by the
4357  // CanonicalLoopInfo itself to keep track of the number of iterations are
4358  // excluded.
4359  SmallVector<Use *> ReplacableUses;
4360  for (Use &U : OldIV->uses()) {
4361  auto *User = dyn_cast<Instruction>(U.getUser());
4362  if (!User)
4363  continue;
4364  if (User->getParent() == getCond())
4365  continue;
4366  if (User->getParent() == getLatch())
4367  continue;
4368  ReplacableUses.push_back(&U);
4369  }
4370 
4371  // Run the updater that may introduce new uses
4372  Value *NewIV = Updater(OldIV);
4373 
4374  // Replace the old uses with the value returned by the updater.
4375  for (Use *U : ReplacableUses)
4376  U->set(NewIV);
4377 
4378 #ifndef NDEBUG
4379  assertOK();
4380 #endif
4381 }
4382 
4384 #ifndef NDEBUG
4385  // No constraints if this object currently does not describe a loop.
4386  if (!isValid())
4387  return;
4388 
4389  BasicBlock *Preheader = getPreheader();
4390  BasicBlock *Body = getBody();
4391  BasicBlock *After = getAfter();
4392 
4393  // Verify standard control-flow we use for OpenMP loops.
4394  assert(Preheader);
4395  assert(isa<BranchInst>(Preheader->getTerminator()) &&
4396  "Preheader must terminate with unconditional branch");
4397  assert(Preheader->getSingleSuccessor() == Header &&
4398  "Preheader must jump to header");
4399 
4400  assert(Header);
4401  assert(isa<BranchInst>(Header->getTerminator()) &&
4402  "Header must terminate with unconditional branch");
4403  assert(Header->getSingleSuccessor() == Cond &&
4404  "Header must jump to exiting block");
4405 
4406  assert(Cond);
4407  assert(Cond->getSinglePredecessor() == Header &&
4408  "Exiting block only reachable from header");
4409 
4410  assert(isa<BranchInst>(Cond->getTerminator()) &&
4411  "Exiting block must terminate with conditional branch");
4412  assert(size(successors(Cond)) == 2 &&
4413  "Exiting block must have two successors");
4414  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
4415  "Exiting block's first successor jump to the body");
4416  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
4417  "Exiting block's second successor must exit the loop");
4418 
4419  assert(Body);
4420  assert(Body->getSinglePredecessor() == Cond &&
4421  "Body only reachable from exiting block");
4422  assert(!isa<PHINode>(Body->front()));
4423 
4424  assert(Latch);
4425  assert(isa<BranchInst>(Latch->getTerminator()) &&
4426  "Latch must terminate with unconditional branch");
4427  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
4428  // TODO: To support simple redirecting of the end of the body code that has
4429  // multiple; introduce another auxiliary basic block like preheader and after.
4430  assert(Latch->getSinglePredecessor() != nullptr);
4431  assert(!isa<PHINode>(Latch->front()));
4432 
4433  assert(Exit);
4434  assert(isa<BranchInst>(Exit->getTerminator()) &&
4435  "Exit block must terminate with unconditional branch");
4436  assert(Exit->getSingleSuccessor() == After &&
4437  "Exit block must jump to after block");
4438 
4439  assert(After);
4440  assert(After->getSinglePredecessor() == Exit &&
4441  "After block only reachable from exit block");
4442  assert(After->empty() || !isa<PHINode>(After->front()));
4443 
4444  Instruction *IndVar = getIndVar();
4445  assert(IndVar && "Canonical induction variable not found?");
4446  assert(isa<IntegerType>(IndVar->getType()) &&
4447  "Induction variable must be an integer");
4448  assert(cast<PHINode>(IndVar)->getParent() == Header &&
4449  "Induction variable must be a PHI in the loop header");
4450  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
4451  assert(
4452  cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
4453  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
4454 
4455  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
4456  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
4457  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
4458  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
4459  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
4460  ->isOne());
4461 
4462  Value *TripCount = getTripCount();
4463  assert(TripCount && "Loop trip count not found?");
4464  assert(IndVar->getType() == TripCount->getType() &&
4465  "Trip count and induction variable must have the same type");
4466 
4467  auto *CmpI = cast<CmpInst>(&Cond->front());
4468  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
4469  "Exit condition must be a signed less-than comparison");
4470  assert(CmpI->getOperand(0) == IndVar &&
4471  "Exit condition must compare the induction variable");
4472  assert(CmpI->getOperand(1) == TripCount &&
4473  "Exit condition must compare with the trip count");
4474 #endif
4475 }
4476 
4478  Header = nullptr;
4479  Cond = nullptr;
4480  Latch = nullptr;
4481  Exit = nullptr;
4482 }
llvm::omp::OMPScheduleType::NomergeOrderedStatic
@ NomergeOrderedStatic
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67
llvm::omp::OMPScheduleType::BaseGuidedChunked
@ BaseGuidedChunked
llvm::CanonicalLoopInfo::getPreheaderIP
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
Definition: OMPIRBuilder.h:1762
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:481
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
AssumptionCache.h
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:460
llvm::OpenMPIRBuilder::createCachedThreadPrivate
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
Definition: OMPIRBuilder.cpp:3575
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:299
llvm::AtomicOrdering::AcquireRelease
@ AcquireRelease
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2479
llvm::OpenMPIRBuilder::LocationDescription
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:202
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition: ScalarEvolution.h:2115
addLoopMetadata
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
Definition: OMPIRBuilder.cpp:2783
llvm::OpenMPIRBuilder::OutlineInfo::ExitBB
BasicBlock * ExitBB
Definition: OMPIRBuilder.h:887
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:456
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::Type::getInt1Ty
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:236
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::OpenMPIRBuilder::createSection
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
Definition: OMPIRBuilder.cpp:1524
llvm::Value::getPointerAlignment
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:915
llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:280
llvm::OpenMPIRBuilder::createCritical
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
Definition: OMPIRBuilder.cpp:3187
llvm::CanonicalLoopInfo::getAfter
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Definition: OMPIRBuilder.h:1731
llvm::InstructionCost::getValue
Optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:87
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::OpenMPIRBuilder::OutlineInfo::ExcludeArgsFromAggregate
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Definition: OMPIRBuilder.h:888
llvm::Type::getInt8PtrTy
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:291
createTargetMachine
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOpt::Level OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
Definition: OMPIRBuilder.cpp:2902
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:218
llvm::omp::OMPScheduleType::NomergeOrderedRuntime
@ NomergeOrderedRuntime
llvm::omp::OMPScheduleType::NomergeUnorderedGuidedChunked
@ NomergeUnorderedGuidedChunked
llvm::TargetOptions
Definition: TargetOptions.h:124
llvm::OpenMPIRBuilder::createLoopSkeleton
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
Definition: OMPIRBuilder.cpp:1783
DebugInfoMetadata.h
llvm::cl::Prefix
@ Prefix
Definition: CommandLine.h:160
llvm::Function::empty
bool empty() const
Definition: Function.h:732
llvm::Function::getBasicBlockList
const BasicBlockListType & getBasicBlockList() const
Get the underlying elements of the Function...
Definition: Function.h:703
llvm::GlobalValue::HiddenVisibility
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:64
llvm::CodeExtractor::findAllocas
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Definition: CodeExtractor.cpp:494
llvm::CanonicalLoopInfo::getPreheader
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Definition: OMPIRBuilder.cpp:4329
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:546
StringRef.h
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::ConstantStruct::get
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1355
llvm::omp::OMPScheduleType::BaseGuidedAnalyticalChunked
@ BaseGuidedAnalyticalChunked
llvm::OpenMPIRBuilder::createOMPInteropUse
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
Definition: OMPIRBuilder.cpp:3547
llvm::AtomicRMWInst::Xor
@ Xor
*p = old ^ v
Definition: Instructions.h:740
llvm::omp::OMPScheduleType::UnorderedDynamicChunked
@ UnorderedDynamicChunked
llvm::CodeExtractor::extractCodeRegion
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
Definition: CodeExtractor.cpp:1626
Loops
Hexagon Hardware Loops
Definition: HexagonHardwareLoops.cpp:372
llvm::Target
Target - Wrapper for Target specific information.
Definition: TargetRegistry.h:145
llvm::GlobalValue::NotThreadLocal
@ NotThreadLocal
Definition: GlobalValue.h:184
llvm::AtomicRMWInst::BinOp
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:726
llvm::ilist_node_with_parent::getNextNode
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:289
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::BasicBlock::empty
bool empty() const
Definition: BasicBlock.h:308
llvm::omp::OMPScheduleType::NomergeUnorderedSteal
@ NomergeUnorderedSteal
llvm::omp::OMPScheduleType::OrderdTrapezoidal
@ OrderdTrapezoidal
llvm::OpenMPIRBuilder::tileLoops
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
Definition: OMPIRBuilder.cpp:2591
llvm::enumerate
detail::enumerator< R > enumerate(R &&TheRange)
Given an input range, returns a new range whose values are are pair (A,B) such that A is the 0-based ...
Definition: STLExtras.h:2057
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:168
llvm::Function::getEntryBlock
const BasicBlock & getEntryBlock() const
Definition: Function.h:710
llvm::OpenMPIRBuilder::InsertPointTy
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:96
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2495
llvm::omp::OMPScheduleType::NomergeUnorderedRuntime
@ NomergeUnorderedRuntime
llvm::OpenMPIRBuilder::getOrCreateThreadID
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
Definition: OMPIRBuilder.cpp:641
llvm::GlobalVariable
Definition: GlobalVariable.h:39
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:741
llvm::omp::OMPScheduleType::NomergeUnorderedBalanced
@ NomergeUnorderedBalanced
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2266
llvm::DeleteDeadBlocks
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
Definition: BasicBlockUtils.cpp:99
llvm::FunctionType::get
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
Definition: Type.cpp:361
llvm::OpenMPIRBuilder::createAtomicUpdate
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
Definition: OMPIRBuilder.cpp:3915
llvm::omp::OMPScheduleType::NomergeOrderedTrapezoidal
@ NomergeOrderedTrapezoidal
OptimizationRemarkEmitter.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::omp::OMPScheduleType::UnorderedGreedy
@ UnorderedGreedy
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
llvm::omp::OMPScheduleType::BaseGreedy
@ BaseGreedy
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::tgtok::VarName
@ VarName
Definition: TGLexer.h:72
llvm::TargetTransformInfo::UnrollingPreferences::Count
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
Definition: TargetTransformInfo.h:465
llvm::omp::OMPScheduleType::UnorderedRuntimeSimd
@ UnorderedRuntimeSimd
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::DILocation
Debug location.
Definition: DebugInfoMetadata.h:1557
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:746
llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:319
llvm::Function::arg_size
size_t arg_size() const
Definition: Function.h:774
ScalarEvolution.h
Shift
bool Shift
Definition: README.txt:468
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::BasicBlock::eraseFromParent
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:132
llvm::OpenMPIRBuilder::AtomicOpValue
a struct to pack relevant information while generating atomic Ops
Definition: OMPIRBuilder.h:1369
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:538
llvm::CanonicalLoopInfo::getAfterIP
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
Definition: OMPIRBuilder.h:1776
getTripCount
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1055
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::BasicBlock::getSingleSuccessor
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:291
llvm::StoreInst::setAlignment
void setAlignment(Align Align)
Definition: Instructions.h:345
llvm::spliceBB
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
Definition: OMPIRBuilder.cpp:255
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:136
llvm::omp::OMPScheduleType::UnorderedGuidedAnalyticalChunked
@ UnorderedGuidedAnalyticalChunked
llvm::BasicBlock::splitBasicBlock
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:378
llvm::GlobalValue::UnnamedAddr::Global
@ Global
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::OpenMPIRBuilder::createSingle
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt)
Generator for '#omp single'.
Definition: OMPIRBuilder.cpp:3145
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:420
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::omp::OMPScheduleType::BaseRuntimeSimd
@ BaseRuntimeSimd
llvm::MCID::Convergent
@ Convergent
Definition: MCInstrDesc.h:184
CodeExtractor.h
llvm::omp::OMPScheduleType::ModifierNonmonotonic
@ ModifierNonmonotonic
llvm::OpenMPIRBuilder::ReductionInfo::Variable
Value * Variable
Reduction variable of pointer type.
Definition: OMPIRBuilder.h:662
llvm::successors
auto successors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:29
llvm::OpenMPIRBuilder::FinalizeCallbackTy
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:106
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:749
llvm::computeUnrollCount
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
Definition: LoopUnrollPass.cpp:889
llvm::CanonicalLoopInfo::getFunction
Function * getFunction() const
Definition: OMPIRBuilder.h:1782
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:261
llvm::OpenMPIRBuilder::AtomicOpValue::Var
Value * Var
Definition: OMPIRBuilder.h:1370
llvm::CallBase::arg_begin
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1316
llvm::SmallVectorImpl::pop_back_val
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:654
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:163
llvm::TargetRegistry::lookupTarget
static const Target * lookupTarget(const std::string &Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Definition: TargetRegistry.cpp:62
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:237
llvm::BasicBlock::rend
reverse_iterator rend()
Definition: BasicBlock.h:304
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::omp::OMPScheduleType::UnorderedSteal
@ UnorderedSteal
llvm::OpenMPIRBuilder::emitTaskyieldImpl
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
Definition: OMPIRBuilder.cpp:1238
llvm::OpenMPIRBuilder::createReductions
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false)
Generator for '#omp reduction'.
Definition: OMPIRBuilder.cpp:1570
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:159
computeOpenMPScheduleType
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
Definition: OMPIRBuilder.cpp:219
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::StoreInst::setAtomic
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:372
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1300
llvm::OpenMPIRBuilder::collapseLoops
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
Definition: OMPIRBuilder.cpp:2463
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::MDNode::operands
op_range operands() const
Definition: Metadata.h:1202
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1366
llvm::ConstantExpr::getPointerCast
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2078
llvm::OpenMPIRBuilder::createTask
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr)
Generator for #omp task
Definition: OMPIRBuilder.cpp:1257
llvm::BasicBlock::getUniqueSuccessor
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:299
Uses
SmallPtrSet< MachineInstr *, 2 > Uses
Definition: ARMLowOverheadLoops.cpp:585
llvm::AtomicOrdering::Monotonic
@ Monotonic
llvm::CanonicalLoopInfo::getIndVar
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Definition: OMPIRBuilder.h:1748
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::omp::OMPScheduleType::UnorderedGuidedChunked
@ UnorderedGuidedChunked
llvm::gatherUnrollingPreferences
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, Optional< unsigned > UserThreshold, Optional< unsigned > UserCount, Optional< bool > UserAllowPartial, Optional< bool > UserRuntime, Optional< bool > UserUpperBound, Optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
Definition: LoopUnrollPass.cpp:184
llvm::OpenMPIRBuilder::createOrderedDepend
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
Definition: OMPIRBuilder.cpp:3222
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::Lock
static sys::Mutex Lock
Definition: NVPTXUtilities.cpp:39
getKmpcForDynamicFiniForType
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
Definition: OMPIRBuilder.cpp:2284
Mappings
Inject TLI Mappings
Definition: InjectTLIMappings.cpp:171
CommandLine.h
CodeMetrics.h
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::omp::OMPInteropType
OMPInteropType
Definition: OMPConstants.h:205
llvm::Instruction::getNumSuccessors
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
Definition: Instruction.cpp:777
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1374
llvm::ApproximateLoopSize
InstructionCost ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &EphValues, unsigned BEInsns)
ApproximateLoopSize - Approximate the size of the loop.
Definition: LoopUnrollPass.cpp:666
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1617
llvm::omp::OMPScheduleType::UnorderedAuto
@ UnorderedAuto
TargetMachine.h
llvm::OpenMPIRBuilder::emitMapperCall
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
Definition: OMPIRBuilder.cpp:3752
OMPIRBuilder.h
Constants.h
llvm::omp::OMP_TGT_EXEC_MODE_GENERIC
@ OMP_TGT_EXEC_MODE_GENERIC
Definition: OMPConstants.h:189
isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:524
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:114
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::SmallVectorImpl::append
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:667
llvm::CanonicalLoopInfo::assertOK
void assertOK() const
Consistency self-check.
Definition: OMPIRBuilder.cpp:4383
llvm::CanonicalLoopInfo::getCond
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
Definition: OMPIRBuilder.h:1699
llvm::OpenMPIRBuilder
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:75
llvm::OpenMPIRBuilder::createOMPFree
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
Definition: OMPIRBuilder.cpp:3476
llvm::User
Definition: User.h:44
llvm::OpenMPIRBuilder::getOrCreateDefaultSrcLocStr
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
Definition: OMPIRBuilder.cpp:613
llvm::omp::OMPScheduleType::UnorderedRuntime
@ UnorderedRuntime
llvm::OpenMPIRBuilder::emitOffloadingEntry
void emitOffloadingEntry(Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, StringRef SectionName="omp_offloading_entries")
Create an offloading section struct used to register this global at runtime.
Definition: OMPIRBuilder.cpp:756
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:496
cmp
< i32 >< i32 > cmp
Definition: README.txt:1447
llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:745
TileSize
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:297
llvm::OpenMPIRBuilder::createMapperAllocas
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Definition: OMPIRBuilder.cpp:3733
redirectAllPredecessorsTo
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
Definition: OMPIRBuilder.cpp:2424
llvm::OpenMPIRBuilder::createBarrier
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
Definition: OMPIRBuilder.cpp:648
llvm::OpenMPIRBuilder::createAtomicRead
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
Definition: OMPIRBuilder.cpp:3840
getOpcode
static Optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:190
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:194
IP
Definition: NVPTXLowerArgs.cpp:167
TargetLibraryInfo.h
llvm::Value::uses
iterator_range< use_iterator > uses()
Definition: Value.h:376
llvm::BasicBlock::getFirstInsertionPt
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:246
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::OpenMPIRBuilder::~OpenMPIRBuilder
~OpenMPIRBuilder()
Definition: OMPIRBuilder.cpp:518
llvm::omp::OMPScheduleType::ModifierMonotonic
@ ModifierMonotonic
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:187
llvm::CanonicalLoopInfo::isValid
bool isValid() const
Returns whether this object currently represents the IR of a loop.
Definition: OMPIRBuilder.h:1682
llvm::OpenMPIRBuilder::getOrCreateSrcLocStr
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
Definition: OMPIRBuilder.cpp:573
llvm::CodeExtractor::findInputsOutputs
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
Definition: CodeExtractor.cpp:646
llvm::Instruction
Definition: Instruction.h:42
llvm::OpenMPIRBuilder::createAtomicCompare
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
Definition: OMPIRBuilder.cpp:4118
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::splitBBWithSuffix
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
Definition: OMPIRBuilder.cpp:323
isValidWorkshareLoopScheduleType
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
Definition: OMPIRBuilder.cpp:73
MDBuilder.h
llvm::AtomicOrdering::Acquire
@ Acquire
llvm::AtomicRMWInst::Nand
@ Nand
*p = ~(old & v)
Definition: Instructions.h:736
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::predecessors
auto predecessors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:30
llvm::omp::OMPScheduleType::NomergeUnorderedAuto
@ NomergeUnorderedAuto
llvm::GlobalObject::addMetadata
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1298
llvm::Value::setName
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:372
llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition: SmallVector.h:619
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:928
llvm::OpenMPIRBuilder::applySimd
void applySimd(DebugLoc DL, CanonicalLoopInfo *Loop)
Add metadata to simd-ize a loop.
Definition: OMPIRBuilder.cpp:2837
llvm::BasicBlock::rbegin
reverse_iterator rbegin()
Definition: BasicBlock.h:302
llvm::CanonicalLoopInfo::getBodyIP
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
Definition: OMPIRBuilder.h:1769
llvm::CodeMetrics::collectEphemeralValues
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
llvm::omp::OMPAtomicCompareOp::EQ
@ EQ
llvm::Instruction::getSuccessor
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
Definition: Instruction.cpp:789
removeUnusedBlocksFromParent