LLVM  10.0.0svn
AMDGPUCallLowering.cpp
//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

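// Value handler for outgoing return values: copies each value into its
// assigned physical register and records that register as an implicit use on
// the return instruction.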
struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

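// Value handler for incoming (formal) arguments: copies or loads each
// assigned value from its physical register or stack slot into the
// corresponding virtual register.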
struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    Register AddrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
    MIRBuilder.buildFrameIndex(AddrReg, FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg;
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    // FIXME: Get alignment
    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad, Size, 1);
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

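// Incoming-argument handler for formal parameters; marks each used physical
// register as a basic-block live-in.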
struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

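// Split OrigArg into one ArgInfo per legalized part register for the given
// calling convention, invoking PerformArgSplit so the caller can record how
// to repack the parts into the original virtual registers.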
void AMDGPUCallLowering::splitToValueTypes(
    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    Type *Ty = VT.getTypeForEVT(Ctx);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
                             OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    LLT LLTy = getLLTForType(*Ty, DL);

    SmallVector<Register, 8> SplitRegs;

    EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
    Type *PartTy = PartVT.getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

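// Split SrcReg, which has the original value type SrcTy, into the part-typed
// registers DstRegs expected by the calling convention (used for outgoing
// values).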
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
                                         SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  Register BigReg = MRI.createGenericVirtualRegister(BigTy);
  B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], BigReg, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    OrigRetInfo, SplitRetInfos, DL, MRI, CC,
    [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

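// Lower a return: emit S_ENDPGM for kernels and wave-ending shader returns,
// otherwise SI_RETURN_TO_EPILOG or S_SETPC_B64_return with the lowered return
// values attached.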
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

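// Build a pointer into the kernarg segment at byte offset Offset for a kernel
// argument of type ParamTy.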
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  B.buildConstant(OffsetReg, Offset);

  B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}

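// Load one kernel argument of type ParamTy from the kernarg segment at byte
// offset Offset into DstReg.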
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant,
                                       TypeSize, Align);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

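// Lower formal arguments for AMDGPU_KERNEL functions: allocate the user and
// system SGPR inputs and load each explicit argument from the kernarg
// segment.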
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

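// Reassemble the part-typed registers Regs into OrigRegs[0], which has the
// original value type LLTy (used for incoming arguments).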
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  if (!LLTy.isVector() && !PartLLT.isVector()) {
    B.buildMerge(OrigRegs[0], Regs);
    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(LLTy.getElementType() == PartLLT.getElementType());

    int DstElts = LLTy.getNumElements();
    int PartElts = PartLLT.getNumElements();
    if (DstElts % PartElts == 0)
      B.buildConcatVectors(OrigRegs[0], Regs);
    else {
      // Deal with v3s16 split into v2s16
      assert(PartElts == 2 && DstElts % 2 != 0);
      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);

      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
      auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
      B.buildExtract(OrigRegs[0], RoundedConcat, 0);
    }

    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();
  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.
    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(DstEltTy,
                                Regs.take_front(PartsPerElt));
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

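// Lower formal arguments for graphics shaders and callable functions.
// AMDGPU_KERNEL functions are handled separately by
// lowerFormalArgumentsKernel above.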
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);

    splitToValueTypes(
      OrigArg, SplitArgs, DL, MRI, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}