//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

48  "amdgpu-disable-loop-alignment",
49  cl::desc("Do not align and prefetch loops"),
50  cl::init(false));
51 
53  "amdgpu-use-divergent-register-indexing",
54  cl::Hidden,
55  cl::desc("Use indirect register addressing for divergent indexes"),
56  cl::init(false));
57 
static bool hasFP32Denormals(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().allFP32Denormals();
}

static bool hasFP64FP16Denormals(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().allFP64FP16Denormals();
}
67 
68 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
69  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
70  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
71  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
72  return AMDGPU::SGPR0 + Reg;
73  }
74  }
75  llvm_unreachable("Cannot allocate sgpr");
76 }
77 
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
82  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
83  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
84 
85  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
86  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
87 
88  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
89 
90  const SIRegisterInfo *TRI = STI.getRegisterInfo();
91  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
92 
93  addRegisterClass(MVT::f64, V64RegClass);
94  addRegisterClass(MVT::v2f32, V64RegClass);
95 
96  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
97  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
98 
99  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
100  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
101 
102  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
103  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
104 
105  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
106  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
107 
108  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
109  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
110 
111  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
112  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
113 
114  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
115  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
116 
117  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
118  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
119 
120  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
121  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
122 
123  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
124  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
125 
126  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
127  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
128 
129  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
130  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
131 
132  if (Subtarget->has16BitInsts()) {
133  addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
134  addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
135 
    // Unless there are also VOP3P operations, no operations are really legal.
137  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
138  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
139  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
140  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
141  addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
142  addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
143  addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
144  addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
145  }
146 
147  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
148  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
149 
151 
152  // The boolean content concept here is too inflexible. Compares only ever
153  // really produce a 1-bit result. Any copy/extend from these will turn into a
154  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
155  // it's what most targets use.
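  // For example, extending an i1 compare result is typically a single
  // v_cndmask_b32 (or s_cselect_b32) choosing between 0/1 or 0/-1, so neither
  // extension is cheaper than the other.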
158 
159  // We need to custom lower vector stores from local memory
163  MVT::v32i32},
164  Custom);
165 
169  MVT::v32i32},
170  Custom);
171 
188 
196 
198 
203 
206 
210 
214  Expand);
218  Expand);
219 
223  Custom);
224 
228 
230 
232 
234  Expand);
235 
236 #if 0
238 #endif
239 
240  // We only support LOAD/STORE and vector manipulation ops for vectors
241  // with > 4 elements.
242  for (MVT VT :
248  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
249  switch (Op) {
250  case ISD::LOAD:
251  case ISD::STORE:
252  case ISD::BUILD_VECTOR:
253  case ISD::BITCAST:
254  case ISD::UNDEF:
259  break;
261  case ISD::CONCAT_VECTORS:
263  break;
264  default:
266  break;
267  }
268  }
269  }
270 
272 
273  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
274  // is expanded to avoid having two separate loops in case the index is a VGPR.
275 
276  // Most operations are naturally 32-bit vector operations. We only support
277  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
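  // For example, a bitwise v2i64 operation is performed as the corresponding
  // v4i32 operation on bitcast operands, with the result bitcast back to v2i64.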
278  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
281 
284 
287 
290  }
291 
292  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
295 
298 
301 
304  }
305 
306  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
309 
312 
315 
318  }
319 
320  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
323 
326 
329 
332  }
333 
334  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
337 
340 
343 
346  }
347 
350  Expand);
351 
353 
354  // Avoid stack access for these.
355  // TODO: Generalize to more vector types.
359  Custom);
360 
361  // Deal with vec3 vector operations when widened to vec4.
364 
365  // Deal with vec5/6/7 vector operations when widened to vec8.
369  Custom);
370 
371  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
372  // and output demarshalling
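  // Roughly: the compare value and the new value are packed together into the
  // instruction's data operand, and the old memory value is then extracted
  // from the low half of the returned data.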
374 
375  // We can't return success/failure, only the old value,
376  // let LLVM add the comparison
378  Expand);
379 
380  if (Subtarget->hasFlatAddressSpace())
382 
384 
385  // FIXME: This should be narrowed to i32, but that only happens if i64 is
386  // illegal.
387  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
389 
  // This is s_memtime on SI and s_memrealtime on VI.
393 
394  if (Subtarget->has16BitInsts()) {
397  }
398 
399  if (Subtarget->hasMadMacF32Insts())
401 
402  if (!Subtarget->hasBFI())
403  // fcopysign can be done in a single instruction with BFI.
405 
406  if (!Subtarget->hasBCNT(32))
408 
409  if (!Subtarget->hasBCNT(64))
411 
412  if (Subtarget->hasFFBH())
414 
415  if (Subtarget->hasFFBL())
417 
418  // We only really have 32-bit BFE instructions (and 16-bit on VI).
419  //
420  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
421  // effort to match them now. We want this to be false for i64 cases when the
422  // extraction isn't restricted to the upper or lower half. Ideally we would
423  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
424  // span the midpoint are probably relatively rare, so don't worry about them
425  // for now.
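  // For example, extracting an 8-bit field starting at bit 8 of an i32 can be
  // a single v_bfe_u32 / v_bfe_i32 instead of a shift-and-mask sequence.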
426  if (Subtarget->hasBFE())
427  setHasExtractBitsInsn(true);
428 
429  // Clamp modifier on add/sub
430  if (Subtarget->hasIntClamp())
432 
433  if (Subtarget->hasAddNoCarry())
435  Legal);
436 
438  Custom);
439 
440  // These are really only legal for ieee_mode functions. We should be avoiding
441  // them for functions that don't have ieee_mode enabled, so just say they are
442  // legal.
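  // (In IEEE mode the hardware min/max instructions quiet signaling NaNs and
  // otherwise return the non-NaN operand, which matches the fminnum/fmaxnum
  // semantics assumed here.)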
444  {MVT::f32, MVT::f64}, Legal);
445 
446  if (Subtarget->haveRoundOpsF64())
448  else
450  MVT::f64, Custom);
451 
453 
456 
457  if (Subtarget->has16BitInsts()) {
460  MVT::i16, Legal);
461 
463 
465  MVT::i16, Expand);
466 
470  ISD::CTPOP},
471  MVT::i16, Promote);
472 
474 
476 
481 
483 
484  // F16 - Constant Actions.
486 
487  // F16 - Load/Store Actions.
492 
493  // F16 - VOP1 Actions.
496  MVT::f16, Custom);
497 
499 
502  MVT::f16, Promote);
503 
504  // F16 - VOP2 Actions.
506 
508 
509  // F16 - VOP3 Actions.
511  if (STI.hasMadF16())
513 
516  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
517  switch (Op) {
518  case ISD::LOAD:
519  case ISD::STORE:
520  case ISD::BUILD_VECTOR:
521  case ISD::BITCAST:
522  case ISD::UNDEF:
528  break;
529  case ISD::CONCAT_VECTORS:
531  break;
532  default:
534  break;
535  }
536  }
537  }
538 
539  // v_perm_b32 can handle either of these.
542 
543  // XXX - Do these do anything? Vector constants turn into build_vector.
545 
547 
552 
557 
564 
569 
574 
579 
584 
589 
594 
599 
601  MVT::v2i32, Expand);
603 
605  MVT::v4i32, Expand);
606 
608  MVT::v8i32, Expand);
609 
610  if (!Subtarget->hasVOP3PInsts())
612 
614  // This isn't really legal, but this avoids the legalizer unrolling it (and
615  // allows matching fneg (fabs x) patterns)
617 
620 
623 
626 
627  for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
630  Vec16, Custom);
632  }
633  }
634 
635  if (Subtarget->hasVOP3PInsts()) {
639  MVT::v2i16, Legal);
640 
643  MVT::v2f16, Legal);
644 
646  Custom);
647 
651  Custom);
652 
653  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
654  // Split vector operations.
658  ISD::SSUBSAT},
659  VT, Custom);
660 
661  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
662  // Split vector operations.
664  VT, Custom);
665 
667  Custom);
668 
671 
672  if (Subtarget->hasPackedFP32Ops()) {
674  MVT::v2f32, Legal);
677  Custom);
678  }
679  }
680 
682 
683  if (Subtarget->has16BitInsts()) {
688  } else {
689  // Legalization hack.
691 
693  }
694 
698  Custom);
699 
701 
702  if (Subtarget->hasMad64_32())
704 
708  Custom);
709 
713  MVT::i16, MVT::i8},
714  Custom);
715 
719  MVT::i8},
720  Custom);
721 
724  ISD::SUB,
726  ISD::FADD,
727  ISD::FSUB,
728  ISD::FMINNUM,
729  ISD::FMAXNUM,
732  ISD::FMA,
733  ISD::SMIN,
734  ISD::SMAX,
735  ISD::UMIN,
736  ISD::UMAX,
737  ISD::SETCC,
738  ISD::AND,
739  ISD::OR,
740  ISD::XOR,
749 
750  // All memory operations. Some folding on the pointer operand is done to help
751  // matching the constant offsets in the addressing modes.
753  ISD::STORE,
772 
773  // FIXME: In other contexts we pretend this is a per-function property.
774  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
775 
777 }
778 
780  return Subtarget;
781 }
782 
783 //===----------------------------------------------------------------------===//
784 // TargetLowering queries
785 //===----------------------------------------------------------------------===//
786 
787 // v_mad_mix* support a conversion from f16 to f32.
788 //
// There is only one special case where this is OK to use when denormals are
// enabled, and we don't currently handle it.
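// For example, (f32 (fma (f32 (fpext f16:$x)), (f32 (fpext f16:$y)), f32:$z))
// can then be selected as a single mixed-precision FMA such as v_fma_mix_f32.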
791 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
792  EVT DestVT, EVT SrcVT) const {
793  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
794  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
795  DestVT.getScalarType() == MVT::f32 &&
796  SrcVT.getScalarType() == MVT::f16 &&
797  // TODO: This probably only requires no input flushing?
799 }
800 
801 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
802  LLT DestTy, LLT SrcTy) const {
803  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
804  (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
805  DestTy.getScalarSizeInBits() == 32 &&
806  SrcTy.getScalarSizeInBits() == 16 &&
807  // TODO: This probably only requires no input flushing?
808  !hasFP32Denormals(*MI.getMF());
809 }
810 
812  // SI has some legal vector types, but no legal vector operations. Say no
813  // shuffles are legal in order to prefer scalarizing some vector operations.
814  return false;
815 }
816 
819  EVT VT) const {
822 
823  if (VT.isVector()) {
824  EVT ScalarVT = VT.getScalarType();
825  unsigned Size = ScalarVT.getSizeInBits();
826  if (Size == 16) {
827  if (Subtarget->has16BitInsts())
828  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
829  return VT.isInteger() ? MVT::i32 : MVT::f32;
830  }
831 
832  if (Size < 16)
833  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
834  return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
835  }
836 
837  if (VT.getSizeInBits() > 32)
838  return MVT::i32;
839 
841 }
842 
845  EVT VT) const {
848 
849  if (VT.isVector()) {
850  unsigned NumElts = VT.getVectorNumElements();
851  EVT ScalarVT = VT.getScalarType();
852  unsigned Size = ScalarVT.getSizeInBits();
853 
854  // FIXME: Should probably promote 8-bit vectors to i16.
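    // With 16-bit instructions, 16-bit elements are packed two per 32-bit
    // register; e.g. a v5f16 value takes (5 + 1) / 2 = 3 registers.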
855  if (Size == 16 && Subtarget->has16BitInsts())
856  return (NumElts + 1) / 2;
857 
858  if (Size <= 32)
859  return NumElts;
860 
861  if (Size > 32)
862  return NumElts * ((Size + 31) / 32);
863  } else if (VT.getSizeInBits() > 32)
864  return (VT.getSizeInBits() + 31) / 32;
865 
867 }
868 
871  EVT VT, EVT &IntermediateVT,
872  unsigned &NumIntermediates, MVT &RegisterVT) const {
873  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
874  unsigned NumElts = VT.getVectorNumElements();
875  EVT ScalarVT = VT.getScalarType();
876  unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
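    // For example, with 16-bit instructions a v3f16 argument breaks down into
    // (3 + 1) / 2 = 2 v2f16 intermediate parts.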
880  if (Size == 16 && Subtarget->has16BitInsts()) {
881  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
882  IntermediateVT = RegisterVT;
883  NumIntermediates = (NumElts + 1) / 2;
884  return NumIntermediates;
885  }
886 
887  if (Size == 32) {
888  RegisterVT = ScalarVT.getSimpleVT();
889  IntermediateVT = RegisterVT;
890  NumIntermediates = NumElts;
891  return NumIntermediates;
892  }
893 
894  if (Size < 16 && Subtarget->has16BitInsts()) {
895  // FIXME: Should probably form v2i16 pieces
896  RegisterVT = MVT::i16;
897  IntermediateVT = ScalarVT;
898  NumIntermediates = NumElts;
899  return NumIntermediates;
900  }
901 
902 
903  if (Size != 16 && Size <= 32) {
904  RegisterVT = MVT::i32;
905  IntermediateVT = ScalarVT;
906  NumIntermediates = NumElts;
907  return NumIntermediates;
908  }
909 
910  if (Size > 32) {
911  RegisterVT = MVT::i32;
912  IntermediateVT = RegisterVT;
913  NumIntermediates = NumElts * ((Size + 31) / 32);
914  return NumIntermediates;
915  }
916  }
917 
919  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
920 }
921 
922 static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
923  assert(MaxNumLanes != 0);
924 
925  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
926  unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
927  return EVT::getVectorVT(Ty->getContext(),
928  EVT::getEVT(VT->getElementType()),
929  NumElts);
930  }
931 
932  return EVT::getEVT(Ty);
933 }
934 
935 // Peek through TFE struct returns to only use the data size.
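// For example, a TFE image load returning { <4 x float>, i32 } is treated as
// loading <4 x float> for the purpose of the memory VT.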
936 static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
937  auto *ST = dyn_cast<StructType>(Ty);
938  if (!ST)
939  return memVTFromLoadIntrData(Ty, MaxNumLanes);
940 
941  // TFE intrinsics return an aggregate type.
942  assert(ST->getNumContainedTypes() == 2 &&
943  ST->getContainedType(1)->isIntegerTy(32));
944  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
945 }
946 
948  const CallInst &CI,
949  MachineFunction &MF,
950  unsigned IntrID) const {
952  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
954 
955  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
956  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
958  (Intrinsic::ID)IntrID);
959  MemoryEffects ME = Attr.getMemoryEffects();
960  if (ME.doesNotAccessMemory())
961  return false;
962 
964 
965  const GCNTargetMachine &TM =
966  static_cast<const GCNTargetMachine &>(getTargetMachine());
967 
968  if (RsrcIntr->IsImage) {
969  Info.ptrVal = MFI->getImagePSV(TM);
970  Info.align.reset();
971  } else {
972  Info.ptrVal = MFI->getBufferPSV(TM);
973  }
974 
976  if (ME.onlyReadsMemory()) {
977  unsigned MaxNumLanes = 4;
978 
979  if (RsrcIntr->IsImage) {
982  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
983  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
984 
985  if (!BaseOpcode->Gather4) {
986  // If this isn't a gather, we may have excess loaded elements in the
987  // IR type. Check the dmask for the real number of elements loaded.
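          // For example, a dmask of 0b0101 enables two components, so only two
          // lanes are considered loaded here.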
988  unsigned DMask
989  = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
990  MaxNumLanes = DMask == 0 ? 1 : countPopulation(DMask);
991  }
992  }
993 
994  Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
995 
996  // FIXME: What does alignment mean for an image?
999  } else if (ME.onlyWritesMemory()) {
1000  Info.opc = ISD::INTRINSIC_VOID;
1001 
1002  Type *DataTy = CI.getArgOperand(0)->getType();
1003  if (RsrcIntr->IsImage) {
1004  unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1005  unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1006  Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1007  } else
1008  Info.memVT = EVT::getEVT(DataTy);
1009 
1011  } else {
1012  // Atomic
1013  Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1015  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1016  Info.flags |= MachineMemOperand::MOLoad |
1019 
1020  // XXX - Should this be volatile without known ordering?
1022 
1023  switch (IntrID) {
1024  default:
1025  break;
1026  case Intrinsic::amdgcn_raw_buffer_load_lds:
1027  case Intrinsic::amdgcn_struct_buffer_load_lds: {
1028  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1029  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1030  return true;
1031  }
1032  }
1033  }
1034  return true;
1035  }
1036 
1037  switch (IntrID) {
1038  case Intrinsic::amdgcn_atomic_inc:
1039  case Intrinsic::amdgcn_atomic_dec:
1040  case Intrinsic::amdgcn_ds_ordered_add:
1041  case Intrinsic::amdgcn_ds_ordered_swap:
1042  case Intrinsic::amdgcn_ds_fadd:
1043  case Intrinsic::amdgcn_ds_fmin:
1044  case Intrinsic::amdgcn_ds_fmax: {
1046  Info.memVT = MVT::getVT(CI.getType());
1047  Info.ptrVal = CI.getOperand(0);
1048  Info.align.reset();
1050 
1051  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1052  if (!Vol->isZero())
1054 
1055  return true;
1056  }
1057  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1059 
1060  const GCNTargetMachine &TM =
1061  static_cast<const GCNTargetMachine &>(getTargetMachine());
1062 
1064  Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1065  Info.ptrVal = MFI->getBufferPSV(TM);
1066  Info.align.reset();
1068 
1069  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1070  if (!Vol || !Vol->isZero())
1072 
1073  return true;
1074  }
1075  case Intrinsic::amdgcn_ds_append:
1076  case Intrinsic::amdgcn_ds_consume: {
1078  Info.memVT = MVT::getVT(CI.getType());
1079  Info.ptrVal = CI.getOperand(0);
1080  Info.align.reset();
1082 
1083  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1084  if (!Vol->isZero())
1086 
1087  return true;
1088  }
1089  case Intrinsic::amdgcn_global_atomic_csub: {
1091  Info.memVT = MVT::getVT(CI.getType());
1092  Info.ptrVal = CI.getOperand(0);
1093  Info.align.reset();
1094  Info.flags |= MachineMemOperand::MOLoad |
1097  return true;
1098  }
1099  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1102  Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1103 
1104  const GCNTargetMachine &TM =
1105  static_cast<const GCNTargetMachine &>(getTargetMachine());
1106 
1107  Info.ptrVal = MFI->getImagePSV(TM);
1108  Info.align.reset();
1109  Info.flags |= MachineMemOperand::MOLoad |
1111  return true;
1112  }
1113  case Intrinsic::amdgcn_global_atomic_fadd:
1114  case Intrinsic::amdgcn_global_atomic_fmin:
1115  case Intrinsic::amdgcn_global_atomic_fmax:
1116  case Intrinsic::amdgcn_flat_atomic_fadd:
1117  case Intrinsic::amdgcn_flat_atomic_fmin:
1118  case Intrinsic::amdgcn_flat_atomic_fmax:
1119  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1120  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1122  Info.memVT = MVT::getVT(CI.getType());
1123  Info.ptrVal = CI.getOperand(0);
1124  Info.align.reset();
1125  Info.flags |= MachineMemOperand::MOLoad |
1129  return true;
1130  }
1131  case Intrinsic::amdgcn_ds_gws_init:
1132  case Intrinsic::amdgcn_ds_gws_barrier:
1133  case Intrinsic::amdgcn_ds_gws_sema_v:
1134  case Intrinsic::amdgcn_ds_gws_sema_br:
1135  case Intrinsic::amdgcn_ds_gws_sema_p:
1136  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1137  Info.opc = ISD::INTRINSIC_VOID;
1138 
1139  const GCNTargetMachine &TM =
1140  static_cast<const GCNTargetMachine &>(getTargetMachine());
1141 
1143  Info.ptrVal = MFI->getGWSPSV(TM);
1144 
1145  // This is an abstract access, but we need to specify a type and size.
1146  Info.memVT = MVT::i32;
1147  Info.size = 4;
1148  Info.align = Align(4);
1149 
1150  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1152  else
1154  return true;
1155  }
1156  case Intrinsic::amdgcn_global_load_lds: {
1157  Info.opc = ISD::INTRINSIC_VOID;
1158  unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1159  Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1162  return true;
1163  }
1164  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1166 
1167  const GCNTargetMachine &TM =
1168  static_cast<const GCNTargetMachine &>(getTargetMachine());
1169 
1171  Info.ptrVal = MFI->getGWSPSV(TM);
1172 
1173  // This is an abstract access, but we need to specify a type and size.
1174  Info.memVT = MVT::i32;
1175  Info.size = 4;
1176  Info.align = Align(4);
1177 
1179  return true;
1180  }
1181  default:
1182  return false;
1183  }
1184 }
1185 
1188  Type *&AccessTy) const {
1189  switch (II->getIntrinsicID()) {
1190  case Intrinsic::amdgcn_atomic_inc:
1191  case Intrinsic::amdgcn_atomic_dec:
1192  case Intrinsic::amdgcn_ds_ordered_add:
1193  case Intrinsic::amdgcn_ds_ordered_swap:
1194  case Intrinsic::amdgcn_ds_append:
1195  case Intrinsic::amdgcn_ds_consume:
1196  case Intrinsic::amdgcn_ds_fadd:
1197  case Intrinsic::amdgcn_ds_fmin:
1198  case Intrinsic::amdgcn_ds_fmax:
1199  case Intrinsic::amdgcn_global_atomic_fadd:
1200  case Intrinsic::amdgcn_flat_atomic_fadd:
1201  case Intrinsic::amdgcn_flat_atomic_fmin:
1202  case Intrinsic::amdgcn_flat_atomic_fmax:
1203  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1204  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1205  case Intrinsic::amdgcn_global_atomic_csub: {
1206  Value *Ptr = II->getArgOperand(0);
1207  AccessTy = II->getType();
1208  Ops.push_back(Ptr);
1209  return true;
1210  }
1211  default:
1212  return false;
1213  }
1214 }
1215 
1216 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1217  if (!Subtarget->hasFlatInstOffsets()) {
1218  // Flat instructions do not have offsets, and only have the register
1219  // address.
1220  return AM.BaseOffs == 0 && AM.Scale == 0;
1221  }
1222 
1223  return AM.Scale == 0 &&
1224  (AM.BaseOffs == 0 ||
1225  Subtarget->getInstrInfo()->isLegalFLATOffset(
1227 }
1228 
1230  if (Subtarget->hasFlatGlobalInsts())
1231  return AM.Scale == 0 &&
1232  (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1235 
1236  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses
    // on VI.
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffers < 4GB.
1242  // It may be possible to support > 4GB buffers with MUBUF instructions,
1243  // by setting the stride value in the resource descriptor which would
1244  // increase the size limit to (stride * 4GB). However, this is risky,
1245  // because it has never been validated.
1246  return isLegalFlatAddressingMode(AM);
1247  }
1248 
1249  return isLegalMUBUFAddressingMode(AM);
1250 }
1251 
1252 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1253  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1254  // additionally can do r + r + i with addr64. 32-bit has more addressing
1255  // mode options. Depending on the resource constant, it can also do
1256  // (i64 r0) + (i32 r1) * (i14 i).
1257  //
1258  // Private arrays end up using a scratch buffer most of the time, so also
1259  // assume those use MUBUF instructions. Scratch loads / stores are currently
1260  // implemented as mubuf instructions with offen bit set, so slightly
1261  // different than the normal addr64.
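  // For example, a scratch access such as a[i] is typically selected as a
  // MUBUF instruction with the index in a VGPR (offen) plus an immediate byte
  // offset.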
1262  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1263  return false;
1264 
1265  // FIXME: Since we can split immediate into soffset and immediate offset,
1266  // would it make sense to allow any immediate?
1267 
1268  switch (AM.Scale) {
1269  case 0: // r + i or just i, depending on HasBaseReg.
1270  return true;
1271  case 1:
1272  return true; // We have r + r or r + i.
1273  case 2:
1274  if (AM.HasBaseReg) {
1275  // Reject 2 * r + r.
1276  return false;
1277  }
1278 
1279  // Allow 2 * r as r + r
1280  // Or 2 * r + i is allowed as r + r + i.
1281  return true;
1282  default: // Don't allow n * r
1283  return false;
1284  }
1285 }
1286 
1288  const AddrMode &AM, Type *Ty,
1289  unsigned AS, Instruction *I) const {
1290  // No global is ever allowed as a base.
1291  if (AM.BaseGV)
1292  return false;
1293 
1294  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1295  return isLegalGlobalAddressingMode(AM);
1296 
1297  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1300  // If the offset isn't a multiple of 4, it probably isn't going to be
1301  // correctly aligned.
1302  // FIXME: Can we get the real alignment here?
1303  if (AM.BaseOffs % 4 != 0)
1304  return isLegalMUBUFAddressingMode(AM);
1305 
1306  // There are no SMRD extloads, so if we have to do a small type access we
1307  // will use a MUBUF load.
1308  // FIXME?: We also need to do this if unaligned, but we don't know the
1309  // alignment here.
1310  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1311  return isLegalGlobalAddressingMode(AM);
1312 
1313  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1314  // SMRD instructions have an 8-bit, dword offset on SI.
1315  if (!isUInt<8>(AM.BaseOffs / 4))
1316  return false;
1317  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1318  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1319  // in 8-bits, it can use a smaller encoding.
1320  if (!isUInt<32>(AM.BaseOffs / 4))
1321  return false;
1322  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1323  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1324  if (!isUInt<20>(AM.BaseOffs))
1325  return false;
1326  } else
1327  llvm_unreachable("unhandled generation");
1328 
1329  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1330  return true;
1331 
1332  if (AM.Scale == 1 && AM.HasBaseReg)
1333  return true;
1334 
1335  return false;
1336 
1337  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1338  return isLegalMUBUFAddressingMode(AM);
1339  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1340  AS == AMDGPUAS::REGION_ADDRESS) {
1341  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1342  // field.
1343  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1344  // an 8-bit dword offset but we don't know the alignment here.
1345  if (!isUInt<16>(AM.BaseOffs))
1346  return false;
1347 
1348  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1349  return true;
1350 
1351  if (AM.Scale == 1 && AM.HasBaseReg)
1352  return true;
1353 
1354  return false;
1355  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1357  // For an unknown address space, this usually means that this is for some
1358  // reason being used for pure arithmetic, and not based on some addressing
1359  // computation. We don't have instructions that compute pointers with any
1360  // addressing modes, so treat them as having no offset like flat
1361  // instructions.
1362  return isLegalFlatAddressingMode(AM);
1363  }
1364 
1365  // Assume a user alias of global for unknown address spaces.
1366  return isLegalGlobalAddressingMode(AM);
1367 }
1368 
1369 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1370  const MachineFunction &MF) const {
1371  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1372  return (MemVT.getSizeInBits() <= 4 * 32);
1373  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1374  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1375  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1376  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1377  return (MemVT.getSizeInBits() <= 2 * 32);
1378  }
1379  return true;
1380 }
1381 
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1385  if (IsFast)
1386  *IsFast = 0;
1387 
1388  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1389  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1390  // Check if alignment requirements for ds_read/write instructions are
1391  // disabled.
1392  if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1393  return false;
1394 
1395  Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1396  if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1397  Alignment < RequiredAlignment)
1398  return false;
1399 
    // Either the alignment requirements are "enabled", or there is an
    // unaligned LDS access related hardware bug even though alignment
    // requirements are "disabled". In either case, we need to check for proper
    // alignment requirements.
1404  //
1405  switch (Size) {
1406  case 64:
1407  // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1408  // address is negative, then the instruction is incorrectly treated as
1409  // out-of-bounds even if base + offsets is in bounds. Split vectorized
1410  // loads here to avoid emitting ds_read2_b32. We may re-combine the
1411  // load later in the SILoadStoreOptimizer.
1412  if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1413  return false;
1414 
      // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but
      // we can do a 4 byte aligned, 8 byte access in a single operation using
      // ds_read2/write2_b32 with adjacent offsets.
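      // For example, an 8 byte load at a 4 byte aligned address can be
      // selected as ds_read2_b32 with two adjacent dword offsets instead of a
      // single ds_read_b64.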
1418  RequiredAlignment = Align(4);
1419 
1420  if (Subtarget->hasUnalignedDSAccessEnabled()) {
1421  // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1422  // ds_write2_b32 depending on the alignment. In either case with either
1423  // alignment there is no faster way of doing this.
1424  if (IsFast)
1425  *IsFast = 1;
1426  return true;
1427  }
1428 
1429  break;
1430  case 96:
1431  if (!Subtarget->hasDS96AndDS128())
1432  return false;
1433 
      // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
      // gfx8 and older.
1436 
1437  if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // equally slow as a single ds_read_b96/ds_write_b96, but there will be
        // more of them, so overall we will pay less penalty issuing a single
        // instruction.
1443  if (IsFast)
1444  *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1445  return true;
1446  }
1447 
1448  break;
1449  case 128:
1450  if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1451  return false;
1452 
      // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
      // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
      // single operation using ds_read2/write2_b64.
1456  RequiredAlignment = Align(8);
1457 
1458  if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // equally slow as a single ds_read_b128/ds_write_b128, but there will
        // be more of them, so overall we will pay less penalty issuing a single
        // instruction.
        if (IsFast)
          *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1466  return true;
1467  }
1468 
1469  break;
1470  default:
1471  if (Size > 32)
1472  return false;
1473 
1474  break;
1475  }
1476 
1477  if (IsFast)
1478  *IsFast = Alignment >= RequiredAlignment;
1479 
1480  return Alignment >= RequiredAlignment ||
1481  Subtarget->hasUnalignedDSAccessEnabled();
1482  }
1483 
1484  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1485  bool AlignedBy4 = Alignment >= Align(4);
1486  if (IsFast)
1487  *IsFast = AlignedBy4;
1488 
1489  return AlignedBy4 ||
1490  Subtarget->enableFlatScratch() ||
1491  Subtarget->hasUnalignedScratchAccess();
1492  }
1493 
1494  // FIXME: We have to be conservative here and assume that flat operations
1495  // will access scratch. If we had access to the IR function, then we
1496  // could determine if any private memory was used in the function.
1497  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1498  !Subtarget->hasUnalignedScratchAccess()) {
1499  bool AlignedBy4 = Alignment >= Align(4);
1500  if (IsFast)
1501  *IsFast = AlignedBy4;
1502 
1503  return AlignedBy4;
1504  }
1505 
1506  if (Subtarget->hasUnalignedBufferAccessEnabled()) {
1507  // If we have a uniform constant load, it still requires using a slow
1508  // buffer instruction if unaligned.
1509  if (IsFast) {
1510  // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1511  // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1512  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1513  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1514  Alignment >= Align(4) : Alignment != Align(2);
1515  }
1516 
1517  return true;
1518  }
1519 
1520  // Smaller than dword value must be aligned.
1521  if (Size < 32)
1522  return false;
1523 
1524  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1525  // byte-address are ignored, thus forcing Dword alignment.
1526  // This applies to private, global, and constant memory.
1527  if (IsFast)
1528  *IsFast = 1;
1529 
1530  return Size >= 32 && Alignment >= Align(4);
1531 }
1532 
bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *IsFast) const {
  bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
                                                  Alignment, Flags, IsFast);
1538 
1539  if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
1540  (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1541  AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
    // Lie that it is fast if +unaligned-access-mode is passed so that DS
    // accesses get vectorized. We could use ds_read2_b*/ds_write2_b*
    // instructions on misaligned data, which is faster than a pair of
    // ds_read_b*/ds_write_b* which would be equally misaligned.
    // This is only used by the common passes; selection always calls the
    // allowsMisalignedMemoryAccessesImpl version.
    *IsFast = 1;
1549  }
1550 
1551  return Allow;
1552 }
1553 
1555  const MemOp &Op, const AttributeList &FuncAttributes) const {
1556  // FIXME: Should account for address space here.
1557 
1558  // The default fallback uses the private pointer size as a guess for a type to
1559  // use. Make sure we switch these to 64-bit accesses.
1560 
1561  if (Op.size() >= 16 &&
1562  Op.isDstAligned(Align(4))) // XXX: Should only do for global
1563  return MVT::v4i32;
1564 
1565  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1566  return MVT::v2i32;
1567 
1568  // Use the default.
1569  return MVT::Other;
1570 }
1571 
1573  const MemSDNode *MemNode = cast<MemSDNode>(N);
1574  return MemNode->getMemOperand()->getFlags() & MONoClobber;
1575 }
1576 
1578  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1580 }
1581 
1583  unsigned DestAS) const {
1584  // Flat -> private/local is a simple truncate.
1585  // Flat -> global is no-op
1586  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1587  return true;
1588 
1589  const GCNTargetMachine &TM =
1590  static_cast<const GCNTargetMachine &>(getTargetMachine());
1591  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1592 }
1593 
1595  const MemSDNode *MemNode = cast<MemSDNode>(N);
1596 
1597  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1598 }
1599 
1602  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1606 }
1607 
1609  Type *Ty) const {
1610  // FIXME: Could be smarter if called for vector constants.
1611  return true;
1612 }
1613 
1615  unsigned Index) const {
1617  return false;
1618 
1619  // TODO: Add more cases that are cheap.
1620  return Index == 0;
1621 }
1622 
1624  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1625  switch (Op) {
1626  case ISD::LOAD:
1627  case ISD::STORE:
1628 
1629  // These operations are done with 32-bit instructions anyway.
1630  case ISD::AND:
1631  case ISD::OR:
1632  case ISD::XOR:
1633  case ISD::SELECT:
1634  // TODO: Extensions?
1635  return true;
1636  default:
1637  return false;
1638  }
1639  }
1640 
1641  // SimplifySetCC uses this function to determine whether or not it should
1642  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1643  if (VT == MVT::i1 && Op == ISD::SETCC)
1644  return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}
1648 
1649 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1650  const SDLoc &SL,
1651  SDValue Chain,
1652  uint64_t Offset) const {
1653  const DataLayout &DL = DAG.getDataLayout();
1654  MachineFunction &MF = DAG.getMachineFunction();
1656 
1657  const ArgDescriptor *InputPtrReg;
1658  const TargetRegisterClass *RC;
1659  LLT ArgTy;
1661 
1662  std::tie(InputPtrReg, RC, ArgTy) =
1664 
1665  // We may not have the kernarg segment argument if we have no kernel
1666  // arguments.
1667  if (!InputPtrReg)
1668  return DAG.getConstant(0, SL, PtrVT);
1669 
1671  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1672  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1673 
1674  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1675 }
1676 
1677 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1678  const SDLoc &SL) const {
1680  FIRST_IMPLICIT);
1681  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1682 }
1683 
1684 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1685  const SDLoc &SL) const {
1686 
1688  Optional<uint32_t> KnownSize =
1690  if (KnownSize.has_value())
1691  return DAG.getConstant(KnownSize.value(), SL, MVT::i32);
1692  return SDValue();
1693 }
1694 
1695 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1696  const SDLoc &SL, SDValue Val,
1697  bool Signed,
1698  const ISD::InputArg *Arg) const {
1699  // First, if it is a widened vector, narrow it.
1700  if (VT.isVector() &&
1701  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1702  EVT NarrowedVT =
1704  VT.getVectorNumElements());
1705  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1706  DAG.getConstant(0, SL, MVT::i32));
1707  }
1708 
1709  // Then convert the vector elements or scalar value.
1710  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1711  VT.bitsLT(MemVT)) {
1712  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1713  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1714  }
1715 
1716  if (MemVT.isFloatingPoint())
1717  Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1718  else if (Signed)
1719  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1720  else
1721  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1722 
1723  return Val;
1724 }
1725 
1726 SDValue SITargetLowering::lowerKernargMemParameter(
1727  SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1728  uint64_t Offset, Align Alignment, bool Signed,
1729  const ISD::InputArg *Arg) const {
1731 
1732  // Try to avoid using an extload by loading earlier than the argument address,
1733  // and extracting the relevant bits. The load should hopefully be merged with
1734  // the previous argument.
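  // For example, an i16 argument at byte offset 2 is handled by loading the
  // aligned i32 at offset 0, shifting right by 16, and truncating to i16.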
1735  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1736  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1737  int64_t AlignDownOffset = alignDown(Offset, 4);
1738  int64_t OffsetDiff = Offset - AlignDownOffset;
1739 
1740  EVT IntVT = MemVT.changeTypeToInteger();
1741 
1742  // TODO: If we passed in the base kernel offset we could have a better
1743  // alignment than 4, but we don't really need it.
1744  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1745  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1748 
1749  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1750  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1751 
1752  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1753  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1754  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1755 
1756 
1757  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1758  }
1759 
1760  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1761  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1764 
1765  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1766  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1767 }
1768 
1769 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1770  const SDLoc &SL, SDValue Chain,
1771  const ISD::InputArg &Arg) const {
1772  MachineFunction &MF = DAG.getMachineFunction();
1773  MachineFrameInfo &MFI = MF.getFrameInfo();
1774 
1775  if (Arg.Flags.isByVal()) {
1776  unsigned Size = Arg.Flags.getByValSize();
1777  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1778  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1779  }
1780 
1781  unsigned ArgOffset = VA.getLocMemOffset();
1782  unsigned ArgSize = VA.getValVT().getStoreSize();
1783 
1784  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1785 
1786  // Create load nodes to retrieve arguments from the stack.
1787  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1788  SDValue ArgValue;
1789 
  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1792  MVT MemVT = VA.getValVT();
1793 
1794  switch (VA.getLocInfo()) {
1795  default:
1796  break;
1797  case CCValAssign::BCvt:
1798  MemVT = VA.getLocVT();
1799  break;
1800  case CCValAssign::SExt:
1801  ExtType = ISD::SEXTLOAD;
1802  break;
1803  case CCValAssign::ZExt:
1804  ExtType = ISD::ZEXTLOAD;
1805  break;
1806  case CCValAssign::AExt:
1807  ExtType = ISD::EXTLOAD;
1808  break;
1809  }
1810 
1811  ArgValue = DAG.getExtLoad(
1812  ExtType, SL, VA.getLocVT(), Chain, FIN,
1814  MemVT);
1815  return ArgValue;
1816 }
1817 
1818 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1819  const SIMachineFunctionInfo &MFI,
1820  EVT VT,
1822  const ArgDescriptor *Reg;
1823  const TargetRegisterClass *RC;
1824  LLT Ty;
1825 
1826  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1827  if (!Reg) {
1828  if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
1829  // It's possible for a kernarg intrinsic call to appear in a kernel with
1830  // no allocated segment, in which case we do not add the user sgpr
1831  // argument, so just return null.
1832  return DAG.getConstant(0, SDLoc(), VT);
1833  }
1834 
1835  // It's undefined behavior if a function marked with the amdgpu-no-*
1836  // attributes uses the corresponding intrinsic.
1837  return DAG.getUNDEF(VT);
1838  }
1839 
1840  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1841 }
1842 
1844  CallingConv::ID CallConv,
1846  FunctionType *FType,
1848  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1849  const ISD::InputArg *Arg = &Ins[I];
1850 
1851  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1852  "vector type argument should have been split");
1853 
1854  // First check if it's a PS input addr.
1855  if (CallConv == CallingConv::AMDGPU_PS &&
1856  !Arg->Flags.isInReg() && PSInputNum <= 15) {
1857  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1858 
1859  // Inconveniently only the first part of the split is marked as isSplit,
1860  // so skip to the end. We only want to increment PSInputNum once for the
1861  // entire split argument.
1862  if (Arg->Flags.isSplit()) {
1863  while (!Arg->Flags.isSplitEnd()) {
1864  assert((!Arg->VT.isVector() ||
1865  Arg->VT.getScalarSizeInBits() == 16) &&
1866  "unexpected vector split in ps argument type");
1867  if (!SkipArg)
1868  Splits.push_back(*Arg);
1869  Arg = &Ins[++I];
1870  }
1871  }
1872 
1873  if (SkipArg) {
1874  // We can safely skip PS inputs.
1875  Skipped.set(Arg->getOrigArgIndex());
1876  ++PSInputNum;
1877  continue;
1878  }
1879 
1880  Info->markPSInputAllocated(PSInputNum);
1881  if (Arg->Used)
1882  Info->markPSInputEnabled(PSInputNum);
1883 
1884  ++PSInputNum;
1885  }
1886 
1887  Splits.push_back(*Arg);
1888  }
1889 }
1890 
1891 // Allocate special inputs passed in VGPRs.
1893  MachineFunction &MF,
1894  const SIRegisterInfo &TRI,
1895  SIMachineFunctionInfo &Info) const {
1896  const LLT S32 = LLT::scalar(32);
1898 
1899  if (Info.hasWorkItemIDX()) {
1900  Register Reg = AMDGPU::VGPR0;
1901  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1902 
1903  CCInfo.AllocateReg(Reg);
1904  unsigned Mask = (Subtarget->hasPackedTID() &&
1905  Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1906  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1907  }
1908 
1909  if (Info.hasWorkItemIDY()) {
1910  assert(Info.hasWorkItemIDX());
1911  if (Subtarget->hasPackedTID()) {
1912  Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1913  0x3ff << 10));
1914  } else {
1915  unsigned Reg = AMDGPU::VGPR1;
1916  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1917 
1918  CCInfo.AllocateReg(Reg);
1919  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1920  }
1921  }
1922 
1923  if (Info.hasWorkItemIDZ()) {
1924  assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1925  if (Subtarget->hasPackedTID()) {
1926  Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1927  0x3ff << 20));
1928  } else {
1929  unsigned Reg = AMDGPU::VGPR2;
1930  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1931 
1932  CCInfo.AllocateReg(Reg);
1933  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1934  }
1935  }
1936 }
1937 
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left, allocate a stack slot.
// If \p Mask is given it indicates the bitfield position in the register.
// If \p Arg is given use it with the new \p Mask instead of allocating a new one.
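// For example, the workitem ID Y input can reuse the VGPR already allocated
// for ID X by passing Mask = 0x3ff << 10 along with the existing \p Arg.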
1942 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1944  if (Arg.isSet())
1946 
1947  ArrayRef<MCPhysReg> ArgVGPRs
1948  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1949  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1950  if (RegIdx == ArgVGPRs.size()) {
1951  // Spill to stack required.
1952  int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1953 
1954  return ArgDescriptor::createStack(Offset, Mask);
1955  }
1956 
1957  unsigned Reg = ArgVGPRs[RegIdx];
1958  Reg = CCInfo.AllocateReg(Reg);
1959  assert(Reg != AMDGPU::NoRegister);
1960 
1961  MachineFunction &MF = CCInfo.getMachineFunction();
1962  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1963  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1965 }
1966 
1968  const TargetRegisterClass *RC,
1969  unsigned NumArgRegs) {
1970  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1971  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1972  if (RegIdx == ArgSGPRs.size())
1973  report_fatal_error("ran out of SGPRs for arguments");
1974 
1975  unsigned Reg = ArgSGPRs[RegIdx];
1976  Reg = CCInfo.AllocateReg(Reg);
1977  assert(Reg != AMDGPU::NoRegister);
1978 
1979  MachineFunction &MF = CCInfo.getMachineFunction();
1980  MF.addLiveIn(Reg, RC);
1982 }
1983 
1984 // If this has a fixed position, we still should allocate the register in the
1985 // CCInfo state. Technically we could get away with this for values passed
1986 // outside of the normal argument range.
1988  const TargetRegisterClass *RC,
1989  MCRegister Reg) {
1990  Reg = CCInfo.AllocateReg(Reg);
1991  assert(Reg != AMDGPU::NoRegister);
1992  MachineFunction &MF = CCInfo.getMachineFunction();
1993  MF.addLiveIn(Reg, RC);
1994 }
1995 
1997  if (Arg) {
1998  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
1999  Arg.getRegister());
2000  } else
2001  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2002 }
2003 
2005  if (Arg) {
2006  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2007  Arg.getRegister());
2008  } else
2009  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2010 }
2011 
2012 /// Allocate implicit function VGPR arguments at the end of allocated user
2013 /// arguments.
2015  CCState &CCInfo, MachineFunction &MF,
2016  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2017  const unsigned Mask = 0x3ff;
2019 
2020  if (Info.hasWorkItemIDX()) {
2021  Arg = allocateVGPR32Input(CCInfo, Mask);
2022  Info.setWorkItemIDX(Arg);
2023  }
2024 
2025  if (Info.hasWorkItemIDY()) {
2026  Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2027  Info.setWorkItemIDY(Arg);
2028  }
2029 
2030  if (Info.hasWorkItemIDZ())
2031  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2032 }
2033 
2034 /// Allocate implicit function VGPR arguments in fixed registers.
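/// All three workitem IDs are packed into VGPR31 here: X in bits 0..9, Y in
/// bits 10..19 and Z in bits 20..29.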
2036  CCState &CCInfo, MachineFunction &MF,
2037  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2038  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2039  if (!Reg)
    report_fatal_error("failed to allocate VGPR for implicit arguments");
2041 
2042  const unsigned Mask = 0x3ff;
2043  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2044  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2045  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2046 }
2047 
2049  CCState &CCInfo,
2050  MachineFunction &MF,
2051  const SIRegisterInfo &TRI,
2052  SIMachineFunctionInfo &Info) const {
2053  auto &ArgInfo = Info.getArgInfo();
2054 
2055  // TODO: Unify handling with private memory pointers.
2056  if (Info.hasDispatchPtr())
2057  allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2058 
2059  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
2060  allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2061 
2062  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2063  // constant offset from the kernarg segment.
2064  if (Info.hasImplicitArgPtr())
2065  allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2066 
2067  if (Info.hasDispatchID())
2068  allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2069 
2070  // flat_scratch_init is not applicable for non-kernel functions.
2071 
2072  if (Info.hasWorkGroupIDX())
2073  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2074 
2075  if (Info.hasWorkGroupIDY())
2076  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2077 
2078  if (Info.hasWorkGroupIDZ())
2079  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2080 
2081  if (Info.hasLDSKernelId())
2082  allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2083 }
2084 
2085 // Allocate special inputs passed in user SGPRs.
2087  MachineFunction &MF,
2088  const SIRegisterInfo &TRI,
2089  SIMachineFunctionInfo &Info) const {
2090  if (Info.hasImplicitBufferPtr()) {
2091  Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2092  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2093  CCInfo.AllocateReg(ImplicitBufferPtrReg);
2094  }
2095 
2096  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2097  if (Info.hasPrivateSegmentBuffer()) {
2098  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2099  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2100  CCInfo.AllocateReg(PrivateSegmentBufferReg);
2101  }
2102 
2103  if (Info.hasDispatchPtr()) {
2104  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2105  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2106  CCInfo.AllocateReg(DispatchPtrReg);
2107  }
2108 
2109  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
2110  Register QueuePtrReg = Info.addQueuePtr(TRI);
2111  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2112  CCInfo.AllocateReg(QueuePtrReg);
2113  }
2114 
2115  if (Info.hasKernargSegmentPtr()) {
2117  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2118  CCInfo.AllocateReg(InputPtrReg);
2119 
2120  Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2122  }
2123 
2124  if (Info.hasDispatchID()) {
2125  Register DispatchIDReg = Info.addDispatchID(TRI);
2126  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2127  CCInfo.AllocateReg(DispatchIDReg);
2128  }
2129 
2130  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2131  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2132  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2133  CCInfo.AllocateReg(FlatScratchInitReg);
2134  }
2135 
2136  if (Info.hasLDSKernelId()) {
2137  Register Reg = Info.addLDSKernelId();
2138  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2139  CCInfo.AllocateReg(Reg);
2140  }
2141 
2142  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2143  // these from the dispatch pointer.
2144 }
2145 
2146 // Allocate special input registers that are initialized per-wave.
2148  MachineFunction &MF,
2150  CallingConv::ID CallConv,
2151  bool IsShader) const {
2152  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2153  // Note: user SGPRs are handled by the front-end for graphics shaders
2154  // Pad up the used user SGPRs with dead inputs.
2155  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2156 
2157  // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2158  // rely on it to reach 16 since if we end up having no stack usage, it will
2159  // not really be added.
2160  unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2161  Info.hasWorkGroupIDY() +
2162  Info.hasWorkGroupIDZ() +
2163  Info.hasWorkGroupInfo();
2164  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2165  Register Reg = Info.addReservedUserSGPR();
2166  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2167  CCInfo.AllocateReg(Reg);
2168  }
2169  }
2170 
2171  if (Info.hasWorkGroupIDX()) {
2172  Register Reg = Info.addWorkGroupIDX();
2173  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2174  CCInfo.AllocateReg(Reg);
2175  }
2176 
2177  if (Info.hasWorkGroupIDY()) {
2178  Register Reg = Info.addWorkGroupIDY();
2179  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2180  CCInfo.AllocateReg(Reg);
2181  }
2182 
2183  if (Info.hasWorkGroupIDZ()) {
2184  Register Reg = Info.addWorkGroupIDZ();
2185  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2186  CCInfo.AllocateReg(Reg);
2187  }
2188 
2189  if (Info.hasWorkGroupInfo()) {
2190  Register Reg = Info.addWorkGroupInfo();
2191  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2192  CCInfo.AllocateReg(Reg);
2193  }
2194 
2195  if (Info.hasPrivateSegmentWaveByteOffset()) {
2196  // Scratch wave offset passed in system SGPR.
2197  unsigned PrivateSegmentWaveByteOffsetReg;
2198 
2199  if (IsShader) {
2200  PrivateSegmentWaveByteOffsetReg =
2201  Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2202 
2203  // This is true if the scratch wave byte offset doesn't have a fixed
2204  // location.
2205  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2206  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2207  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2208  }
2209  } else
2210  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2211 
2212  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2213  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2214  }
2215 
2216  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2217  Info.getNumPreloadedSGPRs() >= 16);
2218 }
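// The system SGPRs handled above (workgroup IDs, workgroup info, and the
// private segment wave byte offset) follow the user SGPRs, so a kernel that
// only needs the X workgroup ID receives it in the first SGPR after its user
// inputs. The precise register therefore depends on how many user SGPRs
// allocateHSAUserSGPRs handed out (an illustrative description, not a fixed
// assignment).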
2219 
2220 void SITargetLowering::reservePrivateMemoryRegs(const TargetMachine &TM,
2221  MachineFunction &MF,
2222  const SIRegisterInfo &TRI,
2223  SIMachineFunctionInfo &Info) const {
2224  // Now that we've figured out where the scratch register inputs are, see if
2225  // we should reserve the arguments and use them directly.
2226  MachineFrameInfo &MFI = MF.getFrameInfo();
2227  bool HasStackObjects = MFI.hasStackObjects();
2228  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2229 
2230  // Record that we know we have non-spill stack objects so we don't need to
2231  // check all stack objects later.
2232  if (HasStackObjects)
2233  Info.setHasNonSpillStackObjects(true);
2234 
2235  // Everything live out of a block is spilled with fast regalloc, so it's
2236  // almost certain that spilling will be required.
2237  if (TM.getOptLevel() == CodeGenOpt::None)
2238  HasStackObjects = true;
2239 
2240  // For now assume stack access is needed in any callee functions, so we need
2241  // the scratch registers to pass in.
2242  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2243 
2244  if (!ST.enableFlatScratch()) {
2245  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2246  // If we have stack objects, we unquestionably need the private buffer
2247  // resource. For the Code Object V2 ABI, this will be the first 4 user
2248  // SGPR inputs. We can reserve those and use them directly.
2249 
2250  Register PrivateSegmentBufferReg =
2251  Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2252  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2253  } else {
2254  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2255  // We tentatively reserve the last registers (skipping those which may
2256  // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2257  // we'll replace these with the ones immediately after those which were
2258  // really allocated. In the prologue copies will be inserted from the
2259  // argument to these reserved registers.
2260 
2261  // Without HSA, relocations are used for the scratch pointer and the
2262  // buffer resource setup is always inserted in the prologue. Scratch wave
2263  // offset is still in an input SGPR.
2264  Info.setScratchRSrcReg(ReservedBufferReg);
2265  }
2266  }
2267 
2268  MachineRegisterInfo &MRI = MF.getRegInfo();
2269 
2270  // For entry functions we have to set up the stack pointer if we use it,
2271  // whereas non-entry functions get this "for free". This means there is no
2272  // intrinsic advantage to using S32 over S34 in cases where we do not have
2273  // calls but do need a frame pointer (i.e. if we are requested to have one
2274  // because frame pointer elimination is disabled). To keep things simple we
2275  // only ever use S32 as the call ABI stack pointer, and so using it does not
2276  // imply we need a separate frame pointer.
2277  //
2278  // Try to use s32 as the SP, but move it if it would interfere with input
2279  // arguments. This won't work with calls though.
2280  //
2281  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2282  // registers.
2283  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2284  Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2285  } else {
2286  assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2287 
2288  if (MFI.hasCalls())
2289  report_fatal_error("call in graphics shader with too many input SGPRs");
2290 
2291  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2292  if (!MRI.isLiveIn(Reg)) {
2293  Info.setStackPtrOffsetReg(Reg);
2294  break;
2295  }
2296  }
2297 
2298  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2299  report_fatal_error("failed to find register for SP");
2300  }
2301 
2302  // hasFP should be accurate for entry functions even before the frame is
2303  // finalized, because it does not rely on the known stack size, only
2304  // properties like whether variable sized objects are present.
2305  if (ST.getFrameLowering()->hasFP(MF)) {
2306  Info.setFrameOffsetReg(AMDGPU::SGPR33);
2307  }
2308 }
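// Example outcome (illustrative, depends on the subtarget and inputs): for an
// HSA kernel with stack objects and no conflicting live-in arguments, the code
// above typically keeps the preloaded private segment buffer as the scratch
// RSrc, picks SGPR32 as the stack pointer, and uses SGPR33 as the frame
// pointer when one is required.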
2309 
2310 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2311  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2312  return !Info->isEntryFunction();
2313 }
2314 
2315 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2316 
2317 }
2318 
2319 void SITargetLowering::insertCopiesSplitCSR(
2320  MachineBasicBlock *Entry,
2321  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2322  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2323 
2324  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2325  if (!IStart)
2326  return;
2327 
2328  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2329  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2330  MachineBasicBlock::iterator MBBI = Entry->begin();
2331  for (const MCPhysReg *I = IStart; *I; ++I) {
2332  const TargetRegisterClass *RC = nullptr;
2333  if (AMDGPU::SReg_64RegClass.contains(*I))
2334  RC = &AMDGPU::SGPR_64RegClass;
2335  else if (AMDGPU::SReg_32RegClass.contains(*I))
2336  RC = &AMDGPU::SGPR_32RegClass;
2337  else
2338  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2339 
2340  Register NewVR = MRI->createVirtualRegister(RC);
2341  // Create copy from CSR to a virtual register.
2342  Entry->addLiveIn(*I);
2343  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2344  .addReg(*I);
2345 
2346  // Insert the copy-back instructions right before the terminator.
2347  for (auto *Exit : Exits)
2348  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2349  TII->get(TargetOpcode::COPY), *I)
2350  .addReg(NewVR);
2351  }
2352 }
2353 
2354 SDValue SITargetLowering::LowerFormalArguments(
2355  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2356  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2357  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2358  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2359 
2360  MachineFunction &MF = DAG.getMachineFunction();
2361  const Function &Fn = MF.getFunction();
2362  FunctionType *FType = MF.getFunction().getFunctionType();
2363  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2364 
2365  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2366  DiagnosticInfoUnsupported NoGraphicsHSA(
2367  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2368  DAG.getContext()->diagnose(NoGraphicsHSA);
2369  return DAG.getEntryNode();
2370  }
2371 
2372  Info->allocateKnownAddressLDSGlobal(Fn);
2373 
2376  BitVector Skipped(Ins.size());
2377  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2378  *DAG.getContext());
2379 
2380  bool IsGraphics = AMDGPU::isGraphics(CallConv);
2381  bool IsKernel = AMDGPU::isKernel(CallConv);
2382  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2383 
2384  if (IsGraphics) {
2385  assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2386  (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2387  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2388  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2389  !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2390  !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2391  }
2392 
2393  if (CallConv == CallingConv::AMDGPU_PS) {
2394  processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2395 
2396  // At least one interpolation mode must be enabled or else the GPU will
2397  // hang.
2398  //
2399  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2400  // set PSInputAddr, the user wants to enable some bits after the compilation
2401  // based on run-time states. Since we can't know what the final PSInputEna
2402  // will look like, so we shouldn't do anything here and the user should take
2403  // will look like, we shouldn't do anything here, and the user should take
2404  //
2405  // Otherwise, the following restrictions apply:
2406  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2407  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2408  // enabled too.
2409  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2410  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2411  CCInfo.AllocateReg(AMDGPU::VGPR0);
2412  CCInfo.AllocateReg(AMDGPU::VGPR1);
2413  Info->markPSInputAllocated(0);
2414  Info->markPSInputEnabled(0);
2415  }
2416  if (Subtarget->isAmdPalOS()) {
2417  // For isAmdPalOS, the user does not enable some bits after compilation
2418  // based on run-time states; the register values being generated here are
2419  // the final ones set in hardware. Therefore we need to apply the
2420  // workaround to PSInputAddr and PSInputEnable together. (The case where
2421  // a bit is set in PSInputAddr but not PSInputEnable is where the
2422  // frontend set up an input arg for a particular interpolation mode, but
2423  // nothing uses that input arg. Really we should have an earlier pass
2424  // that removes such an arg.)
2425  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2426  if ((PsInputBits & 0x7F) == 0 ||
2427  ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2428  Info->markPSInputEnabled(
2429  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2430  }
2431  } else if (IsKernel) {
2432  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2433  } else {
2434  Splits.append(Ins.begin(), Ins.end());
2435  }
2436 
2437  if (IsEntryFunc) {
2438  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2439  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2440  } else if (!IsGraphics) {
2441  // For the fixed ABI, pass workitem IDs in the last argument register.
2442  allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2443  }
2444 
2445  if (IsKernel) {
2446  analyzeFormalArgumentsCompute(CCInfo, Ins);
2447  } else {
2448  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2449  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2450  }
2451 
2452  SmallVector<SDValue, 16> Chains;
2453 
2454  // FIXME: This is the minimum kernel argument alignment. We should improve
2455  // this to the maximum alignment of the arguments.
2456  //
2457  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2458  // kern arg offset.
2459  const Align KernelArgBaseAlign = Align(16);
2460 
2461  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2462  const ISD::InputArg &Arg = Ins[i];
2463  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2464  InVals.push_back(DAG.getUNDEF(Arg.VT));
2465  continue;
2466  }
2467 
2468  CCValAssign &VA = ArgLocs[ArgIdx++];
2469  MVT VT = VA.getLocVT();
2470 
2471  if (IsEntryFunc && VA.isMemLoc()) {
2472  VT = Ins[i].VT;
2473  EVT MemVT = VA.getLocVT();
2474 
2475  const uint64_t Offset = VA.getLocMemOffset();
2476  Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2477 
2478  if (Arg.Flags.isByRef()) {
2479  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2480 
2481  const GCNTargetMachine &TM =
2482  static_cast<const GCNTargetMachine &>(getTargetMachine());
2483  if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2484  Arg.Flags.getPointerAddrSpace())) {
2485  Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2486  Arg.Flags.getPointerAddrSpace());
2487  }
2488 
2489  InVals.push_back(Ptr);
2490  continue;
2491  }
2492 
2493  SDValue Arg = lowerKernargMemParameter(
2494  DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2495  Chains.push_back(Arg.getValue(1));
2496 
2497  auto *ParamTy =
2498  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2499  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2500  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2501  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2502  // On SI local pointers are just offsets into LDS, so they are always
2503  // less than 16 bits. On CI and newer they could potentially be
2504  // real pointers, so we can't guarantee their size.
2505  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2506  DAG.getValueType(MVT::i16));
2507  }
2508 
2509  InVals.push_back(Arg);
2510  continue;
2511  } else if (!IsEntryFunc && VA.isMemLoc()) {
2512  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2513  InVals.push_back(Val);
2514  if (!Arg.Flags.isByVal())
2515  Chains.push_back(Val.getValue(1));
2516  continue;
2517  }
2518 
2519  assert(VA.isRegLoc() && "Parameter must be in a register!");
2520 
2521  Register Reg = VA.getLocReg();
2522  const TargetRegisterClass *RC = nullptr;
2523  if (AMDGPU::VGPR_32RegClass.contains(Reg))
2524  RC = &AMDGPU::VGPR_32RegClass;
2525  else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2526  RC = &AMDGPU::SGPR_32RegClass;
2527  else
2528  llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2529  EVT ValVT = VA.getValVT();
2530 
2531  Reg = MF.addLiveIn(Reg, RC);
2532  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2533 
2534  if (Arg.Flags.isSRet()) {
2535  // The return object should be reasonably addressable.
2536 
2537  // FIXME: This helps when the return is a real sret. If it is an
2538  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2539  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2540  unsigned NumBits
2541  = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2542  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2543  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2544  }
2545 
2546  // If this is an 8 or 16-bit value, it is really passed promoted
2547  // to 32 bits. Insert an assert[sz]ext to capture this, then
2548  // truncate to the right size.
2549  switch (VA.getLocInfo()) {
2550  case CCValAssign::Full:
2551  break;
2552  case CCValAssign::BCvt:
2553  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2554  break;
2555  case CCValAssign::SExt:
2556  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2557  DAG.getValueType(ValVT));
2558  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2559  break;
2560  case CCValAssign::ZExt:
2561  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2562  DAG.getValueType(ValVT));
2563  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2564  break;
2565  case CCValAssign::AExt:
2566  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2567  break;
2568  default:
2569  llvm_unreachable("Unknown loc info!");
2570  }
2571 
2572  InVals.push_back(Val);
2573  }
2574 
2575  // Start adding system SGPRs.
2576  if (IsEntryFunc) {
2577  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2578  } else {
2579  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2580  if (!IsGraphics)
2581  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2582  }
2583 
2584  auto &ArgUsageInfo =
2585  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2586  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2587 
2588  unsigned StackArgSize = CCInfo.getNextStackOffset();
2589  Info->setBytesInStackArgArea(StackArgSize);
2590 
2591  return Chains.empty() ? Chain :
2592  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2593 }
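// Rough shape of the lowering above (illustrative): for an entry function,
// explicit arguments are loaded from the kernarg segment relative to a
// 16-byte aligned base, e.g. a kernel taking (i32, ptr) would read the i32 at
// offset 0 and the 64-bit pointer at offset 8, while callable functions
// receive their arguments in SGPRs/VGPRs or on the stack according to the
// calling convention instead.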
2594 
2595 // TODO: If return values can't fit in registers, we should return as many as
2596 // possible in registers before passing on stack.
2597 bool SITargetLowering::CanLowerReturn(
2598  CallingConv::ID CallConv,
2599  MachineFunction &MF, bool IsVarArg,
2600  const SmallVectorImpl<ISD::OutputArg> &Outs,
2601  LLVMContext &Context) const {
2602  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2603  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2604  // for shaders. Vector types should be explicitly handled by CC.
2605  if (AMDGPU::isEntryFunctionCC(CallConv))
2606  return true;
2607 
2608  SmallVector<CCValAssign, 16> RVLocs;
2609  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2610  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2611 }
2612 
2613 SDValue
2614 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2615  bool isVarArg,
2616  const SmallVectorImpl<ISD::OutputArg> &Outs,
2617  const SmallVectorImpl<SDValue> &OutVals,
2618  const SDLoc &DL, SelectionDAG &DAG) const {
2619  MachineFunction &MF = DAG.getMachineFunction();
2620  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2621 
2622  if (AMDGPU::isKernel(CallConv)) {
2623  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2624  OutVals, DL, DAG);
2625  }
2626 
2627  bool IsShader = AMDGPU::isShader(CallConv);
2628 
2629  Info->setIfReturnsVoid(Outs.empty());
2630  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2631 
2632  // CCValAssign - represent the assignment of the return value to a location.
2633  SmallVector<CCValAssign, 48> RVLocs;
2634  SmallVector<SDValue, 48> Splits;
2635 
2636  // CCState - Info about the registers and stack slots.
2637  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2638  *DAG.getContext());
2639 
2640  // Analyze outgoing return values.
2641  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2642 
2643  SDValue Flag;
2644  SmallVector<SDValue, 48> RetOps;
2645  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2646 
2647  // Copy the result values into the output registers.
2648  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2649  ++I, ++RealRVLocIdx) {
2650  CCValAssign &VA = RVLocs[I];
2651  assert(VA.isRegLoc() && "Can only return in registers!");
2652  // TODO: Partially return in registers if return values don't fit.
2653  SDValue Arg = OutVals[RealRVLocIdx];
2654 
2655  // Copied from other backends.
2656  switch (VA.getLocInfo()) {
2657  case CCValAssign::Full:
2658  break;
2659  case CCValAssign::BCvt:
2660  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2661  break;
2662  case CCValAssign::SExt:
2663  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2664  break;
2665  case CCValAssign::ZExt:
2666  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2667  break;
2668  case CCValAssign::AExt:
2669  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2670  break;
2671  default:
2672  llvm_unreachable("Unknown loc info!");
2673  }
2674 
2675  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2676  Flag = Chain.getValue(1);
2677  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2678  }
2679 
2680  // FIXME: Does sret work properly?
2681  if (!Info->isEntryFunction()) {
2682  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2683  const MCPhysReg *I =
2684  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2685  if (I) {
2686  for (; *I; ++I) {
2687  if (AMDGPU::SReg_64RegClass.contains(*I))
2688  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2689  else if (AMDGPU::SReg_32RegClass.contains(*I))
2690  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2691  else
2692  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2693  }
2694  }
2695  }
2696 
2697  // Update chain and glue.
2698  RetOps[0] = Chain;
2699  if (Flag.getNode())
2700  RetOps.push_back(Flag);
2701 
2702  unsigned Opc = AMDGPUISD::ENDPGM;
2703  if (!IsWaveEnd)
2704  Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2705  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2706 }
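// Summary of the return nodes chosen above (illustrative): a shader that
// returns nothing ends the wave with ENDPGM, a shader with results hands them
// to the epilog via RETURN_TO_EPILOG, and an ordinary callable function uses
// the RET_FLAG node with its values already copied into the return registers.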
2707 
2708 SDValue SITargetLowering::LowerCallResult(
2709  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2710  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2711  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2712  SDValue ThisVal) const {
2713  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2714 
2715  // Assign locations to each value returned by this call.
2716  SmallVector<CCValAssign, 16> RVLocs;
2717  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2718  *DAG.getContext());
2719  CCInfo.AnalyzeCallResult(Ins, RetCC);
2720 
2721  // Copy all of the result registers out of their specified physreg.
2722  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2723  CCValAssign VA = RVLocs[i];
2724  SDValue Val;
2725 
2726  if (VA.isRegLoc()) {
2727  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2728  Chain = Val.getValue(1);
2729  InFlag = Val.getValue(2);
2730  } else if (VA.isMemLoc()) {
2731  report_fatal_error("TODO: return values in memory");
2732  } else
2733  llvm_unreachable("unknown argument location type");
2734 
2735  switch (VA.getLocInfo()) {
2736  case CCValAssign::Full:
2737  break;
2738  case CCValAssign::BCvt:
2739  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2740  break;
2741  case CCValAssign::ZExt:
2742  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2743  DAG.getValueType(VA.getValVT()));
2744  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2745  break;
2746  case CCValAssign::SExt:
2747  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2748  DAG.getValueType(VA.getValVT()));
2749  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2750  break;
2751  case CCValAssign::AExt:
2752  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2753  break;
2754  default:
2755  llvm_unreachable("Unknown loc info!");
2756  }
2757 
2758  InVals.push_back(Val);
2759  }
2760 
2761  return Chain;
2762 }
2763 
2764 // Add code to pass special inputs required depending on used features separate
2765 // from the explicit user arguments present in the IR.
2766 void SITargetLowering::passSpecialInputs(
2767  CallLoweringInfo &CLI,
2768  CCState &CCInfo,
2769  const SIMachineFunctionInfo &Info,
2770  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2771  SmallVectorImpl<SDValue> &MemOpChains,
2772  SDValue Chain) const {
2773  // If we don't have a call site, this was a call inserted by
2774  // legalization. These can never use special inputs.
2775  if (!CLI.CB)
2776  return;
2777 
2778  SelectionDAG &DAG = CLI.DAG;
2779  const SDLoc &DL = CLI.DL;
2780  const Function &F = DAG.getMachineFunction().getFunction();
2781 
2782  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2783  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2784 
2785  const AMDGPUFunctionArgInfo *CalleeArgInfo
2786  = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
2787  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2788  auto &ArgUsageInfo =
2789  DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2790  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2791  }
2792 
2793  // TODO: Unify with private memory register handling. This is complicated by
2794  // the fact that at least in kernels, the input argument is not necessarily
2795  // in the same location as the input.
2796  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2797  StringLiteral> ImplicitAttrs[] = {
2798  {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2799  {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2800  {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2801  {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2802  {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2803  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
2804  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
2805  {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
2806  };
2807 
2808  for (auto Attr : ImplicitAttrs) {
2809  const ArgDescriptor *OutgoingArg;
2810  const TargetRegisterClass *ArgRC;
2811  LLT ArgTy;
2812 
2813  AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2814 
2815  // If the callee does not use the attribute value, skip copying the value.
2816  if (CLI.CB->hasFnAttr(Attr.second))
2817  continue;
2818 
2819  std::tie(OutgoingArg, ArgRC, ArgTy) =
2820  CalleeArgInfo->getPreloadedValue(InputID);
2821  if (!OutgoingArg)
2822  continue;
2823 
2824  const ArgDescriptor *IncomingArg;
2825  const TargetRegisterClass *IncomingArgRC;
2826  LLT Ty;
2827  std::tie(IncomingArg, IncomingArgRC, Ty) =
2828  CallerArgInfo.getPreloadedValue(InputID);
2829  assert(IncomingArgRC == ArgRC);
2830 
2831  // All special arguments are ints for now.
2832  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2833  SDValue InputReg;
2834 
2835  if (IncomingArg) {
2836  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2837  } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2838  // The implicit arg ptr is special because it doesn't have a corresponding
2839  // input for kernels, and is computed from the kernarg segment pointer.
2840  InputReg = getImplicitArgPtr(DAG, DL);
2841  } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
2842  std::optional<uint32_t> Id = AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2843  if (Id.has_value()) {
2844  InputReg = DAG.getConstant(Id.value(), DL, ArgVT);
2845  } else {
2846  InputReg = DAG.getUNDEF(ArgVT);
2847  }
2848  } else {
2849  // We may have proven the input wasn't needed, although the ABI is
2850  // requiring it. We just need to allocate the register appropriately.
2851  InputReg = DAG.getUNDEF(ArgVT);
2852  }
2853 
2854  if (OutgoingArg->isRegister()) {
2855  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2856  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2857  report_fatal_error("failed to allocate implicit input argument");
2858  } else {
2859  unsigned SpecialArgOffset =
2860  CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2861  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2862  SpecialArgOffset);
2863  MemOpChains.push_back(ArgStore);
2864  }
2865  }
2866 
2867  // Pack workitem IDs into a single register or pass it as is if already
2868  // packed.
2869  const ArgDescriptor *OutgoingArg;
2870  const TargetRegisterClass *ArgRC;
2871  LLT Ty;
2872 
2873  std::tie(OutgoingArg, ArgRC, Ty) =
2874  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2875  if (!OutgoingArg)
2876  std::tie(OutgoingArg, ArgRC, Ty) =
2877  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2878  if (!OutgoingArg)
2879  std::tie(OutgoingArg, ArgRC, Ty) =
2880  CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2881  if (!OutgoingArg)
2882  return;
2883 
2884  const ArgDescriptor *IncomingArgX = std::get<0>(
2885  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
2886  const ArgDescriptor *IncomingArgY = std::get<0>(
2887  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
2888  const ArgDescriptor *IncomingArgZ = std::get<0>(
2889  CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
2890 
2891  SDValue InputReg;
2892  SDLoc SL;
2893 
2894  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
2895  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
2896  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
2897 
2898  // If incoming ids are not packed we need to pack them.
2899  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2900  NeedWorkItemIDX) {
2901  if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
2902  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2903  } else {
2904  InputReg = DAG.getConstant(0, DL, MVT::i32);
2905  }
2906  }
2907 
2908  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2909  NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
2910  SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2911  Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2912  DAG.getShiftAmountConstant(10, MVT::i32, SL));
2913  InputReg = InputReg.getNode() ?
2914  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2915  }
2916 
2917  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2918  NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
2919  SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2920  Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2921  DAG.getShiftAmountConstant(20, MVT::i32, SL));
2922  InputReg = InputReg.getNode() ?
2923  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2924  }
2925 
2926  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
2927  if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
2928  // We're in a situation where the outgoing function requires the workitem
2929  // ID, but the calling function does not have it (e.g a graphics function
2930  // calling a C calling convention function). This is illegal, but we need
2931  // to produce something.
2932  InputReg = DAG.getUNDEF(MVT::i32);
2933  } else {
2934  // Workitem ids are already packed; any of the present incoming arguments
2935  // will carry all required fields.
2936  ArgDescriptor IncomingArg = ArgDescriptor::createArg(
2937  IncomingArgX ? *IncomingArgX :
2938  IncomingArgY ? *IncomingArgY :
2939  *IncomingArgZ, ~0u);
2940  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2941  }
2942  }
2943 
2944  if (OutgoingArg->isRegister()) {
2945  if (InputReg)
2946  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2947 
2948  CCInfo.AllocateReg(OutgoingArg->getRegister());
2949  } else {
2950  unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2951  if (InputReg) {
2952  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2953  SpecialArgOffset);
2954  MemOpChains.push_back(ArgStore);
2955  }
2956  }
2957 }
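// Packing example: with the shifts above, the combined workitem ID value holds
// X in bits [9:0], Y in bits [19:10] and Z in bits [29:20], so a lane with
// (x=5, y=2, z=1) passes 5 | (2 << 10) | (1 << 20) in whichever outgoing
// register or stack slot was chosen for the packed IDs (illustrative values).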
2958 
2959 static bool canGuaranteeTCO(CallingConv::ID CC) {
2960  return CC == CallingConv::Fast;
2961 }
2962 
2963 /// Return true if we might ever do TCO for calls with this calling convention.
2964 static bool mayTailCallThisCC(CallingConv::ID CC) {
2965  switch (CC) {
2966  case CallingConv::C:
2967  case CallingConv::AMDGPU_Gfx:
2968  return true;
2969  default:
2970  return canGuaranteeTCO(CC);
2971  }
2972 }
2973 
2974 bool SITargetLowering::isEligibleForTailCallOptimization(
2975  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2976  const SmallVectorImpl<ISD::OutputArg> &Outs,
2977  const SmallVectorImpl<SDValue> &OutVals,
2978  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2979  if (!mayTailCallThisCC(CalleeCC))
2980  return false;
2981 
2982  // For a divergent call target, we need to do a waterfall loop over the
2983  // possible callees which precludes us from using a simple jump.
2984  if (Callee->isDivergent())
2985  return false;
2986 
2987  MachineFunction &MF = DAG.getMachineFunction();
2988  const Function &CallerF = MF.getFunction();
2989  CallingConv::ID CallerCC = CallerF.getCallingConv();
2990  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2991  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2992 
2993  // Kernels aren't callable, and don't have a live in return address so it
2994  // doesn't make sense to do a tail call with entry functions.
2995  if (!CallerPreserved)
2996  return false;
2997 
2998  bool CCMatch = CallerCC == CalleeCC;
2999 
3000  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3001  if (canGuaranteeTCO(CalleeCC) && CCMatch)
3002  return true;
3003  return false;
3004  }
3005 
3006  // TODO: Can we handle var args?
3007  if (IsVarArg)
3008  return false;
3009 
3010  for (const Argument &Arg : CallerF.args()) {
3011  if (Arg.hasByValAttr())
3012  return false;
3013  }
3014 
3015  LLVMContext &Ctx = *DAG.getContext();
3016 
3017  // Check that the call results are passed in the same way.
3018  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3019  CCAssignFnForCall(CalleeCC, IsVarArg),
3020  CCAssignFnForCall(CallerCC, IsVarArg)))
3021  return false;
3022 
3023  // The callee has to preserve all registers the caller needs to preserve.
3024  if (!CCMatch) {
3025  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3026  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3027  return false;
3028  }
3029 
3030  // Nothing more to check if the callee is taking no arguments.
3031  if (Outs.empty())
3032  return true;
3033 
3034  SmallVector<CCValAssign, 16> ArgLocs;
3035  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3036 
3037  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3038 
3039  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3040  // If the stack arguments for this call do not fit into our own save area then
3041  // the call cannot be made tail.
3042  // TODO: Is this really necessary?
3043  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3044  return false;
3045 
3046  const MachineRegisterInfo &MRI = MF.getRegInfo();
3047  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3048 }
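// In short, a call is eligible for tail-call lowering here only when the
// callee address is uniform, the caller is not an entry function, results are
// passed compatibly under both calling conventions, the callee preserves every
// register the caller needs preserved, and the callee's stack arguments fit in
// the caller's incoming argument area.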
3049 
3050 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3051  if (!CI->isTailCall())
3052  return false;
3053 
3054  const Function *ParentFn = CI->getParent()->getParent();
3055  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3056  return false;
3057  return true;
3058 }
3059 
3060 // The wave scratch offset register is used as the global base pointer.
3061 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3062  SmallVectorImpl<SDValue> &InVals) const {
3063  SelectionDAG &DAG = CLI.DAG;
3064  const SDLoc &DL = CLI.DL;
3065  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3066  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3067  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3068  SDValue Chain = CLI.Chain;
3069  SDValue Callee = CLI.Callee;
3070  bool &IsTailCall = CLI.IsTailCall;
3071  CallingConv::ID CallConv = CLI.CallConv;
3072  bool IsVarArg = CLI.IsVarArg;
3073  bool IsSibCall = false;
3074  bool IsThisReturn = false;
3075  MachineFunction &MF = DAG.getMachineFunction();
3076 
3077  if (Callee.isUndef() || isNullConstant(Callee)) {
3078  if (!CLI.IsTailCall) {
3079  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3080  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3081  }
3082 
3083  return Chain;
3084  }
3085 
3086  if (IsVarArg) {
3087  return lowerUnhandledCall(CLI, InVals,
3088  "unsupported call to variadic function ");
3089  }
3090 
3091  if (!CLI.CB)
3092  report_fatal_error("unsupported libcall legalization");
3093 
3094  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3095  return lowerUnhandledCall(CLI, InVals,
3096  "unsupported required tail call to function ");
3097  }
3098 
3099  if (AMDGPU::isShader(CallConv)) {
3100  // Note the issue is with the CC of the called function, not of the call
3101  // itself.
3102  return lowerUnhandledCall(CLI, InVals,
3103  "unsupported call to a shader function ");
3104  }
3105 
3106  if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
3107  CallConv != CallingConv::AMDGPU_Gfx) {
3108  // Only allow calls with specific calling conventions.
3109  return lowerUnhandledCall(CLI, InVals,
3110  "unsupported calling convention for call from "
3111  "graphics shader of function ");
3112  }
3113 
3114  if (IsTailCall) {
3115  IsTailCall = isEligibleForTailCallOptimization(
3116  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3117  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3118  report_fatal_error("failed to perform tail call elimination on a call "
3119  "site marked musttail");
3120  }
3121 
3122  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3123 
3124  // A sibling call is one where we're under the usual C ABI and not planning
3125  // to change that but can still do a tail call:
3126  if (!TailCallOpt && IsTailCall)
3127  IsSibCall = true;
3128 
3129  if (IsTailCall)
3130  ++NumTailCalls;
3131  }
3132 
3133  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3134  SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
3135  SmallVector<SDValue, 8> MemOpChains;
3136 
3137  // Analyze operands of the call, assigning locations to each operand.
3138  SmallVector<CCValAssign, 16> ArgLocs;
3139  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3140  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3141 
3142  if (CallConv != CallingConv::AMDGPU_Gfx) {
3143  // With a fixed ABI, allocate fixed registers before user arguments.
3144  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3145  }
3146 
3147  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3148 
3149  // Get a count of how many bytes are to be pushed on the stack.
3150  unsigned NumBytes = CCInfo.getNextStackOffset();
3151 
3152  if (IsSibCall) {
3153  // Since we're not changing the ABI to make this a tail call, the memory
3154  // operands are already available in the caller's incoming argument space.
3155  NumBytes = 0;
3156  }
3157 
3158  // FPDiff is the byte offset of the call's argument area from the callee's.
3159  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3160  // by this amount for a tail call. In a sibling call it must be 0 because the
3161  // caller will deallocate the entire stack and the callee still expects its
3162  // arguments to begin at SP+0. Completely unused for non-tail calls.
3163  int32_t FPDiff = 0;
3164  MachineFrameInfo &MFI = MF.getFrameInfo();
3165 
3166  // Adjust the stack pointer for the new arguments...
3167  // These operations are automatically eliminated by the prolog/epilog pass
3168  if (!IsSibCall) {
3169  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3170 
3171  if (!Subtarget->enableFlatScratch()) {
3172  SmallVector<SDValue, 4> CopyFromChains;
3173 
3174  // In the HSA case, this should be an identity copy.
3175  SDValue ScratchRSrcReg
3176  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3177  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3178  CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3179  Chain = DAG.getTokenFactor(DL, CopyFromChains);
3180  }
3181  }
3182 
3183  MVT PtrVT = MVT::i32;
3184 
3185  // Walk the register/memloc assignments, inserting copies/loads.
3186  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3187  CCValAssign &VA = ArgLocs[i];
3188  SDValue Arg = OutVals[i];
3189 
3190  // Promote the value if needed.
3191  switch (VA.getLocInfo()) {
3192  case CCValAssign::Full:
3193  break;
3194  case CCValAssign::BCvt:
3195  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3196  break;
3197  case CCValAssign::ZExt:
3198  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3199  break;
3200  case CCValAssign::SExt:
3201  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3202  break;
3203  case CCValAssign::AExt:
3204  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3205  break;
3206  case CCValAssign::FPExt:
3207  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3208  break;
3209  default:
3210  llvm_unreachable("Unknown loc info!");
3211  }
3212 
3213  if (VA.isRegLoc()) {
3214  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3215  } else {
3216  assert(VA.isMemLoc());
3217 
3218  SDValue DstAddr;
3219  MachinePointerInfo DstInfo;
3220 
3221  unsigned LocMemOffset = VA.getLocMemOffset();
3222  int32_t Offset = LocMemOffset;
3223 
3224  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3225  MaybeAlign Alignment;
3226 
3227  if (IsTailCall) {
3228  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3229  unsigned OpSize = Flags.isByVal() ?
3230  Flags.getByValSize() : VA.getValVT().getStoreSize();
3231 
3232  // FIXME: We can have better than the minimum byval required alignment.
3233  Alignment =
3234  Flags.isByVal()
3235  ? Flags.getNonZeroByValAlign()
3236  : commonAlignment(Subtarget->getStackAlignment(), Offset);
3237 
3238  Offset = Offset + FPDiff;
3239  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3240 
3241  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3242  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3243 
3244  // Make sure any stack arguments overlapping with where we're storing
3245  // are loaded before this eventual operation. Otherwise they'll be
3246  // clobbered.
3247 
3248  // FIXME: Why is this really necessary? This seems to just result in a
3249  // lot of code to copy the stack and write them back to the same
3250  // locations, which are supposed to be immutable?
3251  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3252  } else {
3253  // Stores to the argument stack area are relative to the stack pointer.
3254  SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3255  MVT::i32);
3256  DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3257  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3258  Alignment =
3259  commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3260  }
3261 
3262  if (Outs[i].Flags.isByVal()) {
3263  SDValue SizeNode =
3264  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3265  SDValue Cpy =
3266  DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3267  Outs[i].Flags.getNonZeroByValAlign(),
3268  /*isVol = */ false, /*AlwaysInline = */ true,
3269  /*isTailCall = */ false, DstInfo,
3270  MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3271 
3272  MemOpChains.push_back(Cpy);
3273  } else {
3274  SDValue Store =
3275  DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3276  MemOpChains.push_back(Store);
3277  }
3278  }
3279  }
3280 
3281  if (!MemOpChains.empty())
3282  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3283 
3284  // Build a sequence of copy-to-reg nodes chained together with token chain
3285  // and flag operands which copy the outgoing args into the appropriate regs.
3286  SDValue InFlag;
3287  for (auto &RegToPass : RegsToPass) {
3288  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3289  RegToPass.second, InFlag);
3290  InFlag = Chain.getValue(1);
3291  }
3292 
3293 
3294  // We don't usually want to end the call-sequence here because we would tidy
3295  // the frame up *after* the call; however, in the ABI-changing tail-call case
3296  // we've carefully laid out the parameters so that when sp is reset they'll be
3297  // in the correct location.
3298  if (IsTailCall && !IsSibCall) {
3299  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InFlag, DL);
3300  InFlag = Chain.getValue(1);
3301  }
3302 
3303  std::vector<SDValue> Ops;
3304  Ops.push_back(Chain);
3305  Ops.push_back(Callee);
3306  // Add a redundant copy of the callee global which will not be legalized, as
3307  // we need direct access to the callee later.
3308  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3309  const GlobalValue *GV = GSD->getGlobal();
3310  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3311  } else {
3312  Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3313  }
3314 
3315  if (IsTailCall) {
3316  // Each tail call may have to adjust the stack by a different amount, so
3317  // this information must travel along with the operation for eventual
3318  // consumption by emitEpilogue.
3319  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3320  }
3321 
3322  // Add argument registers to the end of the list so that they are known live
3323  // into the call.
3324  for (auto &RegToPass : RegsToPass) {
3325  Ops.push_back(DAG.getRegister(RegToPass.first,
3326  RegToPass.second.getValueType()));
3327  }
3328 
3329  // Add a register mask operand representing the call-preserved registers.
3330 
3331  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3332  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3333  assert(Mask && "Missing call preserved mask for calling convention");
3334  Ops.push_back(DAG.getRegisterMask(Mask));
3335 
3336  if (InFlag.getNode())
3337  Ops.push_back(InFlag);
3338 
3339  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3340 
3341  // If we're doing a tail call, use a TC_RETURN here rather than an
3342  // actual call instruction.
3343  if (IsTailCall) {
3344  MFI.setHasTailCall();
3345  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3346  }
3347 
3348  // Returns a chain and a flag for retval copy to use.
3349  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3350  Chain = Call.getValue(0);
3351  InFlag = Call.getValue(1);
3352 
3353  uint64_t CalleePopBytes = NumBytes;
3354  Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InFlag, DL);
3355  if (!Ins.empty())
3356  InFlag = Chain.getValue(1);
3357 
3358  // Handle result values, copying them out of physregs into vregs that we
3359  // return.
3360  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3361  InVals, IsThisReturn,
3362  IsThisReturn ? OutVals[0] : SDValue());
3363 }
3364 
3365 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3366 // except for applying the wave size scale to the increment amount.
3368  SDValue Op, SelectionDAG &DAG) const {
3369  const MachineFunction &MF = DAG.getMachineFunction();
3370  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3371 
3372  SDLoc dl(Op);
3373  EVT VT = Op.getValueType();
3374  SDValue Tmp1 = Op;
3375  SDValue Tmp2 = Op.getValue(1);
3376  SDValue Tmp3 = Op.getOperand(2);
3377  SDValue Chain = Tmp1.getOperand(0);
3378 
3379  Register SPReg = Info->getStackPtrOffsetReg();
3380 
3381  // Chain the dynamic stack allocation so that it doesn't modify the stack
3382  // pointer when other instructions are using the stack.
3383  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3384 
3385  SDValue Size = Tmp2.getOperand(1);
3386  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3387  Chain = SP.getValue(1);
3388  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3389  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3390  const TargetFrameLowering *TFL = ST.getFrameLowering();
3391  unsigned Opc =
3392  TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3393  ISD::ADD : ISD::SUB;
3394 
3395  SDValue ScaledSize = DAG.getNode(
3396  ISD::SHL, dl, VT, Size,
3397  DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3398 
3399  Align StackAlign = TFL->getStackAlign();
3400  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3401  if (Alignment && *Alignment > StackAlign) {
3402  Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3403  DAG.getConstant(-(uint64_t)Alignment->value()
3404  << ST.getWavefrontSizeLog2(),
3405  dl, VT));
3406  }
3407 
3408  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3409  Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3410 
3411  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3412 }
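// Scaling example (illustrative): the per-lane allocation size is shifted left
// by the wavefront size log2, so a 16-byte dynamic alloca on a wave64 target
// reserves 16 << 6 = 1024 bytes of scratch for the whole wave before the stack
// pointer is bumped; on wave32 the same request reserves 512 bytes.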
3413 
3414 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3415  SelectionDAG &DAG) const {
3416  // We only handle constant sizes here to allow non-entry block, static sized
3417  // allocas. A truly dynamic value is more difficult to support because we
3418  // don't know if the size value is uniform or not. If the size isn't uniform,
3419  // we would need to do a wave reduction to get the maximum size to know how
3420  // much to increment the uniform stack pointer.
3421  SDValue Size = Op.getOperand(1);
3422  if (isa<ConstantSDNode>(Size))
3423  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3424 
3425  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3426 }
3427 
3428 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3429  const MachineFunction &MF) const {
3430  Register Reg = StringSwitch<Register>(RegName)
3431  .Case("m0", AMDGPU::M0)
3432  .Case("exec", AMDGPU::EXEC)
3433  .Case("exec_lo", AMDGPU::EXEC_LO)
3434  .Case("exec_hi", AMDGPU::EXEC_HI)
3435  .Case("flat_scratch", AMDGPU::FLAT_SCR)
3436  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3437  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3438  .Default(Register());
3439 
3440  if (Reg == AMDGPU::NoRegister) {
3441  report_fatal_error(Twine("invalid register name \""
3442  + StringRef(RegName) + "\"."));
3443 
3444  }
3445 
3446  if (!Subtarget->hasFlatScrRegister() &&
3447  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3448  report_fatal_error(Twine("invalid register \""
3449  + StringRef(RegName) + "\" for subtarget."));
3450  }
3451 
3452  switch (Reg) {
3453  case AMDGPU::M0:
3454  case AMDGPU::EXEC_LO:
3455  case AMDGPU::EXEC_HI:
3456  case AMDGPU::FLAT_SCR_LO:
3457  case AMDGPU::FLAT_SCR_HI:
3458  if (VT.getSizeInBits() == 32)
3459  return Reg;
3460  break;
3461  case AMDGPU::EXEC:
3462  case AMDGPU::FLAT_SCR:
3463  if (VT.getSizeInBits() == 64)
3464  return Reg;
3465  break;
3466  default:
3467  llvm_unreachable("missing register type checking");
3468  }
3469 
3470  report_fatal_error(Twine("invalid type for register \""
3471  + StringRef(RegName) + "\"."));
3472 }
3473 
3474 // If kill is not the last instruction, split the block so kill is always a
3475 // proper terminator.
3476 MachineBasicBlock *
3477 SITargetLowering::splitKillBlock(MachineInstr &MI,
3478  MachineBasicBlock *BB) const {
3479  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3480  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3481  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3482  return SplitBB;
3483 }
3484 
3485 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
3486 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3487 // be the first instruction in the remainder block.
3488 //
3489 /// \returns { LoopBody, Remainder }
3490 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3491 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3492  MachineFunction *MF = MBB.getParent();
3493  MachineBasicBlock::iterator I(&MI);
3494 
3495  // To insert the loop we need to split the block. Move everything after this
3496  // point to a new block, and insert a new empty block between the two.
3497  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3498  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3499  MachineFunction::iterator MBBI(MBB);
3500  ++MBBI;
3501 
3502  MF->insert(MBBI, LoopBB);
3503  MF->insert(MBBI, RemainderBB);
3504 
3505  LoopBB->addSuccessor(LoopBB);
3506  LoopBB->addSuccessor(RemainderBB);
3507 
3508  // Move the rest of the block into a new block.
3509  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3510 
3511  if (InstInLoop) {
3512  auto Next = std::next(I);
3513 
3514  // Move instruction to loop body.
3515  LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3516 
3517  // Move the rest of the block.
3518  RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3519  } else {
3520  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3521  }
3522 
3523  MBB.addSuccessor(LoopBB);
3524 
3525  return std::make_pair(LoopBB, RemainderBB);
3526 }
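// Resulting control flow (sketch): MBB falls through into LoopBB, LoopBB
// branches back to itself while work remains, and RemainderBB receives
// everything that followed the split point, including MI itself when
// InstInLoop is false. The branch terminators for LoopBB are inserted by the
// callers of this helper.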
3527 
3528 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3529 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3530  MachineBasicBlock *MBB = MI.getParent();
3531  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3532  auto I = MI.getIterator();
3533  auto E = std::next(I);
3534 
3535  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3536  .addImm(0);
3537 
3538  MIBundleBuilder Bundler(*MBB, I, E);
3539  finalizeBundle(*MBB, Bundler.begin());
3540 }
3541 
3542 MachineBasicBlock *
3543 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3544  MachineBasicBlock *BB) const {
3545  const DebugLoc &DL = MI.getDebugLoc();
3546 
3547  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3548 
3549  MachineBasicBlock *LoopBB;
3550  MachineBasicBlock *RemainderBB;
3551  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3552 
3553  // Apparently kill flags are only valid if the def is in the same block?
3554  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3555  Src->setIsKill(false);
3556 
3557  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3558 
3559  MachineBasicBlock::iterator I = LoopBB->end();
3560 
3561  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3562  AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3563 
3564  // Clear TRAP_STS.MEM_VIOL
3565  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3566  .addImm(0)
3567  .addImm(EncodedReg);
3568 
3569  bundleInstWithWaitcnt(MI);
3570 
3571  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3572 
3573  // Load and check TRAP_STS.MEM_VIOL
3574  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3575  .addImm(EncodedReg);
3576 
3577  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3578  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3579  .addReg(Reg, RegState::Kill)
3580  .addImm(0);
3581  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3582  .addMBB(LoopBB);
3583 
3584  return RemainderBB;
3585 }
3586 
3587 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3588 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3589 // will only do one iteration. In the worst case, this will loop 64 times.
3590 //
3591 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3592 static MachineBasicBlock::iterator
3593 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
3594  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3595  const DebugLoc &DL, const MachineOperand &Idx,
3596  unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3597  unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3598  Register &SGPRIdxReg) {
3599 
3600  MachineFunction *MF = OrigBB.getParent();
3601  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3602  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3603  MachineBasicBlock::iterator I = LoopBB.begin();
3604 
3605  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3606  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3607  Register NewExec = MRI.createVirtualRegister(BoolRC);
3608  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3609  Register CondReg = MRI.createVirtualRegister(BoolRC);
3610 
3611  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3612  .addReg(InitReg)
3613  .addMBB(&OrigBB)
3614  .addReg(ResultReg)
3615  .addMBB(&LoopBB);
3616 
3617  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3618  .addReg(InitSaveExecReg)
3619  .addMBB(&OrigBB)
3620  .addReg(NewExec)
3621  .addMBB(&LoopBB);
3622 
3623  // Read the next variant <- also loop target.
3624  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3625  .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3626 
3627  // Compare the just read M0 value to all possible Idx values.
3628  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3629  .addReg(CurrentIdxReg)
3630  .addReg(Idx.getReg(), 0, Idx.getSubReg());
3631 
3632  // Update EXEC, save the original EXEC value to VCC.
3633  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3634  : AMDGPU::S_AND_SAVEEXEC_B64),
3635  NewExec)
3636  .addReg(CondReg, RegState::Kill);
3637 
3638  MRI.setSimpleHint(NewExec, CondReg);
3639 
3640  if (UseGPRIdxMode) {
3641  if (Offset == 0) {
3642  SGPRIdxReg = CurrentIdxReg;
3643  } else {
3644  SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3645  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3646  .addReg(CurrentIdxReg, RegState::Kill)
3647  .addImm(Offset);
3648  }
3649  } else {
3650  // Move index from VCC into M0
3651  if (Offset == 0) {
3652  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3653  .addReg(CurrentIdxReg, RegState::Kill);
3654  } else {
3655  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3656  .addReg(CurrentIdxReg, RegState::Kill)
3657  .addImm(Offset);
3658  }
3659  }
3660 
3661  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3662  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3663  MachineInstr *InsertPt =
3664  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3665  : AMDGPU::S_XOR_B64_term), Exec)
3666  .addReg(Exec)
3667  .addReg(NewExec);
3668 
3669  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3670  // s_cbranch_scc0?
3671 
3672  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3673  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3674  .addMBB(&LoopBB);
3675 
3676  return InsertPt->getIterator();
3677 }
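// Waterfall loop sketch: each iteration reads one lane's index with
// v_readfirstlane_b32, masks EXEC down to the lanes that share that index,
// performs the indexed access for those lanes, then xors them out of EXEC and
// loops while any lanes remain. A uniform index therefore takes a single
// iteration; a fully divergent wave64 index can take up to 64, matching the
// comment above this helper.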
3678 
3679 // This has slightly sub-optimal regalloc when the source vector is killed by
3680 // the read. The register allocator does not understand that the kill is
3681 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
3682 // subregister from it, using 1 more VGPR than necessary. This was saved when
3683 // this was expanded after register allocation.
3684 static MachineBasicBlock::iterator
3685 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
3686  unsigned InitResultReg, unsigned PhiReg, int Offset,
3687  bool UseGPRIdxMode, Register &SGPRIdxReg) {
3688  MachineFunction *MF = MBB.getParent();
3689  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3690  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3691  MachineRegisterInfo &MRI = MF->getRegInfo();
3692  const DebugLoc &DL = MI.getDebugLoc();
3693  MachineBasicBlock::iterator I(&MI);
3694 
3695  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3696  Register DstReg = MI.getOperand(0).getReg();
3697  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3698  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3699  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3700  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3701 
3702  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3703 
3704  // Save the EXEC mask
3705  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3706  .addReg(Exec);
3707 
3708  MachineBasicBlock *LoopBB;
3709  MachineBasicBlock *RemainderBB;
3710  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3711 
3712  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3713 
3714  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3715  InitResultReg, DstReg, PhiReg, TmpExec,
3716  Offset, UseGPRIdxMode, SGPRIdxReg);
3717 
3718  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3719  MachineFunction::iterator MBBI(LoopBB);
3720  ++MBBI;
3721  MF->insert(MBBI, LandingPad);
3722  LoopBB->removeSuccessor(RemainderBB);
3723  LandingPad->addSuccessor(RemainderBB);
3724  LoopBB->addSuccessor(LandingPad);
3725  MachineBasicBlock::iterator First = LandingPad->begin();
3726  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3727  .addReg(SaveExec);
3728 
3729  return InsPt;
3730 }
3731 
3732 // Returns subreg index, offset
3733 static std::pair<unsigned, int>
3734 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3735  const TargetRegisterClass *SuperRC,
3736  unsigned VecReg,
3737  int Offset) {
3738  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3739 
3740  // Skip out of bounds offsets, or else we would end up using an undefined
3741  // register.
3742  if (Offset >= NumElts || Offset < 0)
3743  return std::make_pair(AMDGPU::sub0, Offset);
3744 
3745  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3746 }
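// Worked example (illustrative): for a source vector in a 128-bit register
// class, NumElts is 4. A constant Offset of 2 is in range and yields
// (sub2, 0), so the access starts at the sub2 subregister with no dynamic
// offset left over; an Offset of 5 is out of range and yields (sub0, 5),
// leaving the whole offset to the runtime index.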
3747 
3748 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3749  MachineRegisterInfo &MRI, MachineInstr &MI,
3750  int Offset) {
3751  MachineBasicBlock *MBB = MI.getParent();
3752  const DebugLoc &DL = MI.getDebugLoc();
3753  MachineBasicBlock::iterator I(&MI);
3754 
3755  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3756 
3757  assert(Idx->getReg() != AMDGPU::NoRegister);
3758 
3759  if (Offset == 0) {
3760  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3761  } else {
3762  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3763  .add(*Idx)
3764  .addImm(Offset);
3765  }
3766 }
3767 
3768 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
3769  MachineRegisterInfo &MRI, MachineInstr &MI,
3770  int Offset) {
3771  MachineBasicBlock *MBB = MI.getParent();
3772  const DebugLoc &DL = MI.getDebugLoc();
3773  MachineBasicBlock::iterator I(&MI);
3774 
3775  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3776 
3777  if (Offset == 0)
3778  return Idx->getReg();
3779 
3780  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3781  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3782  .add(*Idx)
3783  .addImm(Offset);
3784  return Tmp;
3785 }
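// Two indexing mechanisms are used by the callers below. On subtargets where
// useVGPRIndexMode() is true, an indirect GPR-index pseudo consumes the SGPR
// index produced by getIndirectSGPRIdx() directly; otherwise the index is
// written to M0 by setM0ToIndexFromSGPR() and a V_MOVRELS/V_MOVRELD-style
// instruction performs the access relative to M0.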
3786 
3787 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3788  MachineBasicBlock &MBB,
3789  const GCNSubtarget &ST) {
3790  const SIInstrInfo *TII = ST.getInstrInfo();
3791  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3792  MachineFunction *MF = MBB.getParent();
3793  MachineRegisterInfo &MRI = MF->getRegInfo();
3794 
3795  Register Dst = MI.getOperand(0).getReg();
3796  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3797  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3798  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3799 
3800  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3801  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3802 
3803  unsigned SubReg;
3804  std::tie(SubReg, Offset)
3805  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3806 
3807  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3808 
3809  // Check for an SGPR index.
3810  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3811  MachineBasicBlock::iterator I(&MI);
3812  const DebugLoc &DL = MI.getDebugLoc();
3813 
3814  if (UseGPRIdxMode) {
3815  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3816  // to avoid interfering with other uses, so probably requires a new
3817  // optimization pass.
3818  Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3819 
3820  const MCInstrDesc &GPRIDXDesc =
3821  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3822  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3823  .addReg(SrcReg)
3824  .addReg(Idx)
3825  .addImm(SubReg);
3826  } else {
3827  setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3828 
3829  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3830  .addReg(SrcReg, 0, SubReg)
3831  .addReg(SrcReg, RegState::Implicit);
3832  }
3833 
3834  MI.eraseFromParent();
3835 
3836  return &MBB;
3837  }
3838 
3839  // Control flow needs to be inserted if indexing with a VGPR.
3840  const DebugLoc &DL = MI.getDebugLoc();
3841  MachineBasicBlock::iterator I(&MI);
3842 
3843  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3844  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3845 
3846  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3847 
3848  Register SGPRIdxReg;
3849  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3850  UseGPRIdxMode, SGPRIdxReg);
3851 
3852  MachineBasicBlock *LoopBB = InsPt->getParent();
3853 
3854  if (UseGPRIdxMode) {
3855  const MCInstrDesc &GPRIDXDesc =
3856  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3857 
3858  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3859  .addReg(SrcReg)
3860  .addReg(SGPRIdxReg)
3861  .addImm(SubReg);
3862  } else {
3863  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3864  .addReg(SrcReg, 0, SubReg)
3865  .addReg(SrcReg, RegState::Implicit);
3866  }
3867 
3868  MI.eraseFromParent();
3869 
3870  return LoopBB;
3871 }
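// Illustrative only (not from the original source): with a uniform SGPR index
// and no GPR-index mode, the extract above reduces to roughly
//   s_mov_b32      m0, s_idx        ; or s_add_i32 m0, s_idx, offset
//   v_movrels_b32  v_dst, v_src     ; reads the element selected by M0
// while a divergent VGPR index wraps the same move in the waterfall loop
// built by loadM0FromVGPR().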
3872 
3873 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3874  MachineBasicBlock &MBB,
3875  const GCNSubtarget &ST) {
3876  const SIInstrInfo *TII = ST.getInstrInfo();
3877  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3878  MachineFunction *MF = MBB.getParent();
3879  MachineRegisterInfo &MRI = MF->getRegInfo();
3880 
3881  Register Dst = MI.getOperand(0).getReg();
3882  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3883  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3884  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3885  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3886  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3887  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3888 
3889  // This can be an immediate, but will be folded later.
3890  assert(Val->getReg());
3891 
3892  unsigned SubReg;
3893  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3894  SrcVec->getReg(),
3895  Offset);
3896  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3897 
3898  if (Idx->getReg() == AMDGPU::NoRegister) {
3899  MachineBasicBlock::iterator I(&MI);
3900  const DebugLoc &DL = MI.getDebugLoc();
3901 
3902  assert(Offset == 0);
3903 
3904  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3905  .add(*SrcVec)
3906  .add(*Val)
3907  .addImm(SubReg);
3908 
3909  MI.eraseFromParent();
3910  return &MBB;
3911  }
3912 
3913  // Check for an SGPR index.
3914  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3915  MachineBasicBlock::iterator I(&MI);
3916  const DebugLoc &DL = MI.getDebugLoc();
3917 
3918  if (UseGPRIdxMode) {
3919  Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3920 
3921  const MCInstrDesc &GPRIDXDesc =
3922  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3923  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3924  .addReg(SrcVec->getReg())
3925  .add(*Val)
3926  .addReg(Idx)
3927  .addImm(SubReg);
3928  } else {
3929  setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3930 
3931  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3932  TRI.getRegSizeInBits(*VecRC), 32, false);
3933  BuildMI(MBB, I, DL, MovRelDesc, Dst)
3934  .addReg(SrcVec->getReg())
3935  .add(*Val)
3936  .addImm(SubReg);
3937  }
3938  MI.eraseFromParent();
3939  return &MBB;
3940  }
3941 
3942  // Control flow needs to be inserted if indexing with a VGPR.
3943  if (Val->isReg())
3944  MRI.clearKillFlags(Val->getReg());
3945 
3946  const DebugLoc &DL = MI.getDebugLoc();
3947 
3948  Register PhiReg = MRI.createVirtualRegister(VecRC);
3949 
3950  Register SGPRIdxReg;
3951  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3952  UseGPRIdxMode, SGPRIdxReg);
3953  MachineBasicBlock *LoopBB = InsPt->getParent();
3954 
3955  if (UseGPRIdxMode) {
3956  const MCInstrDesc &GPRIDXDesc =
3957  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3958 
3959  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3960  .addReg(PhiReg)
3961  .add(*Val)
3962  .addReg(SGPRIdxReg)
3963  .addImm(AMDGPU::sub0);
3964  } else {
3965  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3966  TRI.getRegSizeInBits(*VecRC), 32, false);
3967  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3968  .addReg(PhiReg)
3969  .add(*Val)
3970  .addImm(AMDGPU::sub0);
3971  }
3972 
3973  MI.eraseFromParent();
3974  return LoopBB;
3975 }
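// Illustrative only: the insert case mirrors the extract case above, roughly
//   s_mov_b32      m0, s_idx
//   v_movreld_b32  v_vec, v_val     ; writes the element selected by M0
// for a uniform index, while a divergent index places the indexed write inside
// the waterfall loop, with PhiReg carrying the partially updated vector from
// one iteration to the next.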
3976 
3977 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3978  MachineInstr &MI, MachineBasicBlock *BB) const {
3979 
3980  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3981  MachineFunction *MF = BB->getParent();
3982  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3983 
3984  switch (MI.getOpcode()) {
3985  case AMDGPU::S_UADDO_PSEUDO:
3986  case AMDGPU::S_USUBO_PSEUDO: {
3987  const DebugLoc &DL = MI.getDebugLoc();
3988  MachineOperand &Dest0 = MI.getOperand(0);
3989  MachineOperand &Dest1 = MI.getOperand(1);
3990  MachineOperand &Src0 = MI.getOperand(2);
3991  MachineOperand &Src1 = MI.getOperand(3);
3992 
3993  unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3994  ? AMDGPU::S_ADD_I32
3995  : AMDGPU::S_SUB_I32;
3996  BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3997 
3998  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
3999  .addImm(1)
4000  .addImm(0);
4001 
4002  MI.eraseFromParent();
4003  return BB;
4004  }
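  // The scalar overflow pseudos above expand to a single scalar add/sub, which
  // defines SCC, followed by S_CSELECT_B64 of 1/0 that materializes the SCC bit
  // as an ordinary value in the second destination register.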
4005  case AMDGPU::S_ADD_U64_PSEUDO:
4006  case AMDGPU::S_SUB_U64_PSEUDO: {
4007  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4008  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4009  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4010  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4011  const DebugLoc &DL = MI.getDebugLoc();
4012 
4013  MachineOperand &Dest = MI.getOperand(0);
4014  MachineOperand &Src0 = MI.getOperand(1);
4015  MachineOperand &Src1 = MI.getOperand(2);
4016 
4017  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4018  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4019 
4020  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4021  MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4022  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4023  MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4024 
4025  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4026  MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4027  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4028  MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4029 
4030  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4031 
4032  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4033  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4034  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
4035  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
4036  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4037  .addReg(DestSub0)
4038  .addImm(AMDGPU::sub0)
4039  .addReg(DestSub1)
4040  .addImm(AMDGPU::sub1);
4041  MI.eraseFromParent();
4042  return BB;
4043  }
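  // The 64-bit scalar pseudos above are split into 32-bit halves:
  // S_ADD_U32/S_SUB_U32 on the low words produce a carry/borrow in SCC,
  // S_ADDC_U32/S_SUBB_U32 consume it for the high words, and REG_SEQUENCE
  // reassembles the 64-bit result.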
4044  case AMDGPU::V_ADD_U64_PSEUDO:
4045  case AMDGPU::V_SUB_U64_PSEUDO: {
4046  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4047  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4048  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4049  const DebugLoc &DL = MI.getDebugLoc();
4050 
4051  bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4052 
4053  MachineOperand &Dest = MI.getOperand(0);
4054  MachineOperand &Src0 = MI.getOperand(1);
4055  MachineOperand &Src1 = MI.getOperand(2);
4056 
4057  if (IsAdd && ST.hasLshlAddB64()) {
4058  auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4059  Dest.getReg())
4060  .add(Src0)
4061  .addImm(0)
4062  .add(Src1);
4063  TII->legalizeOperands(*Add);
4064  MI.eraseFromParent();
4065  return BB;
4066  }
4067 
4068  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4069 
4070  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4071  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4072 
4073  Register CarryReg = MRI.createVirtualRegister(CarryRC);
4074  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4075 
4076  const TargetRegisterClass *Src0RC = Src0.isReg()
4077  ? MRI.getRegClass(Src0.getReg())
4078  : &AMDGPU::VReg_64RegClass;
4079  const TargetRegisterClass *Src1RC = Src1.isReg()
4080  ? MRI.getRegClass(Src1.getReg())
4081  : &AMDGPU::VReg_64RegClass;
4082 
4083  const TargetRegisterClass *Src0SubRC =
4084  TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4085  const TargetRegisterClass *Src1SubRC =
4086  TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4087 
4088  MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4089  MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4090  MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4091  MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4092 
4093  MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4094  MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4095  MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4096  MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4097 
4098  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4099  MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4100  .addReg(CarryReg, RegState::Define)
4101  .add(SrcReg0Sub0)
4102  .add(SrcReg1Sub0)
4103  .addImm(0); // clamp bit
4104 
4105  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4106  MachineInstr *HiHalf =
4107  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4108  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4109  .add(SrcReg0Sub1)
4110  .add(SrcReg1Sub1)
4111  .addReg(CarryReg, RegState::Kill)
4112  .addImm(0); // clamp bit
4113 
4114  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4115  .addReg(DestSub0)
4116  .addImm(AMDGPU::sub0)
4117  .addReg(DestSub1)
4118  .addImm(AMDGPU::sub1);
4119  TII->legalizeOperands(*LoHalf);
4120  TII->legalizeOperands(*HiHalf);
4121  MI.eraseFromParent();
4122  return BB;
4123  }
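  // The VALU 64-bit pseudos above follow the same low/high split, but the carry
  // flows through a lane-mask register: V_ADD_CO_U32/V_SUB_CO_U32 define it and
  // V_ADDC_U32/V_SUBB_U32 consume it. Subtargets with hasLshlAddB64() instead
  // select a single V_LSHL_ADD_U64 with a shift amount of 0 for the add case.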
4124  case AMDGPU::S_ADD_CO_PSEUDO:
4125  case AMDGPU::S_SUB_CO_PSEUDO: {
4126  // This pseudo can only be selected from a uniform add/subcarry node,
4127  // so all of its VGPR operands are assumed to be splat vectors
4128  // (i.e. to hold the same value in every lane).
4129  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4130  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4131  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4132  MachineBasicBlock::iterator MII = MI;
4133  const DebugLoc &DL = MI.getDebugLoc();
4134  MachineOperand &Dest = MI.getOperand(0);
4135  MachineOperand &CarryDest = MI.getOperand(1);
4136  MachineOperand &Src0 = MI.getOperand(2);
4137  MachineOperand &Src1 = MI.getOperand(3);
4138  MachineOperand &Src2 = MI.getOperand(4);
4139  unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4140  ? AMDGPU::S_ADDC_U32
4141  : AMDGPU::S_SUBB_U32;
4142  if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4143  Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4144  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4145  .addReg(Src0.getReg());
4146  Src0.setReg(RegOp0);
4147  }
4148  if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4149  Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4150  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4151  .addReg(Src1.getReg());
4152  Src1.setReg(RegOp1);
4153  }
4154  Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4155  if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4156  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4157  .addReg(Src2.getReg());
4158  Src2.setReg(RegOp2);
4159  }
4160 
4161  const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4162  unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4163  assert(WaveSize == 64 || WaveSize == 32);
4164 
4165  if (WaveSize == 64) {
4166  if (ST.hasScalarCompareEq64()) {
4167  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4168  .addReg(Src2.getReg())
4169  .addImm(0);
4170  } else {
4171  const TargetRegisterClass *SubRC =
4172  TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
4173  MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4174  MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4175  MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4176  MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4177  Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4178 
4179  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4180  .add(Src2Sub0)
4181  .add(Src2Sub1);
4182 
4183  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4184  .addReg(Src2_32, RegState::Kill)
4185  .addImm(0);
4186  }
4187  } else {
4188  BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
4189  .addReg(Src2.getReg())
4190  .addImm(0);
4191  }
4192 
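  // The compares above only exist to transfer the carry-in (Src2) into SCC so
  // that S_ADDC_U32/S_SUBB_U32 can consume it; the S_CSELECT below then turns
  // the resulting SCC back into a full-width carry-out value for CarryDest.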
4193  BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4194 
4195  unsigned SelOpc =
4196  (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4197 
4198  BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.