1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/Statistic.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/Analysis.h"
29 #include "llvm/IR/DiagnosticInfo.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/IntrinsicsAMDGPU.h"
32 #include "llvm/IR/IntrinsicsR600.h"
34 #include "llvm/Support/KnownBits.h"
35 
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "si-lower"
39 
40 STATISTIC(NumTailCalls, "Number of tail calls");
41 
42 static cl::opt<bool> DisableLoopAlignment(
43  "amdgpu-disable-loop-alignment",
44  cl::desc("Do not align and prefetch loops"),
45  cl::init(false));
46 
47 static cl::opt<bool> VGPRReserveforSGPRSpill(
48  "amdgpu-reserve-vgpr-for-sgpr-spill",
49  cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
50 
51 static cl::opt<bool> UseDivergentRegisterIndexing(
52  "amdgpu-use-divergent-register-indexing",
53  cl::Hidden,
54  cl::desc("Use indirect register addressing for divergent indexes"),
55  cl::init(false));
56 
57 static bool hasFP32Denormals(const MachineFunction &MF) {
58  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
59  return Info->getMode().allFP32Denormals();
60 }
61 
62 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
63  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
64  return Info->getMode().allFP64FP16Denormals();
65 }
66 
67 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
68  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
69  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
70  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
71  return AMDGPU::SGPR0 + Reg;
72  }
73  }
74  llvm_unreachable("Cannot allocate sgpr");
75 }
76 
77 SITargetLowering::SITargetLowering(const TargetMachine &TM,
78  const GCNSubtarget &STI)
79  : AMDGPUTargetLowering(TM, STI),
80  Subtarget(&STI) {
81  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
82  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
83 
84  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
85  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
86 
87  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
88 
89  const SIRegisterInfo *TRI = STI.getRegisterInfo();
90  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
91 
92  addRegisterClass(MVT::f64, V64RegClass);
93  addRegisterClass(MVT::v2f32, V64RegClass);
94 
95  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
96  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
97 
98  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
99  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
100 
101  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
102  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
103 
104  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
105  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
106 
107  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
108  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
109 
110  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
111  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
112 
113  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
114  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
115 
116  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
117  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
118 
119  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
120  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
121 
122  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
123  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
124 
125  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
126  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
127 
128  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
129  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
130 
131  if (Subtarget->has16BitInsts()) {
132  addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
133  addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
134 
135  // Unless there are also VOP3P operations, these operations are not really legal.
136  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
137  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
138  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
139  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
140  }
141 
142  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
143  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
144 
146 
147  // The boolean content concept here is too inflexible. Compares only ever
148  // really produce a 1-bit result. Any copy/extend from these will turn into a
149  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
150  // it's what most targets use.
153 
154  // We need to custom lower vector stores from local memory
165 
176 
193 
201 
204 
209 
215 
220 
237 
246 
253 
256 
259 
263 
264 #if 0
267 #endif
268 
269  // We only support LOAD/STORE and vector manipulation ops for vectors
270  // with > 4 elements.
276  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
277  switch (Op) {
278  case ISD::LOAD:
279  case ISD::STORE:
280  case ISD::BUILD_VECTOR:
281  case ISD::BITCAST:
286  break;
288  case ISD::CONCAT_VECTORS:
290  break;
291  default:
293  break;
294  }
295  }
296  }
297 
299 
300  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
301  // is expanded to avoid having two separate loops in case the index is a VGPR.
302 
303  // Most operations are naturally 32-bit vector operations. We only support
304  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
305  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
308 
311 
314 
317  }
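// The elided body of the loop above follows the standard TargetLowering
// "Promote + AddPromotedToType" idiom. A minimal sketch of that pattern, under
// the assumption that the missing lines match it (illustrative only, not the
// verbatim source):
#if 0
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
    // Legalize element insert/extract on 64-bit element vectors by
    // reinterpreting them as v4i32 and operating on 32-bit halves.
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
  }
#endif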
318 
319  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
322 
325 
328 
331  }
332 
333  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
336 
339 
342 
345  }
346 
347  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
350 
353 
356 
359  }
360 
361  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
364 
367 
370 
373  }
374 
379 
382 
383  // Avoid stack access for these.
384  // TODO: Generalize to more vector types.
389 
396 
401 
402  // Deal with vec3 vector operations when widened to vec4.
407 
408  // Deal with vec5/6/7 vector operations when widened to vec8.
417 
418  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
419  // and output demarshalling
422 
423  // We can't return success/failure, only the old value,
424  // let LLVM add the comparison
427 
428  if (Subtarget->hasFlatAddressSpace()) {
431  }
432 
435 
436  // FIXME: This should be narrowed to i32, but that only happens if i64 is
437  // illegal.
438  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
441 
442  // This is s_memtime on SI and s_memrealtime on VI.
446 
447  if (Subtarget->has16BitInsts()) {
453  }
454 
455  if (Subtarget->hasMadMacF32Insts())
457 
458  if (!Subtarget->hasBFI()) {
459  // fcopysign can be done in a single instruction with BFI.
462  }
463 
464  if (!Subtarget->hasBCNT(32))
466 
467  if (!Subtarget->hasBCNT(64))
469 
470  if (Subtarget->hasFFBH()) {
473  }
474 
475  if (Subtarget->hasFFBL()) {
478  }
479 
480  // We only really have 32-bit BFE instructions (and 16-bit on VI).
481  //
482  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
483  // effort to match them now. We want this to be false for i64 cases when the
484  // extraction isn't restricted to the upper or lower half. Ideally we would
485  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
486  // span the midpoint are probably relatively rare, so don't worry about them
487  // for now.
488  if (Subtarget->hasBFE())
489  setHasExtractBitsInsn(true);
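// For reference, the 32-bit BFE the comment above refers to extracts a
// contiguous bitfield: dst = (src >> offset) & ((1 << width) - 1). A hedged
// scalar model of that semantics (illustration only, not part of this file):
#if 0
static uint32_t BFEModelU32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  Offset &= 31;
  Width &= 31;          // a width of 0 yields 0 on the unsigned form
  if (Width == 0)
    return 0;
  return (Src >> Offset) & ((1u << Width) - 1u);
}
#endif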
490 
491  // Clamp modifier on add/sub
492  if (Subtarget->hasIntClamp()) {
495  }
496 
497  if (Subtarget->hasAddNoCarry()) {
502  }
503 
508 
509 
510  // These are really only legal for ieee_mode functions. We should be avoiding
511  // them for functions that don't have ieee_mode enabled, so just say they are
512  // legal.
517 
518 
519  if (Subtarget->haveRoundOpsF64()) {
523  } else {
528  }
529 
531 
536 
537  if (Subtarget->has16BitInsts()) {
539 
542 
545 
548 
551 
558 
560 
566 
568 
570 
572 
574 
579 
582 
583  // F16 - Constant Actions.
585 
586  // F16 - Load/Store Actions.
591 
592  // F16 - VOP1 Actions.
596 
599 
605 
606  // F16 - VOP2 Actions.
609 
611 
612  // F16 - VOP3 Actions.
614  if (STI.hasMadF16())
616 
617  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
618  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
619  switch (Op) {
620  case ISD::LOAD:
621  case ISD::STORE:
622  case ISD::BUILD_VECTOR:
623  case ISD::BITCAST:
629  break;
630  case ISD::CONCAT_VECTORS:
632  break;
633  default:
635  break;
636  }
637  }
638  }
639 
640  // v_perm_b32 can handle either of these.
644 
645  // XXX - Do these do anything? Vector constants turn into build_vector.
648 
651 
656 
661 
668 
673 
678 
683 
687 
688  if (!Subtarget->hasVOP3PInsts()) {
691  }
692 
694  // This isn't really legal, but this avoids the legalizer unrolling it (and
695  // allows matching fneg (fabs x) patterns)
697 
702 
705 
708  }
709 
710  if (Subtarget->hasVOP3PInsts()) {
721 
726 
730 
733 
735 
738 
741 
748 
753 
758 
762 
765 
769 
773 
774  if (Subtarget->hasPackedFP32Ops()) {
779 
780  for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
784  }
785  }
786  }
787 
790 
791  if (Subtarget->has16BitInsts()) {
796  } else {
797  // Legalization hack.
800 
803  }
804 
807  }
808 
811 
819 
831 
842 
870 
871  // All memory operations. Some folding on the pointer operand is done to help
872  // matching the constant offsets in the addressing modes.
893 
894  // FIXME: In other contexts we pretend this is a per-function property.
895  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
896 
898 }
899 
901  return Subtarget;
902 }
903 
904 //===----------------------------------------------------------------------===//
905 // TargetLowering queries
906 //===----------------------------------------------------------------------===//
907 
908 // v_mad_mix* support a conversion from f16 to f32.
909 //
910 // There is only one special case, when denormals are enabled, where this is
911 // OK to use, but we don't currently handle it.
912 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
913  EVT DestVT, EVT SrcVT) const {
914  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
915  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
916  DestVT.getScalarType() == MVT::f32 &&
917  SrcVT.getScalarType() == MVT::f16 &&
918  // TODO: This probably only requires no input flushing?
920 }
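// Usage sketch for the hook above (hypothetical names TLI and DAG; not code
// from this file): the DAG combiner asks this before folding an fpext into an
// FMA/FMAD operand so that f16 sources can feed v_fma_mix_f32 / v_mad_mix_f32.
#if 0
  bool CanFold = TLI.isFPExtFoldable(DAG, ISD::FMA, MVT::f32, MVT::f16);
  // True only on subtargets with the mixed-precision FMA and with the denormal
  // mode the check above requires; then fma(fpext a, fpext b, c) selects to a
  // single mixed-precision instruction instead of two conversions plus an FMA.
#endif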
921 
923  // SI has some legal vector types, but no legal vector operations. Say no
924  // shuffles are legal in order to prefer scalarizing some vector operations.
925  return false;
926 }
927 
929  CallingConv::ID CC,
930  EVT VT) const {
931  if (CC == CallingConv::AMDGPU_KERNEL)
933 
934  if (VT.isVector()) {
935  EVT ScalarVT = VT.getScalarType();
936  unsigned Size = ScalarVT.getSizeInBits();
937  if (Size == 16) {
938  if (Subtarget->has16BitInsts())
939  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
940  return VT.isInteger() ? MVT::i32 : MVT::f32;
941  }
942 
943  if (Size < 16)
944  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
945  return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
946  }
947 
948  if (VT.getSizeInBits() > 32)
949  return MVT::i32;
950 
952 }
953 
955  CallingConv::ID CC,
956  EVT VT) const {
957  if (CC == CallingConv::AMDGPU_KERNEL)
959 
960  if (VT.isVector()) {
961  unsigned NumElts = VT.getVectorNumElements();
962  EVT ScalarVT = VT.getScalarType();
963  unsigned Size = ScalarVT.getSizeInBits();
964 
965  // FIXME: Should probably promote 8-bit vectors to i16.
966  if (Size == 16 && Subtarget->has16BitInsts())
967  return (NumElts + 1) / 2;
968 
969  if (Size <= 32)
970  return NumElts;
971 
972  if (Size > 32)
973  return NumElts * ((Size + 31) / 32);
974  } else if (VT.getSizeInBits() > 32)
975  return (VT.getSizeInBits() + 31) / 32;
976 
978 }
979 
982  EVT VT, EVT &IntermediateVT,
983  unsigned &NumIntermediates, MVT &RegisterVT) const {
984  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
985  unsigned NumElts = VT.getVectorNumElements();
986  EVT ScalarVT = VT.getScalarType();
987  unsigned Size = ScalarVT.getSizeInBits();
988  // FIXME: We should fix the ABI to be the same on targets without 16-bit
989  // support, but unless we can properly handle 3-vectors, it will still be
990  // inconsistent.
991  if (Size == 16 && Subtarget->has16BitInsts()) {
992  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
993  IntermediateVT = RegisterVT;
994  NumIntermediates = (NumElts + 1) / 2;
995  return NumIntermediates;
996  }
997 
998  if (Size == 32) {
999  RegisterVT = ScalarVT.getSimpleVT();
1000  IntermediateVT = RegisterVT;
1001  NumIntermediates = NumElts;
1002  return NumIntermediates;
1003  }
1004 
1005  if (Size < 16 && Subtarget->has16BitInsts()) {
1006  // FIXME: Should probably form v2i16 pieces
1007  RegisterVT = MVT::i16;
1008  IntermediateVT = ScalarVT;
1009  NumIntermediates = NumElts;
1010  return NumIntermediates;
1011  }
1012 
1013 
1014  if (Size != 16 && Size <= 32) {
1015  RegisterVT = MVT::i32;
1016  IntermediateVT = ScalarVT;
1017  NumIntermediates = NumElts;
1018  return NumIntermediates;
1019  }
1020 
1021  if (Size > 32) {
1022  RegisterVT = MVT::i32;
1023  IntermediateVT = RegisterVT;
1024  NumIntermediates = NumElts * ((Size + 31) / 32);
1025  return NumIntermediates;
1026  }
1027  }
1028 
1030  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1031 }
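// Worked examples for the breakdown rules above (illustrative, assuming a
// subtarget with 16-bit instructions and a non-kernel calling convention):
#if 0
  // v5f16 : Size == 16 -> IntermediateVT = RegisterVT = v2f16,
  //                       NumIntermediates = (5 + 1) / 2 = 3
  // v3f32 : Size == 32 -> IntermediateVT = RegisterVT = f32,
  //                       NumIntermediates = 3
  // v3i64 : Size > 32  -> RegisterVT = i32,
  //                       NumIntermediates = 3 * ((64 + 31) / 32) = 6
#endif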
1032 
1033 static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
1034  assert(DMaskLanes != 0);
1035 
1036  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1037  unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
1038  return EVT::getVectorVT(Ty->getContext(),
1039  EVT::getEVT(VT->getElementType()),
1040  NumElts);
1041  }
1042 
1043  return EVT::getEVT(Ty);
1044 }
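// Example for the helper above (illustrative): an image load whose IR result
// type is <4 x float> but whose dmask has only two bits set really reads two
// lanes, so the memory VT is narrowed to match.
#if 0
  // Ty = <4 x float>, DMaskLanes = popcount(0b0011) = 2
  //   -> EVT::getVectorVT(Ctx, EVT::getEVT(float), 2) == v2f32
#endif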
1045 
1046 // Peek through TFE struct returns to only use the data size.
1047 static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
1048  auto *ST = dyn_cast<StructType>(Ty);
1049  if (!ST)
1050  return memVTFromImageData(Ty, DMaskLanes);
1051 
1052  // Some intrinsics return an aggregate type - special case to work out the
1053  // correct memVT.
1054  //
1055  // Only limited forms of aggregate type currently expected.
1056  if (ST->getNumContainedTypes() != 2 ||
1057  !ST->getContainedType(1)->isIntegerTy(32))
1058  return EVT();
1059  return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
1060 }
1061 
1063  const CallInst &CI,
1064  MachineFunction &MF,
1065  unsigned IntrID) const {
1066  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1067  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1069  (Intrinsic::ID)IntrID);
1070  if (Attr.hasFnAttr(Attribute::ReadNone))
1071  return false;
1072 
1074 
1075  if (RsrcIntr->IsImage) {
1076  Info.ptrVal =
1078  Info.align.reset();
1079  } else {
1080  Info.ptrVal =
1082  }
1083 
1085  if (Attr.hasFnAttr(Attribute::ReadOnly)) {
1086  unsigned DMaskLanes = 4;
1087 
1088  if (RsrcIntr->IsImage) {
1091  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1092  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1093 
1094  if (!BaseOpcode->Gather4) {
1095  // If this isn't a gather, we may have excess loaded elements in the
1096  // IR type. Check the dmask for the real number of elements loaded.
1097  unsigned DMask
1098  = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1099  DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1100  }
1101 
1102  Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
1103  } else
1104  Info.memVT = EVT::getEVT(CI.getType());
1105 
1106  // FIXME: What does alignment mean for an image?
1109  } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
1110  Info.opc = ISD::INTRINSIC_VOID;
1111 
1112  Type *DataTy = CI.getArgOperand(0)->getType();
1113  if (RsrcIntr->IsImage) {
1114  unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1115  unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1116  Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
1117  } else
1118  Info.memVT = EVT::getEVT(DataTy);
1119 
1121  } else {
1122  // Atomic
1123  Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1125  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1129 
1130  // XXX - Should this be volatile without known ordering?
1132  }
1133  return true;
1134  }
1135 
1136  switch (IntrID) {
1137  case Intrinsic::amdgcn_atomic_inc:
1138  case Intrinsic::amdgcn_atomic_dec:
1139  case Intrinsic::amdgcn_ds_ordered_add:
1140  case Intrinsic::amdgcn_ds_ordered_swap:
1141  case Intrinsic::amdgcn_ds_fadd:
1142  case Intrinsic::amdgcn_ds_fmin:
1143  case Intrinsic::amdgcn_ds_fmax: {
1145  Info.memVT = MVT::getVT(CI.getType());
1146  Info.ptrVal = CI.getOperand(0);
1147  Info.align.reset();
1149 
1150  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1151  if (!Vol->isZero())
1153 
1154  return true;
1155  }
1156  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1158 
1160  Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1161  Info.ptrVal =
1163  Info.align.reset();
1165 
1166  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1167  if (!Vol || !Vol->isZero())
1169 
1170  return true;
1171  }
1172  case Intrinsic::amdgcn_ds_append:
1173  case Intrinsic::amdgcn_ds_consume: {
1175  Info.memVT = MVT::getVT(CI.getType());
1176  Info.ptrVal = CI.getOperand(0);
1177  Info.align.reset();
1179 
1180  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1181  if (!Vol->isZero())
1183 
1184  return true;
1185  }
1186  case Intrinsic::amdgcn_global_atomic_csub: {
1188  Info.memVT = MVT::getVT(CI.getType());
1189  Info.ptrVal = CI.getOperand(0);
1190  Info.align.reset();
1194  return true;
1195  }
1196  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1199  Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1200  Info.ptrVal =
1202  Info.align.reset();
1205  return true;
1206  }
1207  case Intrinsic::amdgcn_global_atomic_fadd:
1208  case Intrinsic::amdgcn_global_atomic_fmin:
1209  case Intrinsic::amdgcn_global_atomic_fmax:
1210  case Intrinsic::amdgcn_flat_atomic_fadd:
1211  case Intrinsic::amdgcn_flat_atomic_fmin:
1212  case Intrinsic::amdgcn_flat_atomic_fmax: {
1214  Info.memVT = MVT::getVT(CI.getType());
1215  Info.ptrVal = CI.getOperand(0);
1216  Info.align.reset();
1221  return true;
1222  }
1223  case Intrinsic::amdgcn_ds_gws_init:
1224  case Intrinsic::amdgcn_ds_gws_barrier:
1225  case Intrinsic::amdgcn_ds_gws_sema_v:
1226  case Intrinsic::amdgcn_ds_gws_sema_br:
1227  case Intrinsic::amdgcn_ds_gws_sema_p:
1228  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1229  Info.opc = ISD::INTRINSIC_VOID;
1230 
1232  Info.ptrVal =
1234 
1235  // This is an abstract access, but we need to specify a type and size.
1236  Info.memVT = MVT::i32;
1237  Info.size = 4;
1238  Info.align = Align(4);
1239 
1241  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1243  return true;
1244  }
1245  default:
1246  return false;
1247  }
1248 }
1249 
1252  Type *&AccessTy) const {
1253  switch (II->getIntrinsicID()) {
1254  case Intrinsic::amdgcn_atomic_inc:
1255  case Intrinsic::amdgcn_atomic_dec:
1256  case Intrinsic::amdgcn_ds_ordered_add:
1257  case Intrinsic::amdgcn_ds_ordered_swap:
1258  case Intrinsic::amdgcn_ds_append:
1259  case Intrinsic::amdgcn_ds_consume:
1260  case Intrinsic::amdgcn_ds_fadd:
1261  case Intrinsic::amdgcn_ds_fmin:
1262  case Intrinsic::amdgcn_ds_fmax:
1263  case Intrinsic::amdgcn_global_atomic_fadd:
1264  case Intrinsic::amdgcn_flat_atomic_fadd:
1265  case Intrinsic::amdgcn_flat_atomic_fmin:
1266  case Intrinsic::amdgcn_flat_atomic_fmax:
1267  case Intrinsic::amdgcn_global_atomic_csub: {
1268  Value *Ptr = II->getArgOperand(0);
1269  AccessTy = II->getType();
1270  Ops.push_back(Ptr);
1271  return true;
1272  }
1273  default:
1274  return false;
1275  }
1276 }
1277 
1278 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1279  if (!Subtarget->hasFlatInstOffsets()) {
1280  // Flat instructions do not have offsets, and only have the register
1281  // address.
1282  return AM.BaseOffs == 0 && AM.Scale == 0;
1283  }
1284 
1285  return AM.Scale == 0 &&
1286  (AM.BaseOffs == 0 ||
1287  Subtarget->getInstrInfo()->isLegalFLATOffset(
1289 }
1290 
1292  if (Subtarget->hasFlatGlobalInsts())
1293  return AM.Scale == 0 &&
1294  (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1297 
1298  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1299  // Assume that we will use FLAT for all global memory accesses
1300  // on VI.
1301  // FIXME: This assumption is currently wrong. On VI we still use
1302  // MUBUF instructions for the r + i addressing mode. As currently
1303  // implemented, the MUBUF instructions only work on buffer < 4GB.
1304  // It may be possible to support > 4GB buffers with MUBUF instructions,
1305  // by setting the stride value in the resource descriptor which would
1306  // increase the size limit to (stride * 4GB). However, this is risky,
1307  // because it has never been validated.
1308  return isLegalFlatAddressingMode(AM);
1309  }
1310 
1311  return isLegalMUBUFAddressingMode(AM);
1312 }
1313 
1314 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1315  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1316  // additionally can do r + r + i with addr64. 32-bit has more addressing
1317  // mode options. Depending on the resource constant, it can also do
1318  // (i64 r0) + (i32 r1) * (i14 i).
1319  //
1320  // Private arrays end up using a scratch buffer most of the time, so also
1321  // assume those use MUBUF instructions. Scratch loads / stores are currently
1322  // implemented as mubuf instructions with offen bit set, so slightly
1323  // different than the normal addr64.
1324  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1325  return false;
1326 
1327  // FIXME: Since we can split immediate into soffset and immediate offset,
1328  // would it make sense to allow any immediate?
1329 
1330  switch (AM.Scale) {
1331  case 0: // r + i or just i, depending on HasBaseReg.
1332  return true;
1333  case 1:
1334  return true; // We have r + r or r + i.
1335  case 2:
1336  if (AM.HasBaseReg) {
1337  // Reject 2 * r + r.
1338  return false;
1339  }
1340 
1341  // Allow 2 * r as r + r
1342  // Or 2 * r + i is allowed as r + r + i.
1343  return true;
1344  default: // Don't allow n * r
1345  return false;
1346  }
1347 }
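// A few concrete AddrMode combinations against the rules above (illustrative):
#if 0
  // {BaseOffs = 16,   HasBaseReg = 1, Scale = 0} -> legal   (r + 16)
  // {BaseOffs = 0,    HasBaseReg = 1, Scale = 1} -> legal   (r + r)
  // {BaseOffs = 8,    HasBaseReg = 0, Scale = 2} -> legal   (r + r + 8)
  // {BaseOffs = 0,    HasBaseReg = 1, Scale = 2} -> illegal (2*r + r)
  // {BaseOffs = 5000, HasBaseReg = 1, Scale = 0} -> illegal (exceeds the 12-bit
  //                                                 unsigned MUBUF immediate)
#endif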
1348 
1350  const AddrMode &AM, Type *Ty,
1351  unsigned AS, Instruction *I) const {
1352  // No global is ever allowed as a base.
1353  if (AM.BaseGV)
1354  return false;
1355 
1356  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1357  return isLegalGlobalAddressingMode(AM);
1358 
1359  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1362  // If the offset isn't a multiple of 4, it probably isn't going to be
1363  // correctly aligned.
1364  // FIXME: Can we get the real alignment here?
1365  if (AM.BaseOffs % 4 != 0)
1366  return isLegalMUBUFAddressingMode(AM);
1367 
1368  // There are no SMRD extloads, so if we have to do a small type access we
1369  // will use a MUBUF load.
1370  // FIXME?: We also need to do this if unaligned, but we don't know the
1371  // alignment here.
1372  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1373  return isLegalGlobalAddressingMode(AM);
1374 
1375  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1376  // SMRD instructions have an 8-bit, dword offset on SI.
1377  if (!isUInt<8>(AM.BaseOffs / 4))
1378  return false;
1379  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1380  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1381  // in 8-bits, it can use a smaller encoding.
1382  if (!isUInt<32>(AM.BaseOffs / 4))
1383  return false;
1384  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1385  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1386  if (!isUInt<20>(AM.BaseOffs))
1387  return false;
1388  } else
1389  llvm_unreachable("unhandled generation");
1390 
1391  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1392  return true;
1393 
1394  if (AM.Scale == 1 && AM.HasBaseReg)
1395  return true;
1396 
1397  return false;
1398 
1399  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1400  return isLegalMUBUFAddressingMode(AM);
1401  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1402  AS == AMDGPUAS::REGION_ADDRESS) {
1403  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1404  // field.
1405  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1406  // an 8-bit dword offset but we don't know the alignment here.
1407  if (!isUInt<16>(AM.BaseOffs))
1408  return false;
1409 
1410  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1411  return true;
1412 
1413  if (AM.Scale == 1 && AM.HasBaseReg)
1414  return true;
1415 
1416  return false;
1417  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1419  // For an unknown address space, this usually means that this is for some
1420  // reason being used for pure arithmetic, and not based on some addressing
1421  // computation. We don't have instructions that compute pointers with any
1422  // addressing modes, so treat them as having no offset like flat
1423  // instructions.
1424  return isLegalFlatAddressingMode(AM);
1425  }
1426 
1427  // Assume a user alias of global for unknown address spaces.
1428  return isLegalGlobalAddressingMode(AM);
1429 }
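// Summary of the SMRD/SMEM offset encodings checked above (illustrative
// restatement, not code from this file):
#if 0
  // SI  : 8-bit offset counted in dwords  -> isUInt<8>(BaseOffs / 4)
  //       BaseOffs = 1020 (255 dwords) is encodable, 1024 is not.
  // CI  : 32-bit literal dword offset     -> isUInt<32>(BaseOffs / 4)
  // VI+ : 20-bit byte offset (SMEM form)  -> isUInt<20>(BaseOffs)
#endif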
1430 
1431 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1432  const MachineFunction &MF) const {
1433  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1434  return (MemVT.getSizeInBits() <= 4 * 32);
1435  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1436  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1437  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1438  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1439  return (MemVT.getSizeInBits() <= 2 * 32);
1440  }
1441  return true;
1442 }
1443 
1445  unsigned Size, unsigned AddrSpace, Align Alignment,
1446  MachineMemOperand::Flags Flags, bool *IsFast) const {
1447  if (IsFast)
1448  *IsFast = false;
1449 
1450  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1451  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1452  // Check if alignment requirements for ds_read/write instructions are
1453  // disabled.
1454  if (Subtarget->hasUnalignedDSAccessEnabled() &&
1455  !Subtarget->hasLDSMisalignedBug()) {
1456  if (IsFast)
1457  *IsFast = Alignment != Align(2);
1458  return true;
1459  }
1460 
1461  // Either the alignment requirements are "enabled", or there is an
1462  // unaligned LDS access related hardware bug even though alignment
1463  // requirements are "disabled". In either case, we need to check for proper
1464  // alignment requirements.
1465  //
1466  if (Size == 64) {
1467  // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1468  // can do a 4 byte aligned, 8 byte access in a single operation using
1469  // ds_read2/write2_b32 with adjacent offsets.
1470  bool AlignedBy4 = Alignment >= Align(4);
1471  if (IsFast)
1472  *IsFast = AlignedBy4;
1473 
1474  return AlignedBy4;
1475  }
1476  if (Size == 96) {
1477  // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1478  // gfx8 and older.
1479  bool AlignedBy16 = Alignment >= Align(16);
1480  if (IsFast)
1481  *IsFast = AlignedBy16;
1482 
1483  return AlignedBy16;
1484  }
1485  if (Size == 128) {
1486  // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1487  // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1488  // single operation using ds_read2/write2_b64.
1489  bool AlignedBy8 = Alignment >= Align(8);
1490  if (IsFast)
1491  *IsFast = AlignedBy8;
1492 
1493  return AlignedBy8;
1494  }
1495  }
1496 
1497  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1498  bool AlignedBy4 = Alignment >= Align(4);
1499  if (IsFast)
1500  *IsFast = AlignedBy4;
1501 
1502  return AlignedBy4 ||
1503  Subtarget->enableFlatScratch() ||
1504  Subtarget->hasUnalignedScratchAccess();
1505  }
1506 
1507  // FIXME: We have to be conservative here and assume that flat operations
1508  // will access scratch. If we had access to the IR function, then we
1509  // could determine if any private memory was used in the function.
1510  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1511  !Subtarget->hasUnalignedScratchAccess()) {
1512  bool AlignedBy4 = Alignment >= Align(4);
1513  if (IsFast)
1514  *IsFast = AlignedBy4;
1515 
1516  return AlignedBy4;
1517  }
1518 
1519  if (Subtarget->hasUnalignedBufferAccessEnabled() &&
1520  !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1521  AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1522  // If we have a uniform constant load, it still requires using a slow
1523  // buffer instruction if unaligned.
1524  if (IsFast) {
1525  // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1526  // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1527  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1528  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1529  Alignment >= Align(4) : Alignment != Align(2);
1530  }
1531 
1532  return true;
1533  }
1534 
1535  // Smaller-than-dword values must be aligned.
1536  if (Size < 32)
1537  return false;
1538 
1539  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1540  // byte-address are ignored, thus forcing Dword alignment.
1541  // This applies to private, global, and constant memory.
1542  if (IsFast)
1543  *IsFast = true;
1544 
1545  return Size >= 32 && Alignment >= Align(4);
1546 }
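// Example of the LDS special cases above (illustrative): an 8-byte LDS access
// that is only 4-byte aligned does not need ds_read_b64; it can be issued as a
// single ds_read2_b32 with adjacent dword offsets.
#if 0
  // ds_read2_b32 v[0:1], v2 offset0:0 offset1:1  ; 8 bytes at 4-byte alignment
#endif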
1547 
1549  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1550  bool *IsFast) const {
1551  if (IsFast)
1552  *IsFast = false;
1553 
1554  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1555  // which isn't a simple VT.
1556  // Until MVT is extended to handle this, simply check for the size and
1557  // rely on the condition below: allow accesses if the size is a multiple of 4.
1558  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1559  VT.getStoreSize() > 16)) {
1560  return false;
1561  }
1562 
1563  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1564  Alignment, Flags, IsFast);
1565 }
1566 
1568  const MemOp &Op, const AttributeList &FuncAttributes) const {
1569  // FIXME: Should account for address space here.
1570 
1571  // The default fallback uses the private pointer size as a guess for a type to
1572  // use. Make sure we switch these to 64-bit accesses.
1573 
1574  if (Op.size() >= 16 &&
1575  Op.isDstAligned(Align(4))) // XXX: Should only do for global
1576  return MVT::v4i32;
1577 
1578  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1579  return MVT::v2i32;
1580 
1581  // Use the default.
1582  return MVT::Other;
1583 }
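// Usage sketch for the hook above (hypothetical names; TLI, F, and the exact
// MemOp flags are assumptions): the generic memcpy/memset expansion queries it
// to pick a wide chunk type instead of the 32-bit private-pointer default.
#if 0
  MemOp Copy = MemOp::Copy(/*Size=*/64, /*DstAlignCanChange=*/false,
                           /*DstAlign=*/Align(4), /*SrcAlign=*/Align(4),
                           /*IsVolatile=*/false);
  EVT ChunkVT = TLI.getOptimalMemOpType(Copy, F.getAttributes());
  // ChunkVT == MVT::v4i32 here; an 8-byte, 4-byte aligned copy would get
  // MVT::v2i32, and anything smaller falls back to MVT::Other (the default).
#endif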
1584 
1586  const MemSDNode *MemNode = cast<MemSDNode>(N);
1587  const Value *Ptr = MemNode->getMemOperand()->getValue();
1588  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1589  return I && I->getMetadata("amdgpu.noclobber");
1590 }
1591 
1593  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1595 }
1596 
1598  unsigned DestAS) const {
1599  // Flat -> private/local is a simple truncate.
1600  // Flat -> global is no-op
1601  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1602  return true;
1603 
1604  const GCNTargetMachine &TM =
1605  static_cast<const GCNTargetMachine &>(getTargetMachine());
1606  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1607 }
1608 
1610  const MemSDNode *MemNode = cast<MemSDNode>(N);
1611 
1612  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1613 }
1614 
1617  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1621 }
1622 
1624  Type *Ty) const {
1625  // FIXME: Could be smarter if called for vector constants.
1626  return true;
1627 }
1628 
1630  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1631  switch (Op) {
1632  case ISD::LOAD:
1633  case ISD::STORE:
1634 
1635  // These operations are done with 32-bit instructions anyway.
1636  case ISD::AND:
1637  case ISD::OR:
1638  case ISD::XOR:
1639  case ISD::SELECT:
1640  // TODO: Extensions?
1641  return true;
1642  default:
1643  return false;
1644  }
1645  }
1646 
1647  // SimplifySetCC uses this function to determine whether or not it should
1648  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1649  if (VT == MVT::i1 && Op == ISD::SETCC)
1650  return false;
1651 
1653 }
1654 
1655 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1656  const SDLoc &SL,
1657  SDValue Chain,
1658  uint64_t Offset) const {
1659  const DataLayout &DL = DAG.getDataLayout();
1660  MachineFunction &MF = DAG.getMachineFunction();
1662 
1663  const ArgDescriptor *InputPtrReg;
1664  const TargetRegisterClass *RC;
1665  LLT ArgTy;
1667 
1668  std::tie(InputPtrReg, RC, ArgTy) =
1670 
1671  // We may not have the kernarg segment argument if we have no kernel
1672  // arguments.
1673  if (!InputPtrReg)
1674  return DAG.getConstant(0, SL, PtrVT);
1675 
1677  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1678  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1679 
1680  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1681 }
1682 
1683 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1684  const SDLoc &SL) const {
1686  FIRST_IMPLICIT);
1687  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1688 }
1689 
1690 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1691  const SDLoc &SL, SDValue Val,
1692  bool Signed,
1693  const ISD::InputArg *Arg) const {
1694  // First, if it is a widened vector, narrow it.
1695  if (VT.isVector() &&
1696  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1697  EVT NarrowedVT =
1699  VT.getVectorNumElements());
1700  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1701  DAG.getConstant(0, SL, MVT::i32));
1702  }
1703 
1704  // Then convert the vector elements or scalar value.
1705  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1706  VT.bitsLT(MemVT)) {
1707  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1708  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1709  }
1710 
1711  if (MemVT.isFloatingPoint())
1712  Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1713  else if (Signed)
1714  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1715  else
1716  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1717 
1718  return Val;
1719 }
1720 
1721 SDValue SITargetLowering::lowerKernargMemParameter(
1722  SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1723  uint64_t Offset, Align Alignment, bool Signed,
1724  const ISD::InputArg *Arg) const {
1726 
1727  // Try to avoid using an extload by loading earlier than the argument address,
1728  // and extracting the relevant bits. The load should hopefully be merged with
1729  // the previous argument.
1730  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1731  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1732  int64_t AlignDownOffset = alignDown(Offset, 4);
1733  int64_t OffsetDiff = Offset - AlignDownOffset;
1734 
1735  EVT IntVT = MemVT.changeTypeToInteger();
1736 
1737  // TODO: If we passed in the base kernel offset we could have a better
1738  // alignment than 4, but we don't really need it.
1739  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1740  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1743 
1744  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1745  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1746 
1747  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1748  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1749  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1750 
1751 
1752  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1753  }
1754 
1755  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1756  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1759 
1760  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1761  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1762 }
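// Worked example of the sub-dword path above (illustrative): an i16 kernel
// argument at byte offset 6 with 2-byte alignment takes the early branch
// (store size 2 < 4, align 2 < 4) and is recovered from an aligned dword load
// instead of an extending load:
#if 0
  // AlignDownOffset = alignDown(6, 4) = 4, OffsetDiff = 2
  // %w  = load i32 from (kernarg_segment + 4), align 4
  // %sh = srl %w, OffsetDiff * 8 = 16
  // %v  = trunc %sh to i16, then convertArgType as usual
#endif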
1763 
1764 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1765  const SDLoc &SL, SDValue Chain,
1766  const ISD::InputArg &Arg) const {
1767  MachineFunction &MF = DAG.getMachineFunction();
1768  MachineFrameInfo &MFI = MF.getFrameInfo();
1769 
1770  if (Arg.Flags.isByVal()) {
1771  unsigned Size = Arg.Flags.getByValSize();
1772  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1773  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1774  }
1775 
1776  unsigned ArgOffset = VA.getLocMemOffset();
1777  unsigned ArgSize = VA.getValVT().getStoreSize();
1778 
1779  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1780 
1781  // Create load nodes to retrieve arguments from the stack.
1782  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1783  SDValue ArgValue;
1784 
1785  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1787  MVT MemVT = VA.getValVT();
1788 
1789  switch (VA.getLocInfo()) {
1790  default:
1791  break;
1792  case CCValAssign::BCvt:
1793  MemVT = VA.getLocVT();
1794  break;
1795  case CCValAssign::SExt:
1796  ExtType = ISD::SEXTLOAD;
1797  break;
1798  case CCValAssign::ZExt:
1799  ExtType = ISD::ZEXTLOAD;
1800  break;
1801  case CCValAssign::AExt:
1802  ExtType = ISD::EXTLOAD;
1803  break;
1804  }
1805 
1806  ArgValue = DAG.getExtLoad(
1807  ExtType, SL, VA.getLocVT(), Chain, FIN,
1809  MemVT);
1810  return ArgValue;
1811 }
1812 
1813 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1814  const SIMachineFunctionInfo &MFI,
1815  EVT VT,
1817  const ArgDescriptor *Reg;
1818  const TargetRegisterClass *RC;
1819  LLT Ty;
1820 
1821  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1822  if (!Reg) {
1823  if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
1824  // It's possible for a kernarg intrinsic call to appear in a kernel with
1825  // no allocated segment, in which case we do not add the user sgpr
1826  // argument, so just return null.
1827  return DAG.getConstant(0, SDLoc(), VT);
1828  }
1829 
1830  // It's undefined behavior if a function marked with the amdgpu-no-*
1831  // attributes uses the corresponding intrinsic.
1832  return DAG.getUNDEF(VT);
1833  }
1834 
1835  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1836 }
1837 
1839  CallingConv::ID CallConv,
1841  FunctionType *FType,
1843  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1844  const ISD::InputArg *Arg = &Ins[I];
1845 
1846  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1847  "vector type argument should have been split");
1848 
1849  // First check if it's a PS input addr.
1850  if (CallConv == CallingConv::AMDGPU_PS &&
1851  !Arg->Flags.isInReg() && PSInputNum <= 15) {
1852  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1853 
1854  // Inconveniently only the first part of the split is marked as isSplit,
1855  // so skip to the end. We only want to increment PSInputNum once for the
1856  // entire split argument.
1857  if (Arg->Flags.isSplit()) {
1858  while (!Arg->Flags.isSplitEnd()) {
1859  assert((!Arg->VT.isVector() ||
1860  Arg->VT.getScalarSizeInBits() == 16) &&
1861  "unexpected vector split in ps argument type");
1862  if (!SkipArg)
1863  Splits.push_back(*Arg);
1864  Arg = &Ins[++I];
1865  }
1866  }
1867 
1868  if (SkipArg) {
1869  // We can safely skip PS inputs.
1870  Skipped.set(Arg->getOrigArgIndex());
1871  ++PSInputNum;
1872  continue;
1873  }
1874 
1875  Info->markPSInputAllocated(PSInputNum);
1876  if (Arg->Used)
1877  Info->markPSInputEnabled(PSInputNum);
1878 
1879  ++PSInputNum;
1880  }
1881 
1882  Splits.push_back(*Arg);
1883  }
1884 }
1885 
1886 // Allocate special inputs passed in VGPRs.
1888  MachineFunction &MF,
1889  const SIRegisterInfo &TRI,
1890  SIMachineFunctionInfo &Info) const {
1891  const LLT S32 = LLT::scalar(32);
1893 
1894  if (Info.hasWorkItemIDX()) {
1895  Register Reg = AMDGPU::VGPR0;
1896  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1897 
1898  CCInfo.AllocateReg(Reg);
1899  unsigned Mask = (Subtarget->hasPackedTID() &&
1900  Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1901  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1902  }
1903 
1904  if (Info.hasWorkItemIDY()) {
1905  assert(Info.hasWorkItemIDX());
1906  if (Subtarget->hasPackedTID()) {
1907  Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1908  0x3ff << 10));
1909  } else {
1910  unsigned Reg = AMDGPU::VGPR1;
1911  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1912 
1913  CCInfo.AllocateReg(Reg);
1914  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1915  }
1916  }
1917 
1918  if (Info.hasWorkItemIDZ()) {
1919  assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1920  if (Subtarget->hasPackedTID()) {
1921  Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1922  0x3ff << 20));
1923  } else {
1924  unsigned Reg = AMDGPU::VGPR2;
1925  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1926 
1927  CCInfo.AllocateReg(Reg);
1928  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1929  }
1930  }
1931 }
1932 
1933 // Try to allocate a VGPR at the end of the argument list, or, if no argument
1934 // VGPRs are left, allocate a stack slot.
1935 // If \p Mask is given, it indicates the bitfield position in the register.
1936 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
1937 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1939  if (Arg.isSet())
1941 
1942  ArrayRef<MCPhysReg> ArgVGPRs
1943  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1944  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1945  if (RegIdx == ArgVGPRs.size()) {
1946  // Spill to stack required.
1947  int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1948 
1950  }
1951 
1952  unsigned Reg = ArgVGPRs[RegIdx];
1953  Reg = CCInfo.AllocateReg(Reg);
1954  assert(Reg != AMDGPU::NoRegister);
1955 
1956  MachineFunction &MF = CCInfo.getMachineFunction();
1957  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1958  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1960 }
1961 
1963  const TargetRegisterClass *RC,
1964  unsigned NumArgRegs) {
1965  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1966  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1967  if (RegIdx == ArgSGPRs.size())
1968  report_fatal_error("ran out of SGPRs for arguments");
1969 
1970  unsigned Reg = ArgSGPRs[RegIdx];
1971  Reg = CCInfo.AllocateReg(Reg);
1972  assert(Reg != AMDGPU::NoRegister);
1973 
1974  MachineFunction &MF = CCInfo.getMachineFunction();
1975  MF.addLiveIn(Reg, RC);
1977 }
1978 
1979 // If this has a fixed position, we still should allocate the register in the
1980 // CCInfo state. Technically we could get away with this for values passed
1981 // outside of the normal argument range.
1983  const TargetRegisterClass *RC,
1984  MCRegister Reg) {
1985  Reg = CCInfo.AllocateReg(Reg);
1986  assert(Reg != AMDGPU::NoRegister);
1987  MachineFunction &MF = CCInfo.getMachineFunction();
1988  MF.addLiveIn(Reg, RC);
1989 }
1990 
1992  if (Arg) {
1993  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
1994  Arg.getRegister());
1995  } else
1996  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1997 }
1998 
2000  if (Arg) {
2001  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2002  Arg.getRegister());
2003  } else
2004  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2005 }
2006 
2007 /// Allocate implicit function VGPR arguments at the end of allocated user
2008 /// arguments.
2010  CCState &CCInfo, MachineFunction &MF,
2011  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2012  const unsigned Mask = 0x3ff;
2014 
2015  if (Info.hasWorkItemIDX()) {
2016  Arg = allocateVGPR32Input(CCInfo, Mask);
2017  Info.setWorkItemIDX(Arg);
2018  }
2019 
2020  if (Info.hasWorkItemIDY()) {
2021  Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2022  Info.setWorkItemIDY(Arg);
2023  }
2024 
2025  if (Info.hasWorkItemIDZ())
2026  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2027 }
2028 
2029 /// Allocate implicit function VGPR arguments in fixed registers.
2031  CCState &CCInfo, MachineFunction &MF,
2032  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2033  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2034  if (!Reg)
2035  report_fatal_error("failed to allocate VGPR for implicit arguments");
2036 
2037  const unsigned Mask = 0x3ff;
2038  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2039  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2040  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2041 }
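// Layout sketch for the packed workitem IDs set up above (illustrative): all
// three IDs share one VGPR and are selected by the masks passed to
// ArgDescriptor::createRegister.
#if 0
  //   bits [ 9: 0]  workitem ID X   (mask 0x3ff)
  //   bits [19:10]  workitem ID Y   (mask 0x3ff << 10)
  //   bits [29:20]  workitem ID Z   (mask 0x3ff << 20)
  // e.g. recovering Y later amounts to (reg >> 10) & 0x3ff.
#endif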
2042 
2044  CCState &CCInfo,
2045  MachineFunction &MF,
2046  const SIRegisterInfo &TRI,
2047  SIMachineFunctionInfo &Info) const {
2048  auto &ArgInfo = Info.getArgInfo();
2049 
2050  // We need to allocate these in place regardless of their use.
2051  const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI;
2052 
2053  // TODO: Unify handling with private memory pointers.
2054  if (IsFixed || Info.hasDispatchPtr())
2055  allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2056 
2057  if (IsFixed || Info.hasQueuePtr())
2058  allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2059 
2060  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2061  // constant offset from the kernarg segment.
2062  if (IsFixed || Info.hasImplicitArgPtr())
2063  allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2064 
2065  if (IsFixed || Info.hasDispatchID())
2066  allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2067 
2068  // flat_scratch_init is not applicable for non-kernel functions.
2069 
2070  if (IsFixed || Info.hasWorkGroupIDX())
2071  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2072 
2073  if (IsFixed || Info.hasWorkGroupIDY())
2074  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2075 
2076  if (IsFixed || Info.hasWorkGroupIDZ())
2077  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2078 }
2079 
2080 // Allocate special inputs passed in user SGPRs.
2082  MachineFunction &MF,
2083  const SIRegisterInfo &TRI,
2084  SIMachineFunctionInfo &Info) const {
2085  if (Info.hasImplicitBufferPtr()) {
2086  Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2087  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2088  CCInfo.AllocateReg(ImplicitBufferPtrReg);
2089  }
2090 
2091  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2092  if (Info.hasPrivateSegmentBuffer()) {
2093  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2094  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2095  CCInfo.AllocateReg(PrivateSegmentBufferReg);
2096  }
2097 
2098  if (Info.hasDispatchPtr()) {
2099  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2100  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2101  CCInfo.AllocateReg(DispatchPtrReg);
2102  }
2103 
2104  if (Info.hasQueuePtr()) {
2105  Register QueuePtrReg = Info.addQueuePtr(TRI);
2106  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2107  CCInfo.AllocateReg(QueuePtrReg);
2108  }
2109 
2110  if (Info.hasKernargSegmentPtr()) {
2112  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2113  CCInfo.AllocateReg(InputPtrReg);
2114 
2115  Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2117  }
2118 
2119  if (Info.hasDispatchID()) {
2120  Register DispatchIDReg = Info.addDispatchID(TRI);
2121  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2122  CCInfo.AllocateReg(DispatchIDReg);
2123  }
2124 
2125  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2126  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2127  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2128  CCInfo.AllocateReg(FlatScratchInitReg);
2129  }
2130 
2131  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2132  // these from the dispatch pointer.
2133 }
2134 
2135 // Allocate special input registers that are initialized per-wave.
2137  MachineFunction &MF,
2139  CallingConv::ID CallConv,
2140  bool IsShader) const {
2141  if (Info.hasWorkGroupIDX()) {
2142  Register Reg = Info.addWorkGroupIDX();
2143  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2144  CCInfo.AllocateReg(Reg);
2145  }
2146 
2147  if (Info.hasWorkGroupIDY()) {
2148  Register Reg = Info.addWorkGroupIDY();
2149  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2150  CCInfo.AllocateReg(Reg);
2151  }
2152 
2153  if (Info.hasWorkGroupIDZ()) {
2154  Register Reg = Info.addWorkGroupIDZ();
2155  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2156  CCInfo.AllocateReg(Reg);
2157  }
2158 
2159  if (Info.hasWorkGroupInfo()) {
2160  Register Reg = Info.addWorkGroupInfo();
2161  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2162  CCInfo.AllocateReg(Reg);
2163  }
2164 
2165  if (Info.hasPrivateSegmentWaveByteOffset()) {
2166  // Scratch wave offset passed in system SGPR.
2167  unsigned PrivateSegmentWaveByteOffsetReg;
2168 
2169  if (IsShader) {
2170  PrivateSegmentWaveByteOffsetReg =
2171  Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2172 
2173  // This is true if the scratch wave byte offset doesn't have a fixed
2174  // location.
2175  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2176  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2177  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2178  }
2179  } else
2180  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2181 
2182  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2183  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2184  }
2185 }
2186 
2188  MachineFunction &MF,
2189  const SIRegisterInfo &TRI,
2191  // Now that we've figured out where the scratch register inputs are, see if
2192  // we should reserve the arguments and use them directly.
2193  MachineFrameInfo &MFI = MF.getFrameInfo();
2194  bool HasStackObjects = MFI.hasStackObjects();
2195  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2196 
2197  // Record that we know we have non-spill stack objects so we don't need to
2198  // check all stack objects later.
2199  if (HasStackObjects)
2200  Info.setHasNonSpillStackObjects(true);
2201 
2202  // Everything live out of a block is spilled with fast regalloc, so it's
2203  // almost certain that spilling will be required.
2204  if (TM.getOptLevel() == CodeGenOpt::None)
2205  HasStackObjects = true;
2206 
2207  // For now assume stack access is needed in any callee functions, so we need
2208  // the scratch registers to pass in.
2209  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2210 
2211  if (!ST.enableFlatScratch()) {
2212  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2213  // If we have stack objects, we unquestionably need the private buffer
2214  // resource. For the Code Object V2 ABI, this will be the first 4 user
2215  // SGPR inputs. We can reserve those and use them directly.
2216 
2217  Register PrivateSegmentBufferReg =
2219  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2220  } else {
2221  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2222  // We tentatively reserve the last registers (skipping the last few, which
2223  // may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2224  // we'll replace these with the ones immediately after those which were
2225  // really allocated. In the prologue copies will be inserted from the
2226  // argument to these reserved registers.
2227 
2228  // Without HSA, relocations are used for the scratch pointer and the
2229  // buffer resource setup is always inserted in the prologue. Scratch wave
2230  // offset is still in an input SGPR.
2231  Info.setScratchRSrcReg(ReservedBufferReg);
2232  }
2233  }
2234 
2236 
2237  // For entry functions we have to set up the stack pointer if we use it,
2238  // whereas non-entry functions get this "for free". This means there is no
2239  // intrinsic advantage to using S32 over S34 in cases where we do not have
2240  // calls but do need a frame pointer (i.e. if we are requested to have one
2241  // because frame pointer elimination is disabled). To keep things simple we
2242  // only ever use S32 as the call ABI stack pointer, and so using it does not
2243  // imply we need a separate frame pointer.
2244  //
2245  // Try to use s32 as the SP, but move it if it would interfere with input
2246  // arguments. This won't work with calls though.
2247  //
2248  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2249  // registers.
2250  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2251  Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2252  } else {
2254 
2255  if (MFI.hasCalls())
2256  report_fatal_error("call in graphics shader with too many input SGPRs");
2257 
2258  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2259  if (!MRI.isLiveIn(Reg)) {
2260  Info.setStackPtrOffsetReg(Reg);
2261  break;
2262  }
2263  }
2264 
2265  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2266  report_fatal_error("failed to find register for SP");
2267  }
2268 
2269  // hasFP should be accurate for entry functions even before the frame is
2270  // finalized, because it does not rely on the known stack size, only
2271  // properties like whether variable sized objects are present.
2272  if (ST.getFrameLowering()->hasFP(MF)) {
2273  Info.setFrameOffsetReg(AMDGPU::SGPR33);
2274  }
2275 }
2276 
2279  return !Info->isEntryFunction();
2280 }
2281 
2283 
2284 }
2285 
2287  MachineBasicBlock *Entry,
2288  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2290 
2291  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2292  if (!IStart)
2293  return;
2294 
2295  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2296  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2297  MachineBasicBlock::iterator MBBI = Entry->begin();
2298  for (const MCPhysReg *I = IStart; *I; ++I) {
2299  const TargetRegisterClass *RC = nullptr;
2300  if (AMDGPU::SReg_64RegClass.contains(*I))
2301  RC = &AMDGPU::SGPR_64RegClass;
2302  else if (AMDGPU::SReg_32RegClass.contains(*I))
2303  RC = &AMDGPU::SGPR_32RegClass;
2304  else
2305  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2306 
2307  Register NewVR = MRI->createVirtualRegister(RC);
2308  // Create copy from CSR to a virtual register.
2309  Entry->addLiveIn(*I);
2310  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2311  .addReg(*I);
2312 
2313  // Insert the copy-back instructions right before the terminator.
2314  for (auto *Exit : Exits)
2315  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2316  TII->get(TargetOpcode::COPY), *I)
2317  .addReg(NewVR);
2318  }
2319 }
2320 
2322  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2323  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2324  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2326 
2327  MachineFunction &MF = DAG.getMachineFunction();
2328  const Function &Fn = MF.getFunction();
2329  FunctionType *FType = MF.getFunction().getFunctionType();
2331 
2332  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2333  DiagnosticInfoUnsupported NoGraphicsHSA(
2334  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2335  DAG.getContext()->diagnose(NoGraphicsHSA);
2336  return DAG.getEntryNode();
2337  }
2338 
2339  Info->allocateModuleLDSGlobal(Fn.getParent());
2340 
2343  BitVector Skipped(Ins.size());
2344  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2345  *DAG.getContext());
2346 
2347  bool IsGraphics = AMDGPU::isGraphics(CallConv);
2348  bool IsKernel = AMDGPU::isKernel(CallConv);
2349  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2350 
2351  if (IsGraphics) {
2352  assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2353  (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2354  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2355  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2356  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2357  !Info->hasWorkItemIDZ());
2358  }
2359 
2360  if (CallConv == CallingConv::AMDGPU_PS) {
2361  processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2362 
2363  // At least one interpolation mode must be enabled or else the GPU will
2364  // hang.
2365  //
2366  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2367  // set PSInputAddr, the user wants to enable some bits after the compilation
2368  // based on run-time states. Since we can't know what the final PSInputEna
2369  // will look like, we shouldn't do anything here and the user should take
2370  // responsibility for the correct programming.
2371  //
2372  // Otherwise, the following restrictions apply:
2373  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2374  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2375  // enabled too.
2376  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2377  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2378  CCInfo.AllocateReg(AMDGPU::VGPR0);
2379  CCInfo.AllocateReg(AMDGPU::VGPR1);
2380  Info->markPSInputAllocated(0);
2381  Info->markPSInputEnabled(0);
2382  }
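  // Illustrative example: a pixel shader whose PSInputAddr has only POS_W_FLOAT
  // (bit 11) set has no PERSP_*/LINEAR_* bits in the low 7 bits, so the check
  // above force-enables input 0 (the PERSP_SAMPLE slot) and claims VGPR0/VGPR1
  // to keep the hardware from hanging with no interpolation mode enabled.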
2383  if (Subtarget->isAmdPalOS()) {
2384  // For isAmdPalOS, the user does not enable some bits after compilation
2385  // based on run-time states; the register values being generated here are
2386  // the final ones set in hardware. Therefore we need to apply the
2387  // workaround to PSInputAddr and PSInputEnable together. (The case where
2388  // a bit is set in PSInputAddr but not PSInputEnable is where the
2389  // frontend set up an input arg for a particular interpolation mode, but
2390  // nothing uses that input arg. Really we should have an earlier pass
2391  // that removes such an arg.)
2392  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2393  if ((PsInputBits & 0x7F) == 0 ||
2394  ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2395  Info->markPSInputEnabled(
2396  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2397  }
2398  } else if (IsKernel) {
2399  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2400  } else {
2401  Splits.append(Ins.begin(), Ins.end());
2402  }
2403 
2404  if (IsEntryFunc) {
2405  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2406  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2407  } else {
2408  // For the fixed ABI, pass workitem IDs in the last argument register.
2410  allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2411  }
2412 
2413  if (IsKernel) {
2415  } else {
2416  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2417  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2418  }
2419 
2420  SmallVector<SDValue, 16> Chains;
2421 
2422  // FIXME: This is the minimum kernel argument alignment. We should improve
2423  // this to the maximum alignment of the arguments.
2424  //
2425  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2426  // kern arg offset.
2427  const Align KernelArgBaseAlign = Align(16);
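  // For illustration: commonAlignment(Align(16), Offset) is the largest power
  // of two dividing both 16 and Offset, e.g. Align(16) at offset 0 or 32,
  // Align(8) at offset 8, and Align(4) at offset 4 or 12.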
2428 
2429  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2430  const ISD::InputArg &Arg = Ins[i];
2431  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2432  InVals.push_back(DAG.getUNDEF(Arg.VT));
2433  continue;
2434  }
2435 
2436  CCValAssign &VA = ArgLocs[ArgIdx++];
2437  MVT VT = VA.getLocVT();
2438 
2439  if (IsEntryFunc && VA.isMemLoc()) {
2440  VT = Ins[i].VT;
2441  EVT MemVT = VA.getLocVT();
2442 
2443  const uint64_t Offset = VA.getLocMemOffset();
2444  Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2445 
2446  if (Arg.Flags.isByRef()) {
2447  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2448 
2449  const GCNTargetMachine &TM =
2450  static_cast<const GCNTargetMachine &>(getTargetMachine());
2451  if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2452  Arg.Flags.getPointerAddrSpace())) {
2453  Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2454  Arg.Flags.getPointerAddrSpace());
2455  }
2456 
2457  InVals.push_back(Ptr);
2458  continue;
2459  }
2460 
2461  SDValue Arg = lowerKernargMemParameter(
2462  DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2463  Chains.push_back(Arg.getValue(1));
2464 
2465  auto *ParamTy =
2466  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2467  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2468  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2469  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2470  // On SI local pointers are just offsets into LDS, so they are always
2471  // less than 16 bits. On CI and newer they could potentially be
2472  // real pointers, so we can't guarantee their size.
2473  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2474  DAG.getValueType(MVT::i16));
2475  }
2476 
2477  InVals.push_back(Arg);
2478  continue;
2479  } else if (!IsEntryFunc && VA.isMemLoc()) {
2480  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2481  InVals.push_back(Val);
2482  if (!Arg.Flags.isByVal())
2483  Chains.push_back(Val.getValue(1));
2484  continue;
2485  }
2486 
2487  assert(VA.isRegLoc() && "Parameter must be in a register!");
2488 
2489  Register Reg = VA.getLocReg();
2490  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2491  EVT ValVT = VA.getValVT();
2492 
2493  Reg = MF.addLiveIn(Reg, RC);
2494  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2495 
2496  if (Arg.Flags.isSRet()) {
2497  // The return object should be reasonably addressable.
2498 
2499  // FIXME: This helps when the return is a real sret. If it is an
2500  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2501  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2502  unsigned NumBits
2503  = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2504  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2505  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2506  }
2507 
2508  // If this is an 8 or 16-bit value, it is really passed promoted
2509  // to 32 bits. Insert an assert[sz]ext to capture this, then
2510  // truncate to the right size.
2511  switch (VA.getLocInfo()) {
2512  case CCValAssign::Full:
2513  break;
2514  case CCValAssign::BCvt:
2515  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2516  break;
2517  case CCValAssign::SExt:
2518  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2519  DAG.getValueType(ValVT));
2520  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2521  break;
2522  case CCValAssign::ZExt:
2523  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2524  DAG.getValueType(ValVT));
2525  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2526  break;
2527  case CCValAssign::AExt:
2528  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2529  break;
2530  default:
2531  llvm_unreachable("Unknown loc info!");
2532  }
2533 
2534  InVals.push_back(Val);
2535  }
2536 
2537  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
2538  // Special inputs come after user arguments.
2539  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2540  }
2541 
2542  // Start adding system SGPRs.
2543  if (IsEntryFunc) {
2544  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2545  } else {
2546  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2547  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2548  }
2549 
2550  auto &ArgUsageInfo =
2552  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2553 
2554  unsigned StackArgSize = CCInfo.getNextStackOffset();
2555  Info->setBytesInStackArgArea(StackArgSize);
2556 
2557  return Chains.empty() ? Chain :
2558  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2559 }
2560 
2561 // TODO: If return values can't fit in registers, we should return as many as
2562 // possible in registers before passing on stack.
2564  CallingConv::ID CallConv,
2565  MachineFunction &MF, bool IsVarArg,
2566  const SmallVectorImpl<ISD::OutputArg> &Outs,
2567  LLVMContext &Context) const {
2568  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2569  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2570  // for shaders. Vector types should be explicitly handled by CC.
2571  if (AMDGPU::isEntryFunctionCC(CallConv))
2572  return true;
2573 
2575  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2576  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2577 }
2578 
2579 SDValue
2581  bool isVarArg,
2582  const SmallVectorImpl<ISD::OutputArg> &Outs,
2583  const SmallVectorImpl<SDValue> &OutVals,
2584  const SDLoc &DL, SelectionDAG &DAG) const {
2585  MachineFunction &MF = DAG.getMachineFunction();
2587 
2588  if (AMDGPU::isKernel(CallConv)) {
2589  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2590  OutVals, DL, DAG);
2591  }
2592 
2593  bool IsShader = AMDGPU::isShader(CallConv);
2594 
2595  Info->setIfReturnsVoid(Outs.empty());
2596  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2597 
2598  // CCValAssign - represent the assignment of the return value to a location.
2601 
2602  // CCState - Info about the registers and stack slots.
2603  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2604  *DAG.getContext());
2605 
2606  // Analyze outgoing return values.
2607  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2608 
2609  SDValue Flag;
2610  SmallVector<SDValue, 48> RetOps;
2611  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2612 
2613  // Add return address for callable functions.
2614  if (!Info->isEntryFunction()) {
2616  SDValue ReturnAddrReg = CreateLiveInRegister(
2617  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2618 
2619  SDValue ReturnAddrVirtualReg = DAG.getRegister(
2620  MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
2621  MVT::i64);
2622  Chain =
2623  DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
2624  Flag = Chain.getValue(1);
2625  RetOps.push_back(ReturnAddrVirtualReg);
2626  }
2627 
2628  // Copy the result values into the output registers.
2629  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2630  ++I, ++RealRVLocIdx) {
2631  CCValAssign &VA = RVLocs[I];
2632  assert(VA.isRegLoc() && "Can only return in registers!");
2633  // TODO: Partially return in registers if return values don't fit.
2634  SDValue Arg = OutVals[RealRVLocIdx];
2635 
2636  // Copied from other backends.
2637  switch (VA.getLocInfo()) {
2638  case CCValAssign::Full:
2639  break;
2640  case CCValAssign::BCvt:
2641  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2642  break;
2643  case CCValAssign::SExt:
2644  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2645  break;
2646  case CCValAssign::ZExt:
2647  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2648  break;
2649  case CCValAssign::AExt:
2650  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2651  break;
2652  default:
2653  llvm_unreachable("Unknown loc info!");
2654  }
2655 
2656  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2657  Flag = Chain.getValue(1);
2658  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2659  }
2660 
2661  // FIXME: Does sret work properly?
2662  if (!Info->isEntryFunction()) {
2663  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2664  const MCPhysReg *I =
2665  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2666  if (I) {
2667  for (; *I; ++I) {
2668  if (AMDGPU::SReg_64RegClass.contains(*I))
2669  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2670  else if (AMDGPU::SReg_32RegClass.contains(*I))
2671  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2672  else
2673  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2674  }
2675  }
2676  }
2677 
2678  // Update chain and glue.
2679  RetOps[0] = Chain;
2680  if (Flag.getNode())
2681  RetOps.push_back(Flag);
2682 
2683  unsigned Opc = AMDGPUISD::ENDPGM;
2684  if (!IsWaveEnd)
2685  Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2686  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2687 }
2688 
2690  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2691  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2692  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2693  SDValue ThisVal) const {
2694  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2695 
2696  // Assign locations to each value returned by this call.
2698  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2699  *DAG.getContext());
2700  CCInfo.AnalyzeCallResult(Ins, RetCC);
2701 
2702  // Copy all of the result registers out of their specified physreg.
2703  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2704  CCValAssign VA = RVLocs[i];
2705  SDValue Val;
2706 
2707  if (VA.isRegLoc()) {
2708  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2709  Chain = Val.getValue(1);
2710  InFlag = Val.getValue(2);
2711  } else if (VA.isMemLoc()) {
2712  report_fatal_error("TODO: return values in memory");
2713  } else
2714  llvm_unreachable("unknown argument location type");
2715 
2716  switch (VA.getLocInfo()) {
2717  case CCValAssign::Full:
2718  break;
2719  case CCValAssign::BCvt:
2720  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2721  break;
2722  case CCValAssign::ZExt:
2723  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2724  DAG.getValueType(VA.getValVT()));
2725  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2726  break;
2727  case CCValAssign::SExt:
2728  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2729  DAG.getValueType(VA.getValVT()));
2730  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2731  break;
2732  case CCValAssign::AExt:
2733  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2734  break;
2735  default:
2736  llvm_unreachable("Unknown loc info!");
2737  }
2738 
2739  InVals.push_back(Val);
2740  }
2741 
2742  return Chain;
2743 }
2744 
2745 // Add code to pass the special inputs required by used features, separate
2746 // from the explicit user arguments present in the IR.
2748  CallLoweringInfo &CLI,
2749  CCState &CCInfo,
2750  const SIMachineFunctionInfo &Info,
2751  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2752  SmallVectorImpl<SDValue> &MemOpChains,
2753  SDValue Chain) const {
2754  // If we don't have a call site, this was a call inserted by
2755  // legalization. These can never use special inputs.
2756  if (!CLI.CB)
2757  return;
2758 
2759  SelectionDAG &DAG = CLI.DAG;
2760  const SDLoc &DL = CLI.DL;
2761 
2762  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2763  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2764 
2765  const AMDGPUFunctionArgInfo *CalleeArgInfo
2767  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2768  auto &ArgUsageInfo =
2770  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2771  }
2772 
2773  // TODO: Unify with private memory register handling. This is complicated by
2774  // the fact that at least in kernels, the input argument is not necessarily
2775  // in the same location as the input.
2776  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2778  {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2779  {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2780  {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2781  {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2782  {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2783  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
2784  {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}
2785  };
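  // Each entry pairs an implicit ABI input with the attribute that marks it
  // unneeded (typically inferred by the AMDGPU attributor); e.g. a call site
  // or callee carrying "amdgpu-no-workgroup-id-x" lets the loop below skip
  // materializing and passing WORKGROUP_ID_X altogether.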
2786 
2787  for (auto Attr : ImplicitAttrs) {
2788  const ArgDescriptor *OutgoingArg;
2789  const TargetRegisterClass *ArgRC;
2790  LLT ArgTy;
2791 
2792  AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2793 
2794  // If the callee does not use the attribute value, skip copying the value.
2795  if (CLI.CB->hasFnAttr(Attr.second))
2796  continue;
2797 
2798  std::tie(OutgoingArg, ArgRC, ArgTy) =
2799  CalleeArgInfo->getPreloadedValue(InputID);
2800  if (!OutgoingArg)
2801  continue;
2802 
2803  const ArgDescriptor *IncomingArg;
2804  const TargetRegisterClass *IncomingArgRC;
2805  LLT Ty;
2806  std::tie(IncomingArg, IncomingArgRC, Ty) =
2807  CallerArgInfo.getPreloadedValue(InputID);
2808  assert(IncomingArgRC == ArgRC);
2809 
2810  // All special arguments are ints for now.
2811  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2812  SDValue InputReg;
2813 
2814  if (IncomingArg) {
2815  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2816  } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2817  // The implicit arg ptr is special because it doesn't have a corresponding
2818  // input for kernels, and is computed from the kernarg segment pointer.
2819  InputReg = getImplicitArgPtr(DAG, DL);
2820  } else {
2821  // We may have proven the input wasn't needed, although the ABI still
2822  // requires it. We just need to allocate the register appropriately.
2823  InputReg = DAG.getUNDEF(ArgVT);
2824  }
2825 
2826  if (OutgoingArg->isRegister()) {
2827  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2828  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2829  report_fatal_error("failed to allocate implicit input argument");
2830  } else {
2831  unsigned SpecialArgOffset =
2832  CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2833  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2834  SpecialArgOffset);
2835  MemOpChains.push_back(ArgStore);
2836  }
2837  }
2838 
2839  // Pack workitem IDs into a single register, or pass them as-is if already
2840  // packed.
2841  const ArgDescriptor *OutgoingArg;
2842  const TargetRegisterClass *ArgRC;
2843  LLT Ty;
2844 
2845  std::tie(OutgoingArg, ArgRC, Ty) =
2847  if (!OutgoingArg)
2848  std::tie(OutgoingArg, ArgRC, Ty) =
2850  if (!OutgoingArg)
2851  std::tie(OutgoingArg, ArgRC, Ty) =
2853  if (!OutgoingArg)
2854  return;
2855 
2856  const ArgDescriptor *IncomingArgX = std::get<0>(
2858  const ArgDescriptor *IncomingArgY = std::get<0>(
2860  const ArgDescriptor *IncomingArgZ = std::get<0>(
2862 
2863  SDValue InputReg;
2864  SDLoc SL;
2865 
2866  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
2867  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
2868  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
2869 
2870  // If the incoming IDs are not packed, we need to pack them.
2871  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2872  NeedWorkItemIDX)
2873  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2874 
2875  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2876  NeedWorkItemIDY) {
2877  SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2878  Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2879  DAG.getShiftAmountConstant(10, MVT::i32, SL));
2880  InputReg = InputReg.getNode() ?
2881  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2882  }
2883 
2884  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2885  NeedWorkItemIDZ) {
2886  SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2887  Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2888  DAG.getShiftAmountConstant(20, MVT::i32, SL));
2889  InputReg = InputReg.getNode() ?
2890  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2891  }
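  // The result follows the packed workitem ID layout: roughly, bits [9:0] hold
  // the X id, bits [19:10] the Y id (hence the shift by 10), and bits [29:20]
  // the Z id (hence the shift by 20), all OR'ed into a single 32-bit value.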
2892 
2893  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
2894  // Workitem IDs are already packed; any present incoming argument
2895  // will carry all required fields.
2897  IncomingArgX ? *IncomingArgX :
2898  IncomingArgY ? *IncomingArgY :
2899  *IncomingArgZ, ~0u);
2900  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2901  }
2902 
2903  if (OutgoingArg->isRegister()) {
2904  if (InputReg)
2905  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2906 
2907  CCInfo.AllocateReg(OutgoingArg->getRegister());
2908  } else {
2909  unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2910  if (InputReg) {
2911  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2912  SpecialArgOffset);
2913  MemOpChains.push_back(ArgStore);
2914  }
2915  }
2916 }
2917 
2919  return CC == CallingConv::Fast;
2920 }
2921 
2922 /// Return true if we might ever do TCO for calls with this calling convention.
2924  switch (CC) {
2925  case CallingConv::C:
2927  return true;
2928  default:
2929  return canGuaranteeTCO(CC);
2930  }
2931 }
2932 
2934  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2935  const SmallVectorImpl<ISD::OutputArg> &Outs,
2936  const SmallVectorImpl<SDValue> &OutVals,
2937  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2938  if (!mayTailCallThisCC(CalleeCC))
2939  return false;
2940 
2941  // For a divergent call target, we need to do a waterfall loop over the
2942  // possible callees which precludes us from using a simple jump.
2943  if (Callee->isDivergent())
2944  return false;
2945 
2946  MachineFunction &MF = DAG.getMachineFunction();
2947  const Function &CallerF = MF.getFunction();
2948  CallingConv::ID CallerCC = CallerF.getCallingConv();
2950  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2951 
2952  // Kernels aren't callable, and don't have a live-in return address, so it
2953  // doesn't make sense to do a tail call with entry functions.
2954  if (!CallerPreserved)
2955  return false;
2956 
2957  bool CCMatch = CallerCC == CalleeCC;
2958 
2960  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2961  return true;
2962  return false;
2963  }
2964 
2965  // TODO: Can we handle var args?
2966  if (IsVarArg)
2967  return false;
2968 
2969  for (const Argument &Arg : CallerF.args()) {
2970  if (Arg.hasByValAttr())
2971  return false;
2972  }
2973 
2974  LLVMContext &Ctx = *DAG.getContext();
2975 
2976  // Check that the call results are passed in the same way.
2977  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2978  CCAssignFnForCall(CalleeCC, IsVarArg),
2979  CCAssignFnForCall(CallerCC, IsVarArg)))
2980  return false;
2981 
2982  // The callee has to preserve all registers the caller needs to preserve.
2983  if (!CCMatch) {
2984  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2985  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2986  return false;
2987  }
2988 
2989  // Nothing more to check if the callee is taking no arguments.
2990  if (Outs.empty())
2991  return true;
2992 
2994  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2995 
2996  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2997 
2998  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2999  // If the stack arguments for this call do not fit into our own save area then
3000  // the call cannot be made tail.
3001  // TODO: Is this really necessary?
3002  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3003  return false;
3004 
3005  const MachineRegisterInfo &MRI = MF.getRegInfo();
3006  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3007 }
3008 
3010  if (!CI->isTailCall())
3011  return false;
3012 
3013  const Function *ParentFn = CI->getParent()->getParent();
3014  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3015  return false;
3016  return true;
3017 }
3018 
3019 // The wave scratch offset register is used as the global base pointer.
3021  SmallVectorImpl<SDValue> &InVals) const {
3022  SelectionDAG &DAG = CLI.DAG;
3023  const SDLoc &DL = CLI.DL;
3025  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3027  SDValue Chain = CLI.Chain;
3028  SDValue Callee = CLI.Callee;
3029  bool &IsTailCall = CLI.IsTailCall;
3030  CallingConv::ID CallConv = CLI.CallConv;
3031  bool IsVarArg = CLI.IsVarArg;
3032  bool IsSibCall = false;
3033  bool IsThisReturn = false;
3034  MachineFunction &MF = DAG.getMachineFunction();
3035 
3036  if (Callee.isUndef() || isNullConstant(Callee)) {
3037  if (!CLI.IsTailCall) {
3038  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3039  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3040  }
3041 
3042  return Chain;
3043  }
3044 
3045  if (IsVarArg) {
3046  return lowerUnhandledCall(CLI, InVals,
3047  "unsupported call to variadic function ");
3048  }
3049 
3050  if (!CLI.CB)
3051  report_fatal_error("unsupported libcall legalization");
3052 
3053  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3054  return lowerUnhandledCall(CLI, InVals,
3055  "unsupported required tail call to function ");
3056  }
3057 
3058  if (AMDGPU::isShader(CallConv)) {
3059  // Note the issue is with the CC of the called function, not of the call
3060  // itself.
3061  return lowerUnhandledCall(CLI, InVals,
3062  "unsupported call to a shader function ");
3063  }
3064 
3066  CallConv != CallingConv::AMDGPU_Gfx) {
3067  // Only allow calls with specific calling conventions.
3068  return lowerUnhandledCall(CLI, InVals,
3069  "unsupported calling convention for call from "
3070  "graphics shader of function ");
3071  }
3072 
3073  if (IsTailCall) {
3074  IsTailCall = isEligibleForTailCallOptimization(
3075  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3076  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3077  report_fatal_error("failed to perform tail call elimination on a call "
3078  "site marked musttail");
3079  }
3080 
3081  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3082 
3083  // A sibling call is one where we're under the usual C ABI and not planning
3084  // to change that but can still do a tail call:
3085  if (!TailCallOpt && IsTailCall)
3086  IsSibCall = true;
3087 
3088  if (IsTailCall)
3089  ++NumTailCalls;
3090  }
3091 
3094  SmallVector<SDValue, 8> MemOpChains;
3095 
3096  // Analyze operands of the call, assigning locations to each operand.
3098  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3099  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3100 
3102  CallConv != CallingConv::AMDGPU_Gfx) {
3103  // With a fixed ABI, allocate fixed registers before user arguments.
3104  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3105  }
3106 
3107  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3108 
3109  // Get a count of how many bytes are to be pushed on the stack.
3110  unsigned NumBytes = CCInfo.getNextStackOffset();
3111 
3112  if (IsSibCall) {
3113  // Since we're not changing the ABI to make this a tail call, the memory
3114  // operands are already available in the caller's incoming argument space.
3115  NumBytes = 0;
3116  }
3117 
3118  // FPDiff is the byte offset of the call's argument area from the callee's.
3119  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3120  // by this amount for a tail call. In a sibling call it must be 0 because the
3121  // caller will deallocate the entire stack and the callee still expects its
3122  // arguments to begin at SP+0. Completely unused for non-tail calls.
3123  int32_t FPDiff = 0;
3124  MachineFrameInfo &MFI = MF.getFrameInfo();
3125 
3126  // Adjust the stack pointer for the new arguments...
3127  // These operations are automatically eliminated by the prolog/epilog pass
3128  if (!IsSibCall) {
3129  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3130 
3131  if (!Subtarget->enableFlatScratch()) {
3132  SmallVector<SDValue, 4> CopyFromChains;
3133 
3134  // In the HSA case, this should be an identity copy.
3135  SDValue ScratchRSrcReg
3136  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3137  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3138  CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3139  Chain = DAG.getTokenFactor(DL, CopyFromChains);
3140  }
3141  }
3142 
3143  MVT PtrVT = MVT::i32;
3144 
3145  // Walk the register/memloc assignments, inserting copies/loads.
3146  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3147  CCValAssign &VA = ArgLocs[i];
3148  SDValue Arg = OutVals[i];
3149 
3150  // Promote the value if needed.
3151  switch (VA.getLocInfo()) {
3152  case CCValAssign::Full:
3153  break;
3154  case CCValAssign::BCvt:
3155  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3156  break;
3157  case CCValAssign::ZExt:
3158  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3159  break;
3160  case CCValAssign::SExt:
3161  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3162  break;
3163  case CCValAssign::AExt:
3164  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3165  break;
3166  case CCValAssign::FPExt:
3167  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3168  break;
3169  default:
3170  llvm_unreachable("Unknown loc info!");
3171  }
3172 
3173  if (VA.isRegLoc()) {
3174  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3175  } else {
3176  assert(VA.isMemLoc());
3177 
3178  SDValue DstAddr;
3179  MachinePointerInfo DstInfo;
3180 
3181  unsigned LocMemOffset = VA.getLocMemOffset();
3182  int32_t Offset = LocMemOffset;
3183 
3184  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3185  MaybeAlign Alignment;
3186 
3187  if (IsTailCall) {
3188  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3189  unsigned OpSize = Flags.isByVal() ?
3190  Flags.getByValSize() : VA.getValVT().getStoreSize();
3191 
3192  // FIXME: We can have better than the minimum byval required alignment.
3193  Alignment =
3194  Flags.isByVal()
3195  ? Flags.getNonZeroByValAlign()
3196  : commonAlignment(Subtarget->getStackAlignment(), Offset);
3197 
3198  Offset = Offset + FPDiff;
3199  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3200 
3201  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3202  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3203 
3204  // Make sure any stack arguments overlapping with where we're storing
3205  // are loaded before this eventual operation. Otherwise they'll be
3206  // clobbered.
3207 
3208  // FIXME: Why is this really necessary? This seems to just result in a
3209  // lot of code to copy the stack and write them back to the same
3210  // locations, which are supposed to be immutable?
3211  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3212  } else {
3213  // Stores to the argument stack area are relative to the stack pointer.
3214  SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3215  MVT::i32);
3216  DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3217  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3218  Alignment =
3219  commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3220  }
3221 
3222  if (Outs[i].Flags.isByVal()) {
3223  SDValue SizeNode =
3224  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3225  SDValue Cpy =
3226  DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3227  Outs[i].Flags.getNonZeroByValAlign(),
3228  /*isVol = */ false, /*AlwaysInline = */ true,
3229  /*isTailCall = */ false, DstInfo,
3231 
3232  MemOpChains.push_back(Cpy);
3233  } else {
3234  SDValue Store =
3235  DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3236  MemOpChains.push_back(Store);
3237  }
3238  }
3239  }
3240 
3242  CallConv != CallingConv::AMDGPU_Gfx) {
3243  // Copy special input registers after user input arguments.
3244  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3245  }
3246 
3247  if (!MemOpChains.empty())
3248  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3249 
3250  // Build a sequence of copy-to-reg nodes chained together with token chain
3251  // and flag operands which copy the outgoing args into the appropriate regs.
3252  SDValue InFlag;
3253  for (auto &RegToPass : RegsToPass) {
3254  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3255  RegToPass.second, InFlag);
3256  InFlag = Chain.getValue(1);
3257  }
3258 
3259 
3260  SDValue PhysReturnAddrReg;
3261  if (IsTailCall) {
3262  // Since the return is being combined with the call, we need to pass on the
3263  // return address.
3264 
3266  SDValue ReturnAddrReg = CreateLiveInRegister(
3267  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
3268 
3269  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
3270  MVT::i64);
3271  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
3272  InFlag = Chain.getValue(1);
3273  }
3274 
3275  // We don't usually want to end the call-sequence here because we would tidy
3276  // the frame up *after* the call; however, in the ABI-changing tail-call case
3277  // we've carefully laid out the parameters so that when SP is reset they'll be
3278  // in the correct location.
3279  if (IsTailCall && !IsSibCall) {
3280  Chain = DAG.getCALLSEQ_END(Chain,
3281  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
3282  DAG.getTargetConstant(0, DL, MVT::i32),
3283  InFlag, DL);
3284  InFlag = Chain.getValue(1);
3285  }
3286 
3287  std::vector<SDValue> Ops;
3288  Ops.push_back(Chain);
3289  Ops.push_back(Callee);
3290  // Add a redundant copy of the callee global which will not be legalized, as
3291  // we need direct access to the callee later.
3292  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3293  const GlobalValue *GV = GSD->getGlobal();
3294  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3295  } else {
3296  Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3297  }
3298 
3299  if (IsTailCall) {
3300  // Each tail call may have to adjust the stack by a different amount, so
3301  // this information must travel along with the operation for eventual
3302  // consumption by emitEpilogue.
3303  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3304 
3305  Ops.push_back(PhysReturnAddrReg);
3306  }
3307 
3308  // Add argument registers to the end of the list so that they are known live
3309  // into the call.
3310  for (auto &RegToPass : RegsToPass) {
3311  Ops.push_back(DAG.getRegister(RegToPass.first,
3312  RegToPass.second.getValueType()));
3313  }
3314 
3315  // Add a register mask operand representing the call-preserved registers.
3316 
3317  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3318  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3319  assert(Mask && "Missing call preserved mask for calling convention");
3320  Ops.push_back(DAG.getRegisterMask(Mask));
3321 
3322  if (InFlag.getNode())
3323  Ops.push_back(InFlag);
3324 
3325  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3326 
3327  // If we're doing a tail call, use a TC_RETURN here rather than an
3328  // actual call instruction.
3329  if (IsTailCall) {
3330  MFI.setHasTailCall();
3331  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3332  }
3333 
3334  // Returns a chain and a flag for retval copy to use.
3335  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3336  Chain = Call.getValue(0);
3337  InFlag = Call.getValue(1);
3338 
3339  uint64_t CalleePopBytes = NumBytes;
3340  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
3341  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
3342  InFlag, DL);
3343  if (!Ins.empty())
3344  InFlag = Chain.getValue(1);
3345 
3346  // Handle result values, copying them out of physregs into vregs that we
3347  // return.
3348  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3349  InVals, IsThisReturn,
3350  IsThisReturn ? OutVals[0] : SDValue());
3351 }
3352 
3353 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3354 // except for applying the wave size scale to the increment amount.
3356  SDValue Op, SelectionDAG &DAG) const {
3357  const MachineFunction &MF = DAG.getMachineFunction();
3359 
3360  SDLoc dl(Op);
3361  EVT VT = Op.getValueType();
3362  SDValue Tmp1 = Op;
3363  SDValue Tmp2 = Op.getValue(1);
3364  SDValue Tmp3 = Op.getOperand(2);
3365  SDValue Chain = Tmp1.getOperand(0);
3366 
3367  Register SPReg = Info->getStackPtrOffsetReg();
3368 
3369  // Chain the dynamic stack allocation so that it doesn't modify the stack
3370  // pointer when other instructions are using the stack.
3371  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3372 
3373  SDValue Size = Tmp2.getOperand(1);
3374  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3375  Chain = SP.getValue(1);
3376  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3377  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3378  const TargetFrameLowering *TFL = ST.getFrameLowering();
3379  unsigned Opc =
3380  TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3381  ISD::ADD : ISD::SUB;
3382 
3383  SDValue ScaledSize = DAG.getNode(
3384  ISD::SHL, dl, VT, Size,
3385  DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3386 
3387  Align StackAlign = TFL->getStackAlign();
3388  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3389  if (Alignment && *Alignment > StackAlign) {
3390  Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3391  DAG.getConstant(-(uint64_t)Alignment->value()
3392  << ST.getWavefrontSizeLog2(),
3393  dl, VT));
3394  }
3395 
3396  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3397  Tmp2 = DAG.getCALLSEQ_END(
3398  Chain, DAG.getIntPtrConstant(0, dl, true),
3399  DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
3400 
3401  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3402 }
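// Worked example (illustrative): on a wave64 subtarget getWavefrontSizeLog2()
// is 6, so a 16-byte per-lane allocation is scaled to 16 << 6 = 1024 bytes of
// scratch, and a requested alignment of 32 is likewise applied as a mask of
// -(32 << 6) when rounding the new SP value above.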
3403 
3405  SelectionDAG &DAG) const {
3406  // We only handle constant sizes here to allow non-entry block, static sized
3407  // allocas. A truly dynamic value is more difficult to support because we
3408  // don't know if the size value is uniform or not. If the size isn't uniform,
3409  // we would need to do a wave reduction to get the maximum size to know how
3410  // much to increment the uniform stack pointer.
3411  SDValue Size = Op.getOperand(1);
3412  if (isa<ConstantSDNode>(Size))
3413  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3414 
3416 }
3417 
3418 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3419  const MachineFunction &MF) const {
3420  Register Reg = StringSwitch<Register>(RegName)
3421  .Case("m0", AMDGPU::M0)
3422  .Case("exec", AMDGPU::EXEC)
3423  .Case("exec_lo", AMDGPU::EXEC_LO)
3424  .Case("exec_hi", AMDGPU::EXEC_HI)
3425  .Case("flat_scratch", AMDGPU::FLAT_SCR)
3426  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3427  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3428  .Default(Register());
3429 
3430  if (Reg == AMDGPU::NoRegister) {
3431  report_fatal_error(Twine("invalid register name \""
3432  + StringRef(RegName) + "\"."));
3433 
3434  }
3435 
3436  if (!Subtarget->hasFlatScrRegister() &&
3437  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3438  report_fatal_error(Twine("invalid register \""
3439  + StringRef(RegName) + "\" for subtarget."));
3440  }
3441 
3442  switch (Reg) {
3443  case AMDGPU::M0:
3444  case AMDGPU::EXEC_LO:
3445  case AMDGPU::EXEC_HI:
3446  case AMDGPU::FLAT_SCR_LO:
3447  case AMDGPU::FLAT_SCR_HI:
3448  if (VT.getSizeInBits() == 32)
3449  return Reg;
3450  break;
3451  case AMDGPU::EXEC:
3452  case AMDGPU::FLAT_SCR:
3453  if (VT.getSizeInBits() == 64)
3454  return Reg;
3455  break;
3456  default:
3457  llvm_unreachable("missing register type checking");
3458  }
3459 
3460  report_fatal_error(Twine("invalid type for register \""
3461  + StringRef(RegName) + "\"."));
3462 }
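// This hook backs the llvm.read_register / llvm.write_register intrinsics;
// per the size checks above, e.g. "exec" must be accessed as an i64 while
// "exec_lo" must be an i32, and the flat_scratch names are rejected on
// subtargets without a flat scratch register.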
3463 
3464 // If kill is not the last instruction, split the block so kill is always a
3465 // proper terminator.
3468  MachineBasicBlock *BB) const {
3469  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3470  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3471  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3472  return SplitBB;
3473 }
3474 
3475 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3476 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3477 // be the first instruction in the remainder block.
3478 //
3479 /// \returns { LoopBody, Remainder }
3480 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3482  MachineFunction *MF = MBB.getParent();
3484 
3485  // To insert the loop we need to split the block. Move everything after this
3486  // point to a new block, and insert a new empty block between the two.
3488  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3490  ++MBBI;
3491 
3492  MF->insert(MBBI, LoopBB);
3493  MF->insert(MBBI, RemainderBB);
3494 
3495  LoopBB->addSuccessor(LoopBB);
3496  LoopBB->addSuccessor(RemainderBB);
3497 
3498  // Move the rest of the block into a new block.
3499  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3500 
3501  if (InstInLoop) {
3502  auto Next = std::next(I);
3503 
3504  // Move instruction to loop body.
3505  LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3506 
3507  // Move the rest of the block.
3508  RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3509  } else {
3510  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3511  }
3512 
3513  MBB.addSuccessor(LoopBB);
3514 
3515  return std::make_pair(LoopBB, RemainderBB);
3516 }
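// Resulting CFG, roughly: MBB falls through only to LoopBB; LoopBB has itself
// (the back edge) and RemainderBB (the exit) as successors; RemainderBB takes
// over MBB's original successors and the associated PHIs.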
3517 
3518 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3520  MachineBasicBlock *MBB = MI.getParent();
3521  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3522  auto I = MI.getIterator();
3523  auto E = std::next(I);
3524 
3525  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3526  .addImm(0);
3527 
3528  MIBundleBuilder Bundler(*MBB, I, E);
3529  finalizeBundle(*MBB, Bundler.begin());
3530 }
3531 
3534  MachineBasicBlock *BB) const {
3535  const DebugLoc &DL = MI.getDebugLoc();
3536 
3537  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3538 
3539  MachineBasicBlock *LoopBB;
3540  MachineBasicBlock *RemainderBB;
3541  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3542 
3543  // Apparently kill flags are only valid if the def is in the same block?
3544  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3545  Src->setIsKill(false);
3546 
3547  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3548 
3549  MachineBasicBlock::iterator I = LoopBB->end();
3550 
3551  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3553 
3554  // Clear TRAP_STS.MEM_VIOL
3555  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3556  .addImm(0)
3557  .addImm(EncodedReg);
3558 
3560 
3561  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3562 
3563  // Load and check TRAP_STS.MEM_VIOL
3564  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3565  .addImm(EncodedReg);
3566 
3567  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3568  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3570  .addImm(0);
3571  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3572  .addMBB(LoopBB);
3573 
3574  return RemainderBB;
3575 }
3576 
3577 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3578 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3579 // will only do one iteration. In the worst case, this will loop 64 times.
3580 //
3581 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3584  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3585  const DebugLoc &DL, const MachineOperand &Idx,
3586  unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3587  unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3588  Register &SGPRIdxReg) {
3589 
3590  MachineFunction *MF = OrigBB.getParent();
3591  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3592  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3593  MachineBasicBlock::iterator I = LoopBB.begin();
3594 
3595  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3596  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3597  Register NewExec = MRI.createVirtualRegister(BoolRC);
3598  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3599  Register CondReg = MRI.createVirtualRegister(BoolRC);
3600 
3601  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3602  .addReg(InitReg)
3603  .addMBB(&OrigBB)
3604  .addReg(ResultReg)
3605  .addMBB(&LoopBB);
3606 
3607  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3608  .addReg(InitSaveExecReg)
3609  .addMBB(&OrigBB)
3610  .addReg(NewExec)
3611  .addMBB(&LoopBB);
3612 
3613  // Read the next variant <- also loop target.
3614  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3615  .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3616 
3617  // Compare the just read M0 value to all possible Idx values.
3618  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3619  .addReg(CurrentIdxReg)
3620  .addReg(Idx.getReg(), 0, Idx.getSubReg());
3621 
3622  // Update EXEC, save the original EXEC value to VCC.
3623  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3624  : AMDGPU::S_AND_SAVEEXEC_B64),
3625  NewExec)
3626  .addReg(CondReg, RegState::Kill);
3627 
3628  MRI.setSimpleHint(NewExec, CondReg);
3629 
3630  if (UseGPRIdxMode) {
3631  if (Offset == 0) {
3632  SGPRIdxReg = CurrentIdxReg;
3633  } else {
3634  SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3635  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3636  .addReg(CurrentIdxReg, RegState::Kill)
3637  .addImm(Offset);
3638  }
3639  } else {
3640  // Move index from VCC into M0
3641  if (Offset == 0) {
3642  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3643  .addReg(CurrentIdxReg, RegState::Kill);
3644  } else {
3645  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3646  .addReg(CurrentIdxReg, RegState::Kill)
3647  .addImm(Offset);
3648  }
3649  }
3650 
3651  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3652  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3653  MachineInstr *InsertPt =
3654  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3655  : AMDGPU::S_XOR_B64_term), Exec)
3656  .addReg(Exec)
3657  .addReg(NewExec);
3658 
3659  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3660  // s_cbranch_scc0?
3661 
3662  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3663  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3664  .addMBB(&LoopBB);
3665 
3666  return InsertPt->getIterator();
3667 }
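// Per iteration, the loop above: reads one active lane's index with
// V_READFIRSTLANE_B32, builds a mask of all lanes holding that same index
// (V_CMP_EQ_U32), narrows EXEC to those lanes while saving the old mask
// (S_AND_SAVEEXEC), sets M0 or the GPR index register for them, then XORs the
// handled lanes out of EXEC and branches back while any remain
// (S_CBRANCH_EXECNZ).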
3668 
3669 // This has slightly sub-optimal regalloc when the source vector is killed by
3670 // the read. The register allocator does not understand that the kill is
3671 // per-workitem, so the vector is kept alive for the whole loop and we end up
3672 // not re-using a subregister from it, using one more VGPR than necessary. That
3673 // VGPR was saved when this was expanded after register allocation.
3676  unsigned InitResultReg, unsigned PhiReg, int Offset,
3677  bool UseGPRIdxMode, Register &SGPRIdxReg) {
3678  MachineFunction *MF = MBB.getParent();
3679  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3680  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3682  const DebugLoc &DL = MI.getDebugLoc();
3684 
3685  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3686  Register DstReg = MI.getOperand(0).getReg();
3687  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3688  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3689  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3690  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3691 
3692  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3693 
3694  // Save the EXEC mask
3695  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3696  .addReg(Exec);
3697 
3698  MachineBasicBlock *LoopBB;
3699  MachineBasicBlock *RemainderBB;
3700  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3701 
3702  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3703 
3704  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3705  InitResultReg, DstReg, PhiReg, TmpExec,
3706  Offset, UseGPRIdxMode, SGPRIdxReg);
3707 
3708  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3710  ++MBBI;
3711  MF->insert(MBBI, LandingPad);
3712  LoopBB->removeSuccessor(RemainderBB);
3713  LandingPad->addSuccessor(RemainderBB);
3714  LoopBB->addSuccessor(LandingPad);
3715  MachineBasicBlock::iterator First = LandingPad->begin();
3716  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3717  .addReg(SaveExec);
3718 
3719  return InsPt;
3720 }
3721 
3722 // Returns subreg index, offset
3723 static std::pair<unsigned, int>
3725  const TargetRegisterClass *SuperRC,
3726  unsigned VecReg,
3727  int Offset) {
3728  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3729 
3730  // Skip out of bounds offsets, or else we would end up using an undefined
3731  // register.
3732  if (Offset >= NumElts || Offset < 0)
3733  return std::make_pair(AMDGPU::sub0, Offset);
3734 
3735  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3736 }
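// Worked example: for a 128-bit vector class NumElts is 4, so Offset 2 maps to
// (sub2, 0) and becomes a static subregister index, while an out-of-range
// Offset of 5 is passed through as (sub0, 5) and left to the dynamic index
// path.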
3737 
3740  int Offset) {
3741  MachineBasicBlock *MBB = MI.getParent();
3742  const DebugLoc &DL = MI.getDebugLoc();
3744 
3745  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3746 
3747  assert(Idx->getReg() != AMDGPU::NoRegister);
3748 
3749  if (Offset == 0) {
3750  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3751  } else {
3752  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3753  .add(*Idx)
3754  .addImm(Offset);
3755  }
3756 }
3757 
3760  int Offset) {
3761  MachineBasicBlock *MBB = MI.getParent();
3762  const DebugLoc &DL = MI.getDebugLoc();
3764 
3765  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3766 
3767  if (Offset == 0)
3768  return Idx->getReg();
3769 
3770  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3771  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3772  .add(*Idx)
3773  .addImm(Offset);
3774  return Tmp;
3775 }
3776 
3779  const GCNSubtarget &ST) {
3780  const SIInstrInfo *TII = ST.getInstrInfo();
3781  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3782  MachineFunction *MF = MBB.getParent();
3784 
3785  Register Dst = MI.getOperand(0).getReg();
3786  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3787  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3788  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3789 
3790  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3791  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3792 
3793  unsigned SubReg;
3794  std::tie(SubReg, Offset)
3795  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3796 
3797  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3798 
3799  // Check for a SGPR index.
3800  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3802  const DebugLoc &DL = MI.getDebugLoc();
3803 
3804  if (UseGPRIdxMode) {
3805  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3806  // to avoid interfering with other uses, so probably requires a new
3807  // optimization pass.
3809 
3810  const MCInstrDesc &GPRIDXDesc =
3811  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3812  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3813  .addReg(SrcReg)
3814  .addReg(Idx)
3815  .addImm(SubReg);
3816  } else {
3818 
3819  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3820  .addReg(SrcReg, 0, SubReg)
3821  .addReg(SrcReg, RegState::Implicit);
3822  }
3823 
3824  MI.eraseFromParent();
3825 
3826  return &MBB;
3827  }
3828 
3829  // Control flow needs to be inserted if indexing with a VGPR.
3830  const DebugLoc &DL = MI.getDebugLoc();
3832 
3833  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3834  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3835 
3836  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3837 
3838  Register SGPRIdxReg;
3839  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3840  UseGPRIdxMode, SGPRIdxReg);
3841 
3842  MachineBasicBlock *LoopBB = InsPt->getParent();
3843 
3844  if (UseGPRIdxMode) {
3845  const MCInstrDesc &GPRIDXDesc =
3846  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3847 
3848  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3849  .addReg(SrcReg)
3850  .addReg(SGPRIdxReg)
3851  .addImm(SubReg);
3852  } else {
3853  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3854  .addReg(SrcReg, 0, SubReg)
3855  .addReg(SrcReg, RegState::Implicit);
3856  }
3857 
3858  MI.eraseFromParent();
3859 
3860  return LoopBB;
3861 }
3862 
3865  const GCNSubtarget &ST) {
3866  const SIInstrInfo *TII = ST.getInstrInfo();
3867  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3868  MachineFunction *MF = MBB.getParent();
3870 
3871  Register Dst = MI.getOperand(0).getReg();
3872  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3873  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3874  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3875  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3876  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3877  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3878 
3879  // This can be an immediate, but will be folded later.
3880  assert(Val->getReg());
3881 
3882  unsigned SubReg;
3883  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3884  SrcVec->getReg(),
3885  Offset);
3886  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3887 
3888  if (Idx->getReg() == AMDGPU::NoRegister) {
3890  const DebugLoc &DL = MI.getDebugLoc();
3891 
3892  assert(Offset == 0);
3893 
3894  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3895  .add(*SrcVec)
3896  .add(*Val)
3897  .addImm(SubReg);
3898 
3899  MI.eraseFromParent();
3900  return &MBB;
3901  }
3902 
3903  // Check for a SGPR index.
3904  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3906  const DebugLoc &DL = MI.getDebugLoc();
3907 
3908  if (UseGPRIdxMode) {
3910 
3911  const MCInstrDesc &GPRIDXDesc =
3912  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3913  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3914  .addReg(SrcVec->getReg())
3915  .add(*Val)
3916  .addReg(Idx)
3917  .addImm(SubReg);
3918  } else {
3920 
3921  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3922  TRI.getRegSizeInBits(*VecRC), 32, false);
3923  BuildMI(MBB, I, DL, MovRelDesc, Dst)
3924  .addReg(SrcVec->getReg())
3925  .add(*Val)
3926  .addImm(SubReg);
3927  }
3928  MI.eraseFromParent();
3929  return &MBB;
3930  }
3931 
3932  // Control flow needs to be inserted if indexing with a VGPR.
3933  if (Val->isReg())
3934  MRI.clearKillFlags(Val->getReg());
3935 
3936  const DebugLoc &DL = MI.getDebugLoc();
3937 
3938  Register PhiReg = MRI.createVirtualRegister(VecRC);
3939 
3940  Register SGPRIdxReg;
3941  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3942  UseGPRIdxMode, SGPRIdxReg);
3943  MachineBasicBlock *LoopBB = InsPt->getParent();
3944 
3945  if (UseGPRIdxMode) {
3946  const MCInstrDesc &GPRIDXDesc =
3947  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3948 
3949  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3950  .addReg(PhiReg)
3951  .add(*Val)
3952  .addReg(SGPRIdxReg)
3953  .addImm(AMDGPU::sub0);
3954  } else {
3955  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3956  TRI.getRegSizeInBits(*VecRC), 32, false);
3957  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3958  .addReg(PhiReg)
3959  .add(*Val)
3960  .addImm(AMDGPU::sub0);
3961  }
3962 
3963  MI.eraseFromParent();
3964  return LoopBB;
3965 }
3966 
3968  MachineInstr &MI, MachineBasicBlock *BB) const {
3969 
3970  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3971  MachineFunction *MF = BB->getParent();
3973 
3974  switch (MI.getOpcode()) {
3975  case AMDGPU::S_UADDO_PSEUDO:
3976  case AMDGPU::S_USUBO_PSEUDO: {
3977  const DebugLoc &DL = MI.getDebugLoc();
<