1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/Statistic.h"
22 #include "llvm/BinaryFormat/ELF.h"
23 #include "llvm/CodeGen/Analysis.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicInst.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/IR/IntrinsicsR600.h"
32 #include "llvm/Support/KnownBits.h"
33 
34 using namespace llvm;
35 
36 #define DEBUG_TYPE "si-lower"
37 
38 STATISTIC(NumTailCalls, "Number of tail calls");
39 
40 static cl::opt<bool> DisableLoopAlignment(
41  "amdgpu-disable-loop-alignment",
42  cl::desc("Do not align and prefetch loops"),
43  cl::init(false));
44 
45 static cl::opt<bool> VGPRReserveforSGPRSpill(
46  "amdgpu-reserve-vgpr-for-sgpr-spill",
47  cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
48 
49 static cl::opt<bool> UseDivergentRegisterIndexing(
50  "amdgpu-use-divergent-register-indexing",
51  cl::Hidden,
52  cl::desc("Use indirect register addressing for divergent indexes"),
53  cl::init(false));
54 
55 static bool hasFP32Denormals(const MachineFunction &MF) {
56  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
57  return Info->getMode().allFP32Denormals();
58 }
59 
60 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
61  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
62  return Info->getMode().allFP64FP16Denormals();
63 }
64 
65 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
66  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
67  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
68  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
69  return AMDGPU::SGPR0 + Reg;
70  }
71  }
72  llvm_unreachable("Cannot allocate sgpr");
73 }
74 
75 SITargetLowering::SITargetLowering(const TargetMachine &TM,
76  const GCNSubtarget &STI)
77  : AMDGPUTargetLowering(TM, STI),
78  Subtarget(&STI) {
79  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
80  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
81 
82  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
83  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
84 
85  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
86 
87  const SIRegisterInfo *TRI = STI.getRegisterInfo();
88  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
89 
90  addRegisterClass(MVT::f64, V64RegClass);
91  addRegisterClass(MVT::v2f32, V64RegClass);
92 
93  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
94  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
95 
96  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
97  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
98 
99  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
100  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
101 
102  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
103  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
104 
105  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
106  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
107 
108  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
109  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
110 
111  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
112  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
113 
114  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
115  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
116 
117  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
118  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
119 
120  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
121  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
122 
123  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
124  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
125 
126  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
127  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
128 
129  if (Subtarget->has16BitInsts()) {
130  addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
131  addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
132 
133  // Unless there are also VOP3P operations, no operations are really legal.
134  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
135  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
136  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
137  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
138  }
139 
140  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
141  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
142 
144 
145  // The boolean content concept here is too inflexible. Compares only ever
146  // really produce a 1-bit result. Any copy/extend from these will turn into a
147  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
148  // it's what most targets use.
151 
152  // We need to custom lower vector stores from local memory
163 
174 
191 
199 
202 
207 
213 
218 
235 
244 
251 
254 
257 
261 
262 #if 0
265 #endif
266 
267  // We only support LOAD/STORE and vector manipulation ops for vectors
268  // with > 4 elements.
274  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
275  switch (Op) {
276  case ISD::LOAD:
277  case ISD::STORE:
278  case ISD::BUILD_VECTOR:
279  case ISD::BITCAST:
284  break;
286  case ISD::CONCAT_VECTORS:
288  break;
289  default:
291  break;
292  }
293  }
294  }
295 
297 
298  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
299  // is expanded to avoid having two separate loops in case the index is a VGPR.
300 
301  // Most operations are naturally 32-bit vector operations. We only support
302  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
303  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
306 
309 
312 
315  }
316 
317  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
320 
323 
326 
329  }
330 
331  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
334 
337 
340 
343  }
344 
345  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
348 
351 
354 
357  }
358 
359  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
362 
365 
368 
371  }
372 
377 
380 
381  // Avoid stack access for these.
382  // TODO: Generalize to more vector types.
387 
394 
399 
400  // Deal with vec3 vector operations when widened to vec4.
405 
406  // Deal with vec5/6/7 vector operations when widened to vec8.
415 
416  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
417  // and output demarshalling
420 
421  // We can't return success/failure, only the old value,
422  // let LLVM add the comparison
425 
426  if (Subtarget->hasFlatAddressSpace()) {
429  }
430 
433 
434  // FIXME: This should be narrowed to i32, but that only happens if i64 is
435  // illegal.
436  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
439 
440  // On SI this is s_memtime and s_memrealtime on VI.
444 
445  if (Subtarget->has16BitInsts()) {
451  }
452 
453  if (Subtarget->hasMadMacF32Insts())
455 
456  if (!Subtarget->hasBFI()) {
457  // fcopysign can be done in a single instruction with BFI.
460  }
461 
462  if (!Subtarget->hasBCNT(32))
464 
465  if (!Subtarget->hasBCNT(64))
467 
468  if (Subtarget->hasFFBH())
470 
471  if (Subtarget->hasFFBL())
473 
474  // We only really have 32-bit BFE instructions (and 16-bit on VI).
475  //
476  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
477  // effort to match them now. We want this to be false for i64 cases when the
478  // extraction isn't restricted to the upper or lower half. Ideally we would
479  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
480  // span the midpoint are probably relatively rare, so don't worry about them
481  // for now.
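  // Illustrative note (not from the original source): extracting bits [8, 24)
  // of an i64 could be done with a single 32-bit BFE on the low half, but an
  // extract of bits [20, 44) spans the 32-bit midpoint and has no single
  // 32-bit BFE equivalent.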
482  if (Subtarget->hasBFE())
483  setHasExtractBitsInsn(true);
484 
485  // Clamp modifier on add/sub
486  if (Subtarget->hasIntClamp()) {
489  }
490 
491  if (Subtarget->hasAddNoCarry()) {
496  }
497 
502 
503 
504  // These are really only legal for ieee_mode functions. We should be avoiding
505  // them for functions that don't have ieee_mode enabled, so just say they are
506  // legal.
511 
512 
513  if (Subtarget->haveRoundOpsF64()) {
517  } else {
522  }
523 
525 
530 
531  if (Subtarget->has16BitInsts()) {
533 
536 
539 
542 
545 
552 
554 
560 
562 
564 
566 
568 
573 
576 
577  // F16 - Constant Actions.
579 
580  // F16 - Load/Store Actions.
585 
586  // F16 - VOP1 Actions.
590 
593 
599 
600  // F16 - VOP2 Actions.
603 
605 
606  // F16 - VOP3 Actions.
608  if (STI.hasMadF16())
610 
611  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
612  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
613  switch (Op) {
614  case ISD::LOAD:
615  case ISD::STORE:
616  case ISD::BUILD_VECTOR:
617  case ISD::BITCAST:
623  break;
624  case ISD::CONCAT_VECTORS:
626  break;
627  default:
629  break;
630  }
631  }
632  }
633 
634  // v_perm_b32 can handle either of these.
638 
639  // XXX - Do these do anything? Vector constants turn into build_vector.
642 
645 
650 
655 
662 
667 
672 
677 
681 
682  if (!Subtarget->hasVOP3PInsts()) {
685  }
686 
688  // This isn't really legal, but this avoids the legalizer unrolling it (and
689  // allows matching fneg (fabs x) patterns)
691 
696 
699 
702  }
703 
704  if (Subtarget->hasVOP3PInsts()) {
715 
720 
724 
727 
729 
732 
735 
742 
747 
752 
756 
759 
763 
767 
768  if (Subtarget->hasPackedFP32Ops()) {
773 
774  for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
778  }
779  }
780  }
781 
784 
785  if (Subtarget->has16BitInsts()) {
790  } else {
791  // Legalization hack.
794 
797  }
798 
801  }
802 
805 
813 
825 
836 
864 
865  // All memory operations. Some folding on the pointer operand is done to help
866  // matching the constant offsets in the addressing modes.
887 
888  // FIXME: In other contexts we pretend this is a per-function property.
889  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
890 
892 }
893 
895  return Subtarget;
896 }
897 
898 //===----------------------------------------------------------------------===//
899 // TargetLowering queries
900 //===----------------------------------------------------------------------===//
901 
902 // v_mad_mix* support a conversion from f16 to f32.
903 //
904 // There is only one special case, which we don't currently handle, where this
905 // is OK to use when denormals are enabled.
906 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
907  EVT DestVT, EVT SrcVT) const {
908  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
909  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
910  DestVT.getScalarType() == MVT::f32 &&
911  SrcVT.getScalarType() == MVT::f16 &&
912  // TODO: This probably only requires no input flushing?
913  !hasFP32Denormals(DAG.getMachineFunction());
914 }
915 
917  // SI has some legal vector types, but no legal vector operations. Say no
918  // shuffles are legal in order to prefer scalarizing some vector operations.
919  return false;
920 }
921 
923  CallingConv::ID CC,
924  EVT VT) const {
925  if (CC == CallingConv::AMDGPU_KERNEL)
927 
928  if (VT.isVector()) {
929  EVT ScalarVT = VT.getScalarType();
930  unsigned Size = ScalarVT.getSizeInBits();
931  if (Size == 16) {
932  if (Subtarget->has16BitInsts())
933  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
934  return VT.isInteger() ? MVT::i32 : MVT::f32;
935  }
936 
937  if (Size < 16)
938  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
939  return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
940  }
941 
942  if (VT.getSizeInBits() > 32)
943  return MVT::i32;
944 
946 }
947 
949  CallingConv::ID CC,
950  EVT VT) const {
951  if (CC == CallingConv::AMDGPU_KERNEL)
953 
954  if (VT.isVector()) {
955  unsigned NumElts = VT.getVectorNumElements();
956  EVT ScalarVT = VT.getScalarType();
957  unsigned Size = ScalarVT.getSizeInBits();
958 
959  // FIXME: Should probably promote 8-bit vectors to i16.
960  if (Size == 16 && Subtarget->has16BitInsts())
961  return (NumElts + 1) / 2;
962 
963  if (Size <= 32)
964  return NumElts;
965 
966  if (Size > 32)
967  return NumElts * ((Size + 31) / 32);
968  } else if (VT.getSizeInBits() > 32)
969  return (VT.getSizeInBits() + 31) / 32;
970 
972 }
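// Illustrative examples of the rules above (assuming 16-bit instructions are
// available): a v3f16 argument takes (3 + 1) / 2 == 2 registers, a v3i32
// argument takes 3, and a v2i64 argument takes 2 * ((64 + 31) / 32) == 4.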
973 
976  EVT VT, EVT &IntermediateVT,
977  unsigned &NumIntermediates, MVT &RegisterVT) const {
978  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
979  unsigned NumElts = VT.getVectorNumElements();
980  EVT ScalarVT = VT.getScalarType();
981  unsigned Size = ScalarVT.getSizeInBits();
982  // FIXME: We should fix the ABI to be the same on targets without 16-bit
983  // support, but unless we can properly handle 3-vectors, it will still be
984  // inconsistent.
985  if (Size == 16 && Subtarget->has16BitInsts()) {
986  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
987  IntermediateVT = RegisterVT;
988  NumIntermediates = (NumElts + 1) / 2;
989  return NumIntermediates;
990  }
991 
992  if (Size == 32) {
993  RegisterVT = ScalarVT.getSimpleVT();
994  IntermediateVT = RegisterVT;
995  NumIntermediates = NumElts;
996  return NumIntermediates;
997  }
998 
999  if (Size < 16 && Subtarget->has16BitInsts()) {
1000  // FIXME: Should probably form v2i16 pieces
1001  RegisterVT = MVT::i16;
1002  IntermediateVT = ScalarVT;
1003  NumIntermediates = NumElts;
1004  return NumIntermediates;
1005  }
1006 
1007 
1008  if (Size != 16 && Size <= 32) {
1009  RegisterVT = MVT::i32;
1010  IntermediateVT = ScalarVT;
1011  NumIntermediates = NumElts;
1012  return NumIntermediates;
1013  }
1014 
1015  if (Size > 32) {
1016  RegisterVT = MVT::i32;
1017  IntermediateVT = RegisterVT;
1018  NumIntermediates = NumElts * ((Size + 31) / 32);
1019  return NumIntermediates;
1020  }
1021  }
1022 
1024  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1025 }
1026 
1027 static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
1028  assert(DMaskLanes != 0);
1029 
1030  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1031  unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
1032  return EVT::getVectorVT(Ty->getContext(),
1033  EVT::getEVT(VT->getElementType()),
1034  NumElts);
1035  }
1036 
1037  return EVT::getEVT(Ty);
1038 }
1039 
1040 // Peek through TFE struct returns to only use the data size.
1041 static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
1042  auto *ST = dyn_cast<StructType>(Ty);
1043  if (!ST)
1044  return memVTFromImageData(Ty, DMaskLanes);
1045 
1046  // Some intrinsics return an aggregate type - special case to work out the
1047  // correct memVT.
1048  //
1049  // Only limited forms of aggregate type currently expected.
1050  if (ST->getNumContainedTypes() != 2 ||
1051  !ST->getContainedType(1)->isIntegerTy(32))
1052  return EVT();
1053  return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
1054 }
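// Illustrative example (not from the original source): for an intrinsic
// returning { <4 x float>, i32 } with DMaskLanes == 2, only the data member is
// considered and the resulting memVT is v2f32; the trailing i32 (e.g. a TFE
// status value) is not part of the memory type.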
1055 
1057  const CallInst &CI,
1058  MachineFunction &MF,
1059  unsigned IntrID) const {
1060  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1061  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1063  (Intrinsic::ID)IntrID);
1064  if (Attr.hasFnAttribute(Attribute::ReadNone))
1065  return false;
1066 
1068 
1069  if (RsrcIntr->IsImage) {
1070  Info.ptrVal =
1072  Info.align.reset();
1073  } else {
1074  Info.ptrVal =
1076  }
1077 
1079  if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
1080  unsigned DMaskLanes = 4;
1081 
1082  if (RsrcIntr->IsImage) {
1085  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1086  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1087 
1088  if (!BaseOpcode->Gather4) {
1089  // If this isn't a gather, we may have excess loaded elements in the
1090  // IR type. Check the dmask for the real number of elements loaded.
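  // (For example, a dmask of 0b1011 loads only three of the four possible
  // components, so DMaskLanes would be 3.)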
1091  unsigned DMask
1092  = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1093  DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1094  }
1095 
1096  Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
1097  } else
1098  Info.memVT = EVT::getEVT(CI.getType());
1099 
1100  // FIXME: What does alignment mean for an image?
1103  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
1104  Info.opc = ISD::INTRINSIC_VOID;
1105 
1106  Type *DataTy = CI.getArgOperand(0)->getType();
1107  if (RsrcIntr->IsImage) {
1108  unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1109  unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1110  Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
1111  } else
1112  Info.memVT = EVT::getEVT(DataTy);
1113 
1115  } else {
1116  // Atomic
1117  Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1119  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1123 
1124  // XXX - Should this be volatile without known ordering?
1126  }
1127  return true;
1128  }
1129 
1130  switch (IntrID) {
1131  case Intrinsic::amdgcn_atomic_inc:
1132  case Intrinsic::amdgcn_atomic_dec:
1133  case Intrinsic::amdgcn_ds_ordered_add:
1134  case Intrinsic::amdgcn_ds_ordered_swap:
1135  case Intrinsic::amdgcn_ds_fadd:
1136  case Intrinsic::amdgcn_ds_fmin:
1137  case Intrinsic::amdgcn_ds_fmax: {
1139  Info.memVT = MVT::getVT(CI.getType());
1140  Info.ptrVal = CI.getOperand(0);
1141  Info.align.reset();
1143 
1144  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1145  if (!Vol->isZero())
1147 
1148  return true;
1149  }
1150  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1152 
1154  Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1155  Info.ptrVal =
1157  Info.align.reset();
1159 
1160  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1161  if (!Vol || !Vol->isZero())
1163 
1164  return true;
1165  }
1166  case Intrinsic::amdgcn_ds_append:
1167  case Intrinsic::amdgcn_ds_consume: {
1169  Info.memVT = MVT::getVT(CI.getType());
1170  Info.ptrVal = CI.getOperand(0);
1171  Info.align.reset();
1173 
1174  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1175  if (!Vol->isZero())
1177 
1178  return true;
1179  }
1180  case Intrinsic::amdgcn_global_atomic_csub: {
1182  Info.memVT = MVT::getVT(CI.getType());
1183  Info.ptrVal = CI.getOperand(0);
1184  Info.align.reset();
1188  return true;
1189  }
1190  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1193  Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1194  Info.ptrVal =
1196  Info.align.reset();
1199  return true;
1200  }
1201  case Intrinsic::amdgcn_global_atomic_fadd:
1202  case Intrinsic::amdgcn_global_atomic_fmin:
1203  case Intrinsic::amdgcn_global_atomic_fmax:
1204  case Intrinsic::amdgcn_flat_atomic_fadd:
1205  case Intrinsic::amdgcn_flat_atomic_fmin:
1206  case Intrinsic::amdgcn_flat_atomic_fmax: {
1208  Info.memVT = MVT::getVT(CI.getType());
1209  Info.ptrVal = CI.getOperand(0);
1210  Info.align.reset();
1215  return true;
1216  }
1217  case Intrinsic::amdgcn_ds_gws_init:
1218  case Intrinsic::amdgcn_ds_gws_barrier:
1219  case Intrinsic::amdgcn_ds_gws_sema_v:
1220  case Intrinsic::amdgcn_ds_gws_sema_br:
1221  case Intrinsic::amdgcn_ds_gws_sema_p:
1222  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1223  Info.opc = ISD::INTRINSIC_VOID;
1224 
1226  Info.ptrVal =
1228 
1229  // This is an abstract access, but we need to specify a type and size.
1230  Info.memVT = MVT::i32;
1231  Info.size = 4;
1232  Info.align = Align(4);
1233 
1235  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1237  return true;
1238  }
1239  default:
1240  return false;
1241  }
1242 }
1243 
1246  Type *&AccessTy) const {
1247  switch (II->getIntrinsicID()) {
1248  case Intrinsic::amdgcn_atomic_inc:
1249  case Intrinsic::amdgcn_atomic_dec:
1250  case Intrinsic::amdgcn_ds_ordered_add:
1251  case Intrinsic::amdgcn_ds_ordered_swap:
1252  case Intrinsic::amdgcn_ds_append:
1253  case Intrinsic::amdgcn_ds_consume:
1254  case Intrinsic::amdgcn_ds_fadd:
1255  case Intrinsic::amdgcn_ds_fmin:
1256  case Intrinsic::amdgcn_ds_fmax:
1257  case Intrinsic::amdgcn_global_atomic_fadd:
1258  case Intrinsic::amdgcn_flat_atomic_fadd:
1259  case Intrinsic::amdgcn_flat_atomic_fmin:
1260  case Intrinsic::amdgcn_flat_atomic_fmax:
1261  case Intrinsic::amdgcn_global_atomic_csub: {
1262  Value *Ptr = II->getArgOperand(0);
1263  AccessTy = II->getType();
1264  Ops.push_back(Ptr);
1265  return true;
1266  }
1267  default:
1268  return false;
1269  }
1270 }
1271 
1272 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1273  if (!Subtarget->hasFlatInstOffsets()) {
1274  // Flat instructions do not have offsets, and only have the register
1275  // address.
1276  return AM.BaseOffs == 0 && AM.Scale == 0;
1277  }
1278 
1279  return AM.Scale == 0 &&
1280  (AM.BaseOffs == 0 ||
1281  Subtarget->getInstrInfo()->isLegalFLATOffset(
1283 }
1284 
1286  if (Subtarget->hasFlatGlobalInsts())
1287  return AM.Scale == 0 &&
1288  (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1291 
1292  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1293  // Assume that we will use FLAT for all global memory accesses
1294  // on VI.
1295  // FIXME: This assumption is currently wrong. On VI we still use
1296  // MUBUF instructions for the r + i addressing mode. As currently
1297  // implemented, the MUBUF instructions only work on buffer < 4GB.
1298  // It may be possible to support > 4GB buffers with MUBUF instructions,
1299  // by setting the stride value in the resource descriptor which would
1300  // increase the size limit to (stride * 4GB). However, this is risky,
1301  // because it has never been validated.
1302  return isLegalFlatAddressingMode(AM);
1303  }
1304 
1305  return isLegalMUBUFAddressingMode(AM);
1306 }
1307 
1308 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1309  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1310  // additionally can do r + r + i with addr64. 32-bit has more addressing
1311  // mode options. Depending on the resource constant, it can also do
1312  // (i64 r0) + (i32 r1) * (i14 i).
1313  //
1314  // Private arrays end up using a scratch buffer most of the time, so also
1315  // assume those use MUBUF instructions. Scratch loads / stores are currently
1316  // implemented as mubuf instructions with offen bit set, so slightly
1317  // different than the normal addr64.
1318  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1319  return false;
1320 
1321  // FIXME: Since we can split immediate into soffset and immediate offset,
1322  // would it make sense to allow any immediate?
1323 
1324  switch (AM.Scale) {
1325  case 0: // r + i or just i, depending on HasBaseReg.
1326  return true;
1327  case 1:
1328  return true; // We have r + r or r + i.
1329  case 2:
1330  if (AM.HasBaseReg) {
1331  // Reject 2 * r + r.
1332  return false;
1333  }
1334 
1335  // Allow 2 * r as r + r
1336  // Or 2 * r + i is allowed as r + r + i.
1337  return true;
1338  default: // Don't allow n * r
1339  return false;
1340  }
1341 }
1342 
1344  const AddrMode &AM, Type *Ty,
1345  unsigned AS, Instruction *I) const {
1346  // No global is ever allowed as a base.
1347  if (AM.BaseGV)
1348  return false;
1349 
1350  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1351  return isLegalGlobalAddressingMode(AM);
1352 
1353  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1356  // If the offset isn't a multiple of 4, it probably isn't going to be
1357  // correctly aligned.
1358  // FIXME: Can we get the real alignment here?
1359  if (AM.BaseOffs % 4 != 0)
1360  return isLegalMUBUFAddressingMode(AM);
1361 
1362  // There are no SMRD extloads, so if we have to do a small type access we
1363  // will use a MUBUF load.
1364  // FIXME?: We also need to do this if unaligned, but we don't know the
1365  // alignment here.
1366  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1367  return isLegalGlobalAddressingMode(AM);
1368 
1369  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1370  // SMRD instructions have an 8-bit, dword offset on SI.
1371  if (!isUInt<8>(AM.BaseOffs / 4))
1372  return false;
1373  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1374  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1375  // in 8-bits, it can use a smaller encoding.
1376  if (!isUInt<32>(AM.BaseOffs / 4))
1377  return false;
1378  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1379  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1380  if (!isUInt<20>(AM.BaseOffs))
1381  return false;
1382  } else
1383  llvm_unreachable("unhandled generation");
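  // Illustrative summary of the encodings above: a byte offset of 1020 is
  // dword offset 255 and fits the 8-bit SI encoding, while 1024 (dword offset
  // 256) does not; on VI the same 1024-byte offset easily fits the 20-bit
  // byte-offset field.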
1384 
1385  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1386  return true;
1387 
1388  if (AM.Scale == 1 && AM.HasBaseReg)
1389  return true;
1390 
1391  return false;
1392 
1393  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1394  return isLegalMUBUFAddressingMode(AM);
1395  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1396  AS == AMDGPUAS::REGION_ADDRESS) {
1397  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1398  // field.
1399  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1400  // an 8-bit dword offset but we don't know the alignment here.
1401  if (!isUInt<16>(AM.BaseOffs))
1402  return false;
1403 
1404  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1405  return true;
1406 
1407  if (AM.Scale == 1 && AM.HasBaseReg)
1408  return true;
1409 
1410  return false;
1411  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1413  // For an unknown address space, this usually means that this is for some
1414  // reason being used for pure arithmetic, and not based on some addressing
1415  // computation. We don't have instructions that compute pointers with any
1416  // addressing modes, so treat them as having no offset like flat
1417  // instructions.
1418  return isLegalFlatAddressingMode(AM);
1419  }
1420 
1421  // Assume a user alias of global for unknown address spaces.
1422  return isLegalGlobalAddressingMode(AM);
1423 }
1424 
1425 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1426  const SelectionDAG &DAG) const {
1427  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1428  return (MemVT.getSizeInBits() <= 4 * 32);
1429  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1430  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1431  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1432  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1433  return (MemVT.getSizeInBits() <= 2 * 32);
1434  }
1435  return true;
1436 }
1437 
1439  unsigned Size, unsigned AddrSpace, Align Alignment,
1440  MachineMemOperand::Flags Flags, bool *IsFast) const {
1441  if (IsFast)
1442  *IsFast = false;
1443 
1444  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1445  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1446  // Check if alignment requirements for ds_read/write instructions are
1447  // disabled.
1448  if (Subtarget->hasUnalignedDSAccessEnabled() &&
1449  !Subtarget->hasLDSMisalignedBug()) {
1450  if (IsFast)
1451  *IsFast = Alignment != Align(2);
1452  return true;
1453  }
1454 
1455  // Either the alignment requirements are "enabled", or there is an
1456  // unaligned-LDS-access related hardware bug even though the alignment
1457  // requirements are "disabled". In either case, we need to check for proper alignment
1458  // requirements.
1459  //
1460  if (Size == 64) {
1461  // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we
1462  // can do a 4-byte aligned, 8-byte access in a single operation using
1463  // ds_read2/write2_b32 with adjacent offsets.
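  // Illustrative example (not from the original source): with a 4-byte
  // aligned base address in the DS address register, the two halves of an
  // 8-byte value can be read as ds_read2_b32 with offset0 = N and
  // offset1 = N + 1 (offsets are in dword units), so 8-byte alignment is not
  // required.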
1464  bool AlignedBy4 = Alignment >= Align(4);
1465  if (IsFast)
1466  *IsFast = AlignedBy4;
1467 
1468  return AlignedBy4;
1469  }
1470  if (Size == 96) {
1471  // A 12-byte access via ds_read/write_b96 requires 16-byte alignment on
1472  // gfx8 and older.
1473  bool AlignedBy16 = Alignment >= Align(16);
1474  if (IsFast)
1475  *IsFast = AlignedBy16;
1476 
1477  return AlignedBy16;
1478  }
1479  if (Size == 128) {
1480  // A 16-byte access via ds_read/write_b128 requires 16-byte alignment on
1481  // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1482  // single operation using ds_read2/write2_b64.
1483  bool AlignedBy8 = Alignment >= Align(8);
1484  if (IsFast)
1485  *IsFast = AlignedBy8;
1486 
1487  return AlignedBy8;
1488  }
1489  }
1490 
1491  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1492  bool AlignedBy4 = Alignment >= Align(4);
1493  if (IsFast)
1494  *IsFast = AlignedBy4;
1495 
1496  return AlignedBy4 ||
1497  Subtarget->enableFlatScratch() ||
1498  Subtarget->hasUnalignedScratchAccess();
1499  }
1500 
1501  // FIXME: We have to be conservative here and assume that flat operations
1502  // will access scratch. If we had access to the IR function, then we
1503  // could determine if any private memory was used in the function.
1504  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1505  !Subtarget->hasUnalignedScratchAccess()) {
1506  bool AlignedBy4 = Alignment >= Align(4);
1507  if (IsFast)
1508  *IsFast = AlignedBy4;
1509 
1510  return AlignedBy4;
1511  }
1512 
1513  if (Subtarget->hasUnalignedBufferAccessEnabled() &&
1514  !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1515  AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1516  // If we have a uniform constant load, it still requires using a slow
1517  // buffer instruction if unaligned.
1518  if (IsFast) {
1519  // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1520  // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1521  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1522  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1523  Alignment >= Align(4) : Alignment != Align(2);
1524  }
1525 
1526  return true;
1527  }
1528 
1529  // Smaller than dword value must be aligned.
1530  if (Size < 32)
1531  return false;
1532 
1533  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1534  // byte-address are ignored, thus forcing Dword alignment.
1535  // This applies to private, global, and constant memory.
1536  if (IsFast)
1537  *IsFast = true;
1538 
1539  return Size >= 32 && Alignment >= Align(4);
1540 }
1541 
1543  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1544  bool *IsFast) const {
1545  if (IsFast)
1546  *IsFast = false;
1547 
1548  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1549  // which isn't a simple VT.
1550  // Until MVT is extended to handle this, simply check for the size and
1551  // rely on the condition below: allow accesses if the size is a multiple of 4.
1552  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1553  VT.getStoreSize() > 16)) {
1554  return false;
1555  }
1556 
1557  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1558  Alignment, Flags, IsFast);
1559 }
1560 
1562  const MemOp &Op, const AttributeList &FuncAttributes) const {
1563  // FIXME: Should account for address space here.
1564 
1565  // The default fallback uses the private pointer size as a guess for a type to
1566  // use. Make sure we switch these to 64-bit accesses.
1567 
1568  if (Op.size() >= 16 &&
1569  Op.isDstAligned(Align(4))) // XXX: Should only do for global
1570  return MVT::v4i32;
1571 
1572  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1573  return MVT::v2i32;
1574 
1575  // Use the default.
1576  return MVT::Other;
1577 }
1578 
1580  const MemSDNode *MemNode = cast<MemSDNode>(N);
1581  const Value *Ptr = MemNode->getMemOperand()->getValue();
1582  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1583  return I && I->getMetadata("amdgpu.noclobber");
1584 }
1585 
1587  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1589 }
1590 
1592  unsigned DestAS) const {
1593  // Flat -> private/local is a simple truncate.
1594  // Flat -> global is no-op
1595  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1596  return true;
1597 
1598  const GCNTargetMachine &TM =
1599  static_cast<const GCNTargetMachine &>(getTargetMachine());
1600  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1601 }
1602 
1604  const MemSDNode *MemNode = cast<MemSDNode>(N);
1605 
1606  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1607 }
1608 
1611  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1615 }
1616 
1618  Type *Ty) const {
1619  // FIXME: Could be smarter if called for vector constants.
1620  return true;
1621 }
1622 
1624  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1625  switch (Op) {
1626  case ISD::LOAD:
1627  case ISD::STORE:
1628 
1629  // These operations are done with 32-bit instructions anyway.
1630  case ISD::AND:
1631  case ISD::OR:
1632  case ISD::XOR:
1633  case ISD::SELECT:
1634  // TODO: Extensions?
1635  return true;
1636  default:
1637  return false;
1638  }
1639  }
1640 
1641  // SimplifySetCC uses this function to determine whether or not it should
1642  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1643  if (VT == MVT::i1 && Op == ISD::SETCC)
1644  return false;
1645 
1647 }
1648 
1649 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1650  const SDLoc &SL,
1651  SDValue Chain,
1652  uint64_t Offset) const {
1653  const DataLayout &DL = DAG.getDataLayout();
1654  MachineFunction &MF = DAG.getMachineFunction();
1656 
1657  const ArgDescriptor *InputPtrReg;
1658  const TargetRegisterClass *RC;
1659  LLT ArgTy;
1660 
1661  std::tie(InputPtrReg, RC, ArgTy) =
1663 
1666  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1667  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1668 
1669  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1670 }
1671 
1672 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1673  const SDLoc &SL) const {
1675  FIRST_IMPLICIT);
1676  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1677 }
1678 
1679 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1680  const SDLoc &SL, SDValue Val,
1681  bool Signed,
1682  const ISD::InputArg *Arg) const {
1683  // First, if it is a widened vector, narrow it.
1684  if (VT.isVector() &&
1685  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1686  EVT NarrowedVT =
1688  VT.getVectorNumElements());
1689  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1690  DAG.getConstant(0, SL, MVT::i32));
1691  }
1692 
1693  // Then convert the vector elements or scalar value.
1694  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1695  VT.bitsLT(MemVT)) {
1696  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1697  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1698  }
1699 
1700  if (MemVT.isFloatingPoint())
1701  Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1702  else if (Signed)
1703  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1704  else
1705  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1706 
1707  return Val;
1708 }
1709 
1710 SDValue SITargetLowering::lowerKernargMemParameter(
1711  SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1712  uint64_t Offset, Align Alignment, bool Signed,
1713  const ISD::InputArg *Arg) const {
1715 
1716  // Try to avoid using an extload by loading earlier than the argument address,
1717  // and extracting the relevant bits. The load should hopefully be merged with
1718  // the previous argument.
1719  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1720  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1721  int64_t AlignDownOffset = alignDown(Offset, 4);
1722  int64_t OffsetDiff = Offset - AlignDownOffset;
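  // Worked example (illustrative): an i16 argument at Offset == 6 gives
  // AlignDownOffset == 4 and OffsetDiff == 2, so the aligned dword at offset 4
  // is loaded and shifted right by OffsetDiff * 8 == 16 bits to recover the
  // argument.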
1723 
1724  EVT IntVT = MemVT.changeTypeToInteger();
1725 
1726  // TODO: If we passed in the base kernel offset we could have a better
1727  // alignment than 4, but we don't really need it.
1728  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1729  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1732 
1733  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1734  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1735 
1736  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1737  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1738  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1739 
1740 
1741  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1742  }
1743 
1744  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1745  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1748 
1749  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1750  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1751 }
1752 
1753 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1754  const SDLoc &SL, SDValue Chain,
1755  const ISD::InputArg &Arg) const {
1756  MachineFunction &MF = DAG.getMachineFunction();
1757  MachineFrameInfo &MFI = MF.getFrameInfo();
1758 
1759  if (Arg.Flags.isByVal()) {
1760  unsigned Size = Arg.Flags.getByValSize();
1761  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1762  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1763  }
1764 
1765  unsigned ArgOffset = VA.getLocMemOffset();
1766  unsigned ArgSize = VA.getValVT().getStoreSize();
1767 
1768  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1769 
1770  // Create load nodes to retrieve arguments from the stack.
1771  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1772  SDValue ArgValue;
1773 
1774  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1776  MVT MemVT = VA.getValVT();
1777 
1778  switch (VA.getLocInfo()) {
1779  default:
1780  break;
1781  case CCValAssign::BCvt:
1782  MemVT = VA.getLocVT();
1783  break;
1784  case CCValAssign::SExt:
1785  ExtType = ISD::SEXTLOAD;
1786  break;
1787  case CCValAssign::ZExt:
1788  ExtType = ISD::ZEXTLOAD;
1789  break;
1790  case CCValAssign::AExt:
1791  ExtType = ISD::EXTLOAD;
1792  break;
1793  }
1794 
1795  ArgValue = DAG.getExtLoad(
1796  ExtType, SL, VA.getLocVT(), Chain, FIN,
1798  MemVT);
1799  return ArgValue;
1800 }
1801 
1802 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1803  const SIMachineFunctionInfo &MFI,
1804  EVT VT,
1806  const ArgDescriptor *Reg;
1807  const TargetRegisterClass *RC;
1808  LLT Ty;
1809 
1810  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1811  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1812 }
1813 
1815  CallingConv::ID CallConv,
1817  FunctionType *FType,
1819  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1820  const ISD::InputArg *Arg = &Ins[I];
1821 
1822  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1823  "vector type argument should have been split");
1824 
1825  // First check if it's a PS input addr.
1826  if (CallConv == CallingConv::AMDGPU_PS &&
1827  !Arg->Flags.isInReg() && PSInputNum <= 15) {
1828  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1829 
1830  // Inconveniently only the first part of the split is marked as isSplit,
1831  // so skip to the end. We only want to increment PSInputNum once for the
1832  // entire split argument.
1833  if (Arg->Flags.isSplit()) {
1834  while (!Arg->Flags.isSplitEnd()) {
1835  assert((!Arg->VT.isVector() ||
1836  Arg->VT.getScalarSizeInBits() == 16) &&
1837  "unexpected vector split in ps argument type");
1838  if (!SkipArg)
1839  Splits.push_back(*Arg);
1840  Arg = &Ins[++I];
1841  }
1842  }
1843 
1844  if (SkipArg) {
1845  // We can safely skip PS inputs.
1846  Skipped.set(Arg->getOrigArgIndex());
1847  ++PSInputNum;
1848  continue;
1849  }
1850 
1851  Info->markPSInputAllocated(PSInputNum);
1852  if (Arg->Used)
1853  Info->markPSInputEnabled(PSInputNum);
1854 
1855  ++PSInputNum;
1856  }
1857 
1858  Splits.push_back(*Arg);
1859  }
1860 }
1861 
1862 // Allocate special inputs passed in VGPRs.
1864  MachineFunction &MF,
1865  const SIRegisterInfo &TRI,
1866  SIMachineFunctionInfo &Info) const {
1867  const LLT S32 = LLT::scalar(32);
1869 
1870  if (Info.hasWorkItemIDX()) {
1871  Register Reg = AMDGPU::VGPR0;
1872  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1873 
1874  CCInfo.AllocateReg(Reg);
1875  unsigned Mask = (Subtarget->hasPackedTID() &&
1876  Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1877  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1878  }
1879 
1880  if (Info.hasWorkItemIDY()) {
1881  assert(Info.hasWorkItemIDX());
1882  if (Subtarget->hasPackedTID()) {
1883  Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1884  0x3ff << 10));
1885  } else {
1886  unsigned Reg = AMDGPU::VGPR1;
1887  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1888 
1889  CCInfo.AllocateReg(Reg);
1890  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1891  }
1892  }
1893 
1894  if (Info.hasWorkItemIDZ()) {
1895  assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1896  if (Subtarget->hasPackedTID()) {
1897  Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1898  0x3ff << 20));
1899  } else {
1900  unsigned Reg = AMDGPU::VGPR2;
1901  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1902 
1903  CCInfo.AllocateReg(Reg);
1904  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1905  }
1906  }
1907 }
1908 
1909 // Try to allocate a VGPR at the end of the argument list, or if no argument
1910 // VGPRs are left, allocate a stack slot.
1911 // If \p Mask is given, it indicates the bitfield position in the register.
1912 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
1913 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1915  if (Arg.isSet())
1917 
1918  ArrayRef<MCPhysReg> ArgVGPRs
1919  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1920  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1921  if (RegIdx == ArgVGPRs.size()) {
1922  // Spill to stack required.
1923  int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1924 
1926  }
1927 
1928  unsigned Reg = ArgVGPRs[RegIdx];
1929  Reg = CCInfo.AllocateReg(Reg);
1930  assert(Reg != AMDGPU::NoRegister);
1931 
1932  MachineFunction &MF = CCInfo.getMachineFunction();
1933  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1934  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1936 }
1937 
1939  const TargetRegisterClass *RC,
1940  unsigned NumArgRegs) {
1941  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1942  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1943  if (RegIdx == ArgSGPRs.size())
1944  report_fatal_error("ran out of SGPRs for arguments");
1945 
1946  unsigned Reg = ArgSGPRs[RegIdx];
1947  Reg = CCInfo.AllocateReg(Reg);
1948  assert(Reg != AMDGPU::NoRegister);
1949 
1950  MachineFunction &MF = CCInfo.getMachineFunction();
1951  MF.addLiveIn(Reg, RC);
1953 }
1954 
1955 // If this has a fixed position, we still should allocate the register in the
1956 // CCInfo state. Technically we could get away with this for values passed
1957 // outside of the normal argument range.
1959  const TargetRegisterClass *RC,
1960  MCRegister Reg) {
1961  Reg = CCInfo.AllocateReg(Reg);
1962  assert(Reg != AMDGPU::NoRegister);
1963  MachineFunction &MF = CCInfo.getMachineFunction();
1964  MF.addLiveIn(Reg, RC);
1965 }
1966 
1968  if (Arg) {
1969  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
1970  Arg.getRegister());
1971  } else
1972  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1973 }
1974 
1976  if (Arg) {
1977  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
1978  Arg.getRegister());
1979  } else
1980  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1981 }
1982 
1983 /// Allocate implicit function VGPR arguments at the end of allocated user
1984 /// arguments.
1986  CCState &CCInfo, MachineFunction &MF,
1987  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1988  const unsigned Mask = 0x3ff;
1990 
1991  if (Info.hasWorkItemIDX()) {
1992  Arg = allocateVGPR32Input(CCInfo, Mask);
1993  Info.setWorkItemIDX(Arg);
1994  }
1995 
1996  if (Info.hasWorkItemIDY()) {
1997  Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
1998  Info.setWorkItemIDY(Arg);
1999  }
2000 
2001  if (Info.hasWorkItemIDZ())
2002  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2003 }
2004 
2005 /// Allocate implicit function VGPR arguments in fixed registers.
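/// With the fixed ABI all three workitem IDs share a single VGPR, using the
/// masks set up below: bits [9:0] hold X, [19:10] hold Y, and [29:20] hold Z.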
2007  CCState &CCInfo, MachineFunction &MF,
2008  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2009  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2010  if (!Reg)
2011  report_fatal_error("failed to allocate VGPR for implicit arguments");
2012 
2013  const unsigned Mask = 0x3ff;
2014  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2015  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2016  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2017 }
2018 
2020  CCState &CCInfo,
2021  MachineFunction &MF,
2022  const SIRegisterInfo &TRI,
2023  SIMachineFunctionInfo &Info) const {
2024  auto &ArgInfo = Info.getArgInfo();
2025 
2026  // TODO: Unify handling with private memory pointers.
2027 
2028  if (Info.hasDispatchPtr())
2029  allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2030 
2031  if (Info.hasQueuePtr())
2032  allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2033 
2034  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2035  // constant offset from the kernarg segment.
2036  if (Info.hasImplicitArgPtr())
2037  allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2038 
2039  if (Info.hasDispatchID())
2040  allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2041 
2042  // flat_scratch_init is not applicable for non-kernel functions.
2043 
2044  if (Info.hasWorkGroupIDX())
2045  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2046 
2047  if (Info.hasWorkGroupIDY())
2048  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2049 
2050  if (Info.hasWorkGroupIDZ())
2051  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2052 }
2053 
2054 // Allocate special inputs passed in user SGPRs.
2056  MachineFunction &MF,
2057  const SIRegisterInfo &TRI,
2058  SIMachineFunctionInfo &Info) const {
2059  if (Info.hasImplicitBufferPtr()) {
2060  Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2061  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2062  CCInfo.AllocateReg(ImplicitBufferPtrReg);
2063  }
2064 
2065  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2066  if (Info.hasPrivateSegmentBuffer()) {
2067  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2068  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2069  CCInfo.AllocateReg(PrivateSegmentBufferReg);
2070  }
2071 
2072  if (Info.hasDispatchPtr()) {
2073  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2074  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2075  CCInfo.AllocateReg(DispatchPtrReg);
2076  }
2077 
2078  if (Info.hasQueuePtr()) {
2079  Register QueuePtrReg = Info.addQueuePtr(TRI);
2080  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2081  CCInfo.AllocateReg(QueuePtrReg);
2082  }
2083 
2084  if (Info.hasKernargSegmentPtr()) {
2086  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2087  CCInfo.AllocateReg(InputPtrReg);
2088 
2089  Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2091  }
2092 
2093  if (Info.hasDispatchID()) {
2094  Register DispatchIDReg = Info.addDispatchID(TRI);
2095  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2096  CCInfo.AllocateReg(DispatchIDReg);
2097  }
2098 
2099  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2100  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2101  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2102  CCInfo.AllocateReg(FlatScratchInitReg);
2103  }
2104 
2105  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2106  // these from the dispatch pointer.
2107 }
2108 
2109 // Allocate special input registers that are initialized per-wave.
2111  MachineFunction &MF,
2113  CallingConv::ID CallConv,
2114  bool IsShader) const {
2115  if (Info.hasWorkGroupIDX()) {
2116  Register Reg = Info.addWorkGroupIDX();
2117  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2118  CCInfo.AllocateReg(Reg);
2119  }
2120 
2121  if (Info.hasWorkGroupIDY()) {
2122  Register Reg = Info.addWorkGroupIDY();
2123  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2124  CCInfo.AllocateReg(Reg);
2125  }
2126 
2127  if (Info.hasWorkGroupIDZ()) {
2128  Register Reg = Info.addWorkGroupIDZ();
2129  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2130  CCInfo.AllocateReg(Reg);
2131  }
2132 
2133  if (Info.hasWorkGroupInfo()) {
2134  Register Reg = Info.addWorkGroupInfo();
2135  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2136  CCInfo.AllocateReg(Reg);
2137  }
2138 
2139  if (Info.hasPrivateSegmentWaveByteOffset()) {
2140  // Scratch wave offset passed in system SGPR.
2141  unsigned PrivateSegmentWaveByteOffsetReg;
2142 
2143  if (IsShader) {
2144  PrivateSegmentWaveByteOffsetReg =
2145  Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2146 
2147  // This is true if the scratch wave byte offset doesn't have a fixed
2148  // location.
2149  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2150  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2151  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2152  }
2153  } else
2154  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2155 
2156  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2157  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2158  }
2159 }
2160 
2162  MachineFunction &MF,
2163  const SIRegisterInfo &TRI,
2165  // Now that we've figured out where the scratch register inputs are, see if
2166  // we should reserve the arguments and use them directly.
2167  MachineFrameInfo &MFI = MF.getFrameInfo();
2168  bool HasStackObjects = MFI.hasStackObjects();
2169  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2170 
2171  // Record that we know we have non-spill stack objects so we don't need to
2172  // check all stack objects later.
2173  if (HasStackObjects)
2174  Info.setHasNonSpillStackObjects(true);
2175 
2176  // Everything live out of a block is spilled with fast regalloc, so it's
2177  // almost certain that spilling will be required.
2178  if (TM.getOptLevel() == CodeGenOpt::None)
2179  HasStackObjects = true;
2180 
2181  // For now assume stack access is needed in any callee functions, so we need
2182  // the scratch registers to pass in.
2183  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2184 
2185  if (!ST.enableFlatScratch()) {
2186  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2187  // If we have stack objects, we unquestionably need the private buffer
2188  // resource. For the Code Object V2 ABI, this will be the first 4 user
2189  // SGPR inputs. We can reserve those and use them directly.
2190 
2191  Register PrivateSegmentBufferReg =
2193  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2194  } else {
2195  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2196  // We tentatively reserve the last registers (skipping the last registers
2197  // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2198  // we'll replace these with the ones immediately after those which were
2199  // really allocated. In the prologue copies will be inserted from the
2200  // argument to these reserved registers.
2201 
2202  // Without HSA, relocations are used for the scratch pointer and the
2203  // buffer resource setup is always inserted in the prologue. Scratch wave
2204  // offset is still in an input SGPR.
2205  Info.setScratchRSrcReg(ReservedBufferReg);
2206  }
2207  }
2208 
2210 
2211  // For entry functions we have to set up the stack pointer if we use it,
2212  // whereas non-entry functions get this "for free". This means there is no
2213  // intrinsic advantage to using S32 over S34 in cases where we do not have
2214  // calls but do need a frame pointer (i.e. if we are requested to have one
2215  // because frame pointer elimination is disabled). To keep things simple we
2216  // only ever use S32 as the call ABI stack pointer, and so using it does not
2217  // imply we need a separate frame pointer.
2218  //
2219  // Try to use s32 as the SP, but move it if it would interfere with input
2220  // arguments. This won't work with calls though.
2221  //
2222  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2223  // registers.
2224  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2225  Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2226  } else {
2228 
2229  if (MFI.hasCalls())
2230  report_fatal_error("call in graphics shader with too many input SGPRs");
2231 
2232  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2233  if (!MRI.isLiveIn(Reg)) {
2234  Info.setStackPtrOffsetReg(Reg);
2235  break;
2236  }
2237  }
2238 
2239  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2240  report_fatal_error("failed to find register for SP");
2241  }
2242 
2243  // hasFP should be accurate for entry functions even before the frame is
2244  // finalized, because it does not rely on the known stack size, only
2245  // properties like whether variable sized objects are present.
2246  if (ST.getFrameLowering()->hasFP(MF)) {
2247  Info.setFrameOffsetReg(AMDGPU::SGPR33);
2248  }
2249 }
2250 
2253  return !Info->isEntryFunction();
2254 }
2255 
2257 
2258 }
2259 
2261  MachineBasicBlock *Entry,
2262  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2264 
2265  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2266  if (!IStart)
2267  return;
2268 
2269  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2270  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2271  MachineBasicBlock::iterator MBBI = Entry->begin();
2272  for (const MCPhysReg *I = IStart; *I; ++I) {
2273  const TargetRegisterClass *RC = nullptr;
2274  if (AMDGPU::SReg_64RegClass.contains(*I))
2275  RC = &AMDGPU::SGPR_64RegClass;
2276  else if (AMDGPU::SReg_32RegClass.contains(*I))
2277  RC = &AMDGPU::SGPR_32RegClass;
2278  else
2279  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2280 
2281  Register NewVR = MRI->createVirtualRegister(RC);
2282  // Create copy from CSR to a virtual register.
2283  Entry->addLiveIn(*I);
2284  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2285  .addReg(*I);
2286 
2287  // Insert the copy-back instructions right before the terminator.
2288  for (auto *Exit : Exits)
2289  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2290  TII->get(TargetOpcode::COPY), *I)
2291  .addReg(NewVR);
2292  }
2293 }
2294 
2296  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2297  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2298  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2300 
2301  MachineFunction &MF = DAG.getMachineFunction();
2302  const Function &Fn = MF.getFunction();
2303  FunctionType *FType = MF.getFunction().getFunctionType();
2305 
2306  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2307  DiagnosticInfoUnsupported NoGraphicsHSA(
2308  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2309  DAG.getContext()->diagnose(NoGraphicsHSA);
2310  return DAG.getEntryNode();
2311  }
2312 
2313  Info->allocateModuleLDSGlobal(Fn.getParent());
2314 
2317  BitVector Skipped(Ins.size());
2318  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2319  *DAG.getContext());
2320 
2321  bool IsGraphics = AMDGPU::isGraphics(CallConv);
2322  bool IsKernel = AMDGPU::isKernel(CallConv);
2323  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2324 
2325  if (IsGraphics) {
2326  assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2327  (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2328  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2329  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2330  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2331  !Info->hasWorkItemIDZ());
2332  }
2333 
2334  if (CallConv == CallingConv::AMDGPU_PS) {
2335  processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2336 
2337  // At least one interpolation mode must be enabled or else the GPU will
2338  // hang.
2339  //
2340  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2341  // set PSInputAddr, the user wants to enable some bits after compilation
2342  // based on run-time states. Since we can't know what the final PSInputEna
2343  // will look like, we shouldn't do anything here; the user takes
2344  // responsibility for the correct programming.
2345  //
2346  // Otherwise, the following restrictions apply:
2347  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2348  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2349  // enabled too.
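  //
  // In other words, the hang condition reduces to two mask tests on the low
  // PSInputAddr bits. A minimal sketch (the helper name is illustrative, not
  // part of this file):
  //
  //   static bool wouldHangWithoutInterp(unsigned PSInputAddr,
  //                                      bool PosWFloatAllocated) {
  //     bool NoInterpEnabled = (PSInputAddr & 0x7F) == 0;    // no PERSP_*/LINEAR_*
  //     bool PosWNeedsPersp =
  //         (PSInputAddr & 0xF) == 0 && PosWFloatAllocated;  // POS_W without PERSP_*
  //     return NoInterpEnabled || PosWNeedsPersp;
  //   }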
2350  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2351  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2352  CCInfo.AllocateReg(AMDGPU::VGPR0);
2353  CCInfo.AllocateReg(AMDGPU::VGPR1);
2354  Info->markPSInputAllocated(0);
2355  Info->markPSInputEnabled(0);
2356  }
2357  if (Subtarget->isAmdPalOS()) {
2358  // For isAmdPalOS, the user does not enable some bits after compilation
2359  // based on run-time states; the register values being generated here are
2360  // the final ones set in hardware. Therefore we need to apply the
2361  // workaround to PSInputAddr and PSInputEnable together. (The case where
2362  // a bit is set in PSInputAddr but not PSInputEnable is where the
2363  // frontend set up an input arg for a particular interpolation mode, but
2364  // nothing uses that input arg. Really we should have an earlier pass
2365  // that removes such an arg.)
2366  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2367  if ((PsInputBits & 0x7F) == 0 ||
2368  ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2369  Info->markPSInputEnabled(
2370  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2371  }
2372  } else if (IsKernel) {
2373  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2374  } else {
2375  Splits.append(Ins.begin(), Ins.end());
2376  }
2377 
2378  if (IsEntryFunc) {
2379  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2380  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2381  } else {
2382  // For the fixed ABI, pass workitem IDs in the last argument register.
2384  allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2385  }
2386 
2387  if (IsKernel) {
2389  } else {
2390  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2391  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2392  }
2393 
2394  SmallVector<SDValue, 16> Chains;
2395 
2396  // FIXME: This is the minimum kernel argument alignment. We should improve
2397  // this to the maximum alignment of the arguments.
2398  //
2399  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2400  // kern arg offset.
2401  const Align KernelArgBaseAlign = Align(16);
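  // Worked example of the per-argument alignment computed below (offsets are
  // illustrative): with the 16-byte base assumed here,
  //   commonAlignment(Align(16), /*Offset=*/0)  == Align(16)
  //   commonAlignment(Align(16), /*Offset=*/4)  == Align(4)
  //   commonAlignment(Align(16), /*Offset=*/32) == Align(16)
  // i.e. the alignment is capped by the largest power of two dividing Offset.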
2402 
2403  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2404  const ISD::InputArg &Arg = Ins[i];
2405  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2406  InVals.push_back(DAG.getUNDEF(Arg.VT));
2407  continue;
2408  }
2409 
2410  CCValAssign &VA = ArgLocs[ArgIdx++];
2411  MVT VT = VA.getLocVT();
2412 
2413  if (IsEntryFunc && VA.isMemLoc()) {
2414  VT = Ins[i].VT;
2415  EVT MemVT = VA.getLocVT();
2416 
2417  const uint64_t Offset = VA.getLocMemOffset();
2418  Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2419 
2420  if (Arg.Flags.isByRef()) {
2421  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2422 
2423  const GCNTargetMachine &TM =
2424  static_cast<const GCNTargetMachine &>(getTargetMachine());
2425  if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2426  Arg.Flags.getPointerAddrSpace())) {
2427  Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2428  Arg.Flags.getPointerAddrSpace());
2429  }
2430 
2431  InVals.push_back(Ptr);
2432  continue;
2433  }
2434 
2435  SDValue Arg = lowerKernargMemParameter(
2436  DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2437  Chains.push_back(Arg.getValue(1));
2438 
2439  auto *ParamTy =
2440  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2441  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2442  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2443  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2444  // On SI, local pointers are just offsets into LDS, so they always fit in
2445  // 16 bits. On CI and newer they could potentially be real pointers, so we
2446  // can't guarantee their size.
2447  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2448  DAG.getValueType(MVT::i16));
2449  }
2450 
2451  InVals.push_back(Arg);
2452  continue;
2453  } else if (!IsEntryFunc && VA.isMemLoc()) {
2454  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2455  InVals.push_back(Val);
2456  if (!Arg.Flags.isByVal())
2457  Chains.push_back(Val.getValue(1));
2458  continue;
2459  }
2460 
2461  assert(VA.isRegLoc() && "Parameter must be in a register!");
2462 
2463  Register Reg = VA.getLocReg();
2465  EVT ValVT = VA.getValVT();
2466 
2467  Reg = MF.addLiveIn(Reg, RC);
2468  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2469 
2470  if (Arg.Flags.isSRet()) {
2471  // The return object should be reasonably addressable.
2472 
2473  // FIXME: This helps when the return is a real sret. If it is an
2474  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2475  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2476  unsigned NumBits
2477  = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2478  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2479  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2480  }
2481 
2482  // If this is an 8 or 16-bit value, it is really passed promoted
2483  // to 32 bits. Insert an assert[sz]ext to capture this, then
2484  // truncate to the right size.
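  //
  // E.g. for the ZExt case below, an i16 value arriving in a 32-bit register
  // conceptually becomes (sketch of the nodes built by this switch):
  //
  //   Val = DAG.getNode(ISD::AssertZext, DL, MVT::i32, Val,
  //                     DAG.getValueType(MVT::i16));
  //   Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Val);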
2485  switch (VA.getLocInfo()) {
2486  case CCValAssign::Full:
2487  break;
2488  case CCValAssign::BCvt:
2489  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2490  break;
2491  case CCValAssign::SExt:
2492  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2493  DAG.getValueType(ValVT));
2494  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2495  break;
2496  case CCValAssign::ZExt:
2497  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2498  DAG.getValueType(ValVT));
2499  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2500  break;
2501  case CCValAssign::AExt:
2502  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2503  break;
2504  default:
2505  llvm_unreachable("Unknown loc info!");
2506  }
2507 
2508  InVals.push_back(Val);
2509  }
2510 
2511  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
2512  // Special inputs come after user arguments.
2513  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2514  }
2515 
2516  // Start adding system SGPRs.
2517  if (IsEntryFunc) {
2518  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2519  } else {
2520  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2521  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2522  }
2523 
2524  auto &ArgUsageInfo =
2526  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2527 
2528  unsigned StackArgSize = CCInfo.getNextStackOffset();
2529  Info->setBytesInStackArgArea(StackArgSize);
2530 
2531  return Chains.empty() ? Chain :
2532  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2533 }
2534 
2535 // TODO: If return values can't fit in registers, we should return as many as
2536 // possible in registers before passing on stack.
2538  CallingConv::ID CallConv,
2539  MachineFunction &MF, bool IsVarArg,
2540  const SmallVectorImpl<ISD::OutputArg> &Outs,
2541  LLVMContext &Context) const {
2542  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2543  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2544  // for shaders. Vector types should be explicitly handled by CC.
2545  if (AMDGPU::isEntryFunctionCC(CallConv))
2546  return true;
2547 
2549  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2550  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2551 }
2552 
2553 SDValue
2555  bool isVarArg,
2556  const SmallVectorImpl<ISD::OutputArg> &Outs,
2557  const SmallVectorImpl<SDValue> &OutVals,
2558  const SDLoc &DL, SelectionDAG &DAG) const {
2559  MachineFunction &MF = DAG.getMachineFunction();
2561 
2562  if (AMDGPU::isKernel(CallConv)) {
2563  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2564  OutVals, DL, DAG);
2565  }
2566 
2567  bool IsShader = AMDGPU::isShader(CallConv);
2568 
2569  Info->setIfReturnsVoid(Outs.empty());
2570  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2571 
2572  // CCValAssign - represents the assignment of the return value to a location.
2575 
2576  // CCState - Info about the registers and stack slots.
2577  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2578  *DAG.getContext());
2579 
2580  // Analyze outgoing return values.
2581  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2582 
2583  SDValue Flag;
2584  SmallVector<SDValue, 48> RetOps;
2585  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2586 
2587  // Add return address for callable functions.
2588  if (!Info->isEntryFunction()) {
2590  SDValue ReturnAddrReg = CreateLiveInRegister(
2591  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2592 
2593  SDValue ReturnAddrVirtualReg = DAG.getRegister(
2594  MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
2595  MVT::i64);
2596  Chain =
2597  DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
2598  Flag = Chain.getValue(1);
2599  RetOps.push_back(ReturnAddrVirtualReg);
2600  }
2601 
2602  // Copy the result values into the output registers.
2603  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2604  ++I, ++RealRVLocIdx) {
2605  CCValAssign &VA = RVLocs[I];
2606  assert(VA.isRegLoc() && "Can only return in registers!");
2607  // TODO: Partially return in registers if return values don't fit.
2608  SDValue Arg = OutVals[RealRVLocIdx];
2609 
2610  // Copied from other backends.
2611  switch (VA.getLocInfo()) {
2612  case CCValAssign::Full:
2613  break;
2614  case CCValAssign::BCvt:
2615  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2616  break;
2617  case CCValAssign::SExt:
2618  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2619  break;
2620  case CCValAssign::ZExt:
2621  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2622  break;
2623  case CCValAssign::AExt:
2624  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2625  break;
2626  default:
2627  llvm_unreachable("Unknown loc info!");
2628  }
2629 
2630  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2631  Flag = Chain.getValue(1);
2632  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2633  }
2634 
2635  // FIXME: Does sret work properly?
2636  if (!Info->isEntryFunction()) {
2637  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2638  const MCPhysReg *I =
2639  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2640  if (I) {
2641  for (; *I; ++I) {
2642  if (AMDGPU::SReg_64RegClass.contains(*I))
2643  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2644  else if (AMDGPU::SReg_32RegClass.contains(*I))
2645  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2646  else
2647  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2648  }
2649  }
2650  }
2651 
2652  // Update chain and glue.
2653  RetOps[0] = Chain;
2654  if (Flag.getNode())
2655  RetOps.push_back(Flag);
2656 
2657  unsigned Opc = AMDGPUISD::ENDPGM;
2658  if (!IsWaveEnd)
2660  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2661 }
2662 
2664  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2665  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2666  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2667  SDValue ThisVal) const {
2668  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2669 
2670  // Assign locations to each value returned by this call.
2672  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2673  *DAG.getContext());
2674  CCInfo.AnalyzeCallResult(Ins, RetCC);
2675 
2676  // Copy all of the result registers out of their specified physreg.
2677  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2678  CCValAssign VA = RVLocs[i];
2679  SDValue Val;
2680 
2681  if (VA.isRegLoc()) {
2682  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2683  Chain = Val.getValue(1);
2684  InFlag = Val.getValue(2);
2685  } else if (VA.isMemLoc()) {
2686  report_fatal_error("TODO: return values in memory");
2687  } else
2688  llvm_unreachable("unknown argument location type");
2689 
2690  switch (VA.getLocInfo()) {
2691  case CCValAssign::Full:
2692  break;
2693  case CCValAssign::BCvt:
2694  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2695  break;
2696  case CCValAssign::ZExt:
2697  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2698  DAG.getValueType(VA.getValVT()));
2699  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2700  break;
2701  case CCValAssign::SExt:
2702  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2703  DAG.getValueType(VA.getValVT()));
2704  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2705  break;
2706  case CCValAssign::AExt:
2707  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2708  break;
2709  default:
2710  llvm_unreachable("Unknown loc info!");
2711  }
2712 
2713  InVals.push_back(Val);
2714  }
2715 
2716  return Chain;
2717 }
2718 
2719 // Add code to pass the special inputs required by used features, separate
2720 // from the explicit user arguments present in the IR.
2722  CallLoweringInfo &CLI,
2723  CCState &CCInfo,
2724  const SIMachineFunctionInfo &Info,
2725  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2726  SmallVectorImpl<SDValue> &MemOpChains,
2727  SDValue Chain) const {
2728  // If we don't have a call site, this was a call inserted by
2729  // legalization. These can never use special inputs.
2730  if (!CLI.CB)
2731  return;
2732 
2733  SelectionDAG &DAG = CLI.DAG;
2734  const SDLoc &DL = CLI.DL;
2735 
2736  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2737  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2738 
2739  const AMDGPUFunctionArgInfo *CalleeArgInfo
2741  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2742  auto &ArgUsageInfo =
2744  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2745  }
2746 
2747  // TODO: Unify with private memory register handling. This is complicated by
2748  // the fact that at least in kernels, the input argument is not necessarily
2749  // in the same location as the input.
2758  };
2759 
2760  for (auto InputID : InputRegs) {
2761  const ArgDescriptor *OutgoingArg;
2762  const TargetRegisterClass *ArgRC;
2763  LLT ArgTy;
2764 
2765  std::tie(OutgoingArg, ArgRC, ArgTy) =
2766  CalleeArgInfo->getPreloadedValue(InputID);
2767  if (!OutgoingArg)
2768  continue;
2769 
2770  const ArgDescriptor *IncomingArg;
2771  const TargetRegisterClass *IncomingArgRC;
2772  LLT Ty;
2773  std::tie(IncomingArg, IncomingArgRC, Ty) =
2774  CallerArgInfo.getPreloadedValue(InputID);
2775  assert(IncomingArgRC == ArgRC);
2776 
2777  // All special arguments are ints for now.
2778  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2779  SDValue InputReg;
2780 
2781  if (IncomingArg) {
2782  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2783  } else {
2784  // The implicit arg ptr is special because it doesn't have a corresponding
2785  // input for kernels, and is computed from the kernarg segment pointer.
2787  InputReg = getImplicitArgPtr(DAG, DL);
2788  }
2789 
2790  if (OutgoingArg->isRegister()) {
2791  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2792  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2793  report_fatal_error("failed to allocate implicit input argument");
2794  } else {
2795  unsigned SpecialArgOffset =
2796  CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2797  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2798  SpecialArgOffset);
2799  MemOpChains.push_back(ArgStore);
2800  }
2801  }
2802 
2803  // Pack the workitem IDs into a single register, or pass them as-is if
2804  // already packed.
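  //
  // Packed layout in a single 32-bit register, as assembled below (sketch):
  //
  //   packed = X            // bits  [9:0]
  //          | (Y << 10)    // bits [19:10]
  //          | (Z << 20);   // bits [29:20]
  //
  // e.g. X = 3, Y = 1, Z = 2 packs to 0x00200403.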
2805  const ArgDescriptor *OutgoingArg;
2806  const TargetRegisterClass *ArgRC;
2807  LLT Ty;
2808 
2809  std::tie(OutgoingArg, ArgRC, Ty) =
2811  if (!OutgoingArg)
2812  std::tie(OutgoingArg, ArgRC, Ty) =
2814  if (!OutgoingArg)
2815  std::tie(OutgoingArg, ArgRC, Ty) =
2817  if (!OutgoingArg)
2818  return;
2819 
2820  const ArgDescriptor *IncomingArgX = std::get<0>(
2822  const ArgDescriptor *IncomingArgY = std::get<0>(
2824  const ArgDescriptor *IncomingArgZ = std::get<0>(
2826 
2827  SDValue InputReg;
2828  SDLoc SL;
2829 
2830  // If incoming ids are not packed we need to pack them.
2831  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
2832  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2833 
2834  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
2835  SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2836  Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2837  DAG.getShiftAmountConstant(10, MVT::i32, SL));
2838  InputReg = InputReg.getNode() ?
2839  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2840  }
2841 
2842  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
2843  SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2844  Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2845  DAG.getShiftAmountConstant(20, MVT::i32, SL));
2846  InputReg = InputReg.getNode() ?
2847  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2848  }
2849 
2850  if (!InputReg.getNode()) {
2851  // Workitem IDs are already packed; any of the present incoming arguments
2852  // will carry all required fields.
2854  IncomingArgX ? *IncomingArgX :
2855  IncomingArgY ? *IncomingArgY :
2856  *IncomingArgZ, ~0u);
2857  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2858  }
2859 
2860  if (OutgoingArg->isRegister()) {
2861  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2862  CCInfo.AllocateReg(OutgoingArg->getRegister());
2863  } else {
2864  unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2865  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2866  SpecialArgOffset);
2867  MemOpChains.push_back(ArgStore);
2868  }
2869 }
2870 
2872  return CC == CallingConv::Fast;
2873 }
2874 
2875 /// Return true if we might ever do TCO for calls with this calling convention.
2877  switch (CC) {
2878  case CallingConv::C:
2880  return true;
2881  default:
2882  return canGuaranteeTCO(CC);
2883  }
2884 }
2885 
2887  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2888  const SmallVectorImpl<ISD::OutputArg> &Outs,
2889  const SmallVectorImpl<SDValue> &OutVals,
2890  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2891  if (!mayTailCallThisCC(CalleeCC))
2892  return false;
2893 
2894  // For a divergent call target, we need to do a waterfall loop over the
2895  // possible callees which precludes us from using a simple jump.
2896  if (Callee->isDivergent())
2897  return false;
2898 
2899  MachineFunction &MF = DAG.getMachineFunction();
2900  const Function &CallerF = MF.getFunction();
2901  CallingConv::ID CallerCC = CallerF.getCallingConv();
2903  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2904 
2905  // Kernels aren't callable, and don't have a live in return address so it
2906  // doesn't make sense to do a tail call with entry functions.
2907  if (!CallerPreserved)
2908  return false;
2909 
2910  bool CCMatch = CallerCC == CalleeCC;
2911 
2913  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2914  return true;
2915  return false;
2916  }
2917 
2918  // TODO: Can we handle var args?
2919  if (IsVarArg)
2920  return false;
2921 
2922  for (const Argument &Arg : CallerF.args()) {
2923  if (Arg.hasByValAttr())
2924  return false;
2925  }
2926 
2927  LLVMContext &Ctx = *DAG.getContext();
2928 
2929  // Check that the call results are passed in the same way.
2930  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2931  CCAssignFnForCall(CalleeCC, IsVarArg),
2932  CCAssignFnForCall(CallerCC, IsVarArg)))
2933  return false;
2934 
2935  // The callee has to preserve all registers the caller needs to preserve.
2936  if (!CCMatch) {
2937  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2938  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2939  return false;
2940  }
2941 
2942  // Nothing more to check if the callee is taking no arguments.
2943  if (Outs.empty())
2944  return true;
2945 
2947  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2948 
2949  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2950 
2951  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2952  // If the stack arguments for this call do not fit into our own save area then
2953  // the call cannot be made tail.
2954  // TODO: Is this really necessary?
2955  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2956  return false;
2957 
2958  const MachineRegisterInfo &MRI = MF.getRegInfo();
2959  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2960 }
2961 
2963  if (!CI->isTailCall())
2964  return false;
2965 
2966  const Function *ParentFn = CI->getParent()->getParent();
2967  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2968  return false;
2969  return true;
2970 }
2971 
2972 // The wave scratch offset register is used as the global base pointer.
2974  SmallVectorImpl<SDValue> &InVals) const {
2975  SelectionDAG &DAG = CLI.DAG;
2976  const SDLoc &DL = CLI.DL;
2978  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2980  SDValue Chain = CLI.Chain;
2981  SDValue Callee = CLI.Callee;
2982  bool &IsTailCall = CLI.IsTailCall;
2983  CallingConv::ID CallConv = CLI.CallConv;
2984  bool IsVarArg = CLI.IsVarArg;
2985  bool IsSibCall = false;
2986  bool IsThisReturn = false;
2987  MachineFunction &MF = DAG.getMachineFunction();
2988 
2989  if (Callee.isUndef() || isNullConstant(Callee)) {
2990  if (!CLI.IsTailCall) {
2991  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
2992  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
2993  }
2994 
2995  return Chain;
2996  }
2997 
2998  if (IsVarArg) {
2999  return lowerUnhandledCall(CLI, InVals,
3000  "unsupported call to variadic function ");
3001  }
3002 
3003  if (!CLI.CB)
3004  report_fatal_error("unsupported libcall legalization");
3005 
3006  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3007  return lowerUnhandledCall(CLI, InVals,
3008  "unsupported required tail call to function ");
3009  }
3010 
3011  if (AMDGPU::isShader(CallConv)) {
3012  // Note the issue is with the CC of the called function, not of the call
3013  // itself.
3014  return lowerUnhandledCall(CLI, InVals,
3015  "unsupported call to a shader function ");
3016  }
3017 
3019  CallConv != CallingConv::AMDGPU_Gfx) {
3020  // Only allow calls with specific calling conventions.
3021  return lowerUnhandledCall(CLI, InVals,
3022  "unsupported calling convention for call from "
3023  "graphics shader of function ");
3024  }
3025 
3026  if (IsTailCall) {
3027  IsTailCall = isEligibleForTailCallOptimization(
3028  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3029  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3030  report_fatal_error("failed to perform tail call elimination on a call "
3031  "site marked musttail");
3032  }
3033 
3034  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3035 
3036  // A sibling call is one where we're under the usual C ABI and not planning
3037  // to change that but can still do a tail call:
3038  if (!TailCallOpt && IsTailCall)
3039  IsSibCall = true;
3040 
3041  if (IsTailCall)
3042  ++NumTailCalls;
3043  }
3044 
3047  SmallVector<SDValue, 8> MemOpChains;
3048 
3049  // Analyze operands of the call, assigning locations to each operand.
3051  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3052  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3053 
3055  CallConv != CallingConv::AMDGPU_Gfx) {
3056  // With a fixed ABI, allocate fixed registers before user arguments.
3057  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3058  }
3059 
3060  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3061 
3062  // Get a count of how many bytes are to be pushed on the stack.
3063  unsigned NumBytes = CCInfo.getNextStackOffset();
3064 
3065  if (IsSibCall) {
3066  // Since we're not changing the ABI to make this a tail call, the memory
3067  // operands are already available in the caller's incoming argument space.
3068  NumBytes = 0;
3069  }
3070 
3071  // FPDiff is the byte offset of the call's argument area from the callee's.
3072  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3073  // by this amount for a tail call. In a sibling call it must be 0 because the
3074  // caller will deallocate the entire stack and the callee still expects its
3075  // arguments to begin at SP+0. Completely unused for non-tail calls.
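  // Illustrative arithmetic only: if FPDiff were 16, an outgoing argument the
  // callee expects at offset 8 would be stored to a fixed stack object at
  // offset 8 + 16 = 24, matching the "Offset + FPDiff" store path below.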
3076  int32_t FPDiff = 0;
3077  MachineFrameInfo &MFI = MF.getFrameInfo();
3078 
3079  // Adjust the stack pointer for the new arguments...
3080  // These operations are automatically eliminated by the prolog/epilog pass
3081  if (!IsSibCall) {
3082  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3083 
3084  if (!Subtarget->enableFlatScratch()) {
3085  SmallVector<SDValue, 4> CopyFromChains;
3086 
3087  // In the HSA case, this should be an identity copy.
3088  SDValue ScratchRSrcReg
3089  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3090  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3091  CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3092  Chain = DAG.getTokenFactor(DL, CopyFromChains);
3093  }
3094  }
3095 
3096  MVT PtrVT = MVT::i32;
3097 
3098  // Walk the register/memloc assignments, inserting copies/loads.
3099  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3100  CCValAssign &VA = ArgLocs[i];
3101  SDValue Arg = OutVals[i];
3102 
3103  // Promote the value if needed.
3104  switch (VA.getLocInfo()) {
3105  case CCValAssign::Full:
3106  break;
3107  case CCValAssign::BCvt:
3108  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3109  break;
3110  case CCValAssign::ZExt:
3111  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3112  break;
3113  case CCValAssign::SExt:
3114  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3115  break;
3116  case CCValAssign::AExt:
3117  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3118  break;
3119  case CCValAssign::FPExt:
3120  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3121  break;
3122  default:
3123  llvm_unreachable("Unknown loc info!");
3124  }
3125 
3126  if (VA.isRegLoc()) {
3127  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3128  } else {
3129  assert(VA.isMemLoc());
3130 
3131  SDValue DstAddr;
3132  MachinePointerInfo DstInfo;
3133 
3134  unsigned LocMemOffset = VA.getLocMemOffset();
3135  int32_t Offset = LocMemOffset;
3136 
3137  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3138  MaybeAlign Alignment;
3139 
3140  if (IsTailCall) {
3141  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3142  unsigned OpSize = Flags.isByVal() ?
3143  Flags.getByValSize() : VA.getValVT().getStoreSize();
3144 
3145  // FIXME: We can have better than the minimum byval required alignment.
3146  Alignment =
3147  Flags.isByVal()
3148  ? Flags.getNonZeroByValAlign()
3149  : commonAlignment(Subtarget->getStackAlignment(), Offset);
3150 
3151  Offset = Offset + FPDiff;
3152  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3153 
3154  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3155  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3156 
3157  // Make sure any stack arguments overlapping with where we're storing
3158  // are loaded before this eventual operation. Otherwise they'll be
3159  // clobbered.
3160 
3161  // FIXME: Why is this really necessary? This seems to just result in a
3162  // lot of code to copy the stack arguments and write them back to the
3163  // same locations, which are supposed to be immutable?
3164  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3165  } else {
3166  // Stores to the argument stack area are relative to the stack pointer.
3167  SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3168  MVT::i32);
3169  DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3170  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3171  Alignment =
3172  commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3173  }
3174 
3175  if (Outs[i].Flags.isByVal()) {
3176  SDValue SizeNode =
3177  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3178  SDValue Cpy =
3179  DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3180  Outs[i].Flags.getNonZeroByValAlign(),
3181  /*isVol = */ false, /*AlwaysInline = */ true,
3182  /*isTailCall = */ false, DstInfo,
3184 
3185  MemOpChains.push_back(Cpy);
3186  } else {
3187  SDValue Store =
3188  DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3189  MemOpChains.push_back(Store);
3190  }
3191  }
3192  }
3193 
3195  CallConv != CallingConv::AMDGPU_Gfx) {
3196  // Copy special input registers after user input arguments.
3197  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3198  }
3199 
3200  if (!MemOpChains.empty())
3201  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3202 
3203  // Build a sequence of copy-to-reg nodes chained together with token chain
3204  // and flag operands which copy the outgoing args into the appropriate regs.
3205  SDValue InFlag;
3206  for (auto &RegToPass : RegsToPass) {
3207  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3208  RegToPass.second, InFlag);
3209  InFlag = Chain.getValue(1);
3210  }
3211 
3212 
3213  SDValue PhysReturnAddrReg;
3214  if (IsTailCall) {
3215  // Since the return is being combined with the call, we need to pass on the
3216  // return address.
3217 
3219  SDValue ReturnAddrReg = CreateLiveInRegister(
3220  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
3221 
3222  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
3223  MVT::i64);
3224  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
3225  InFlag = Chain.getValue(1);
3226  }
3227 
3228  // We don't usually want to end the call-sequence here because we would tidy
3229  // the frame up *after* the call. However, in the ABI-changing tail-call case
3230  // we've carefully laid out the parameters so that when SP is reset they'll be
3231  // in the correct location.
3232  if (IsTailCall && !IsSibCall) {
3233  Chain = DAG.getCALLSEQ_END(Chain,
3234  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
3235  DAG.getTargetConstant(0, DL, MVT::i32),
3236  InFlag, DL);
3237  InFlag = Chain.getValue(1);
3238  }
3239 
3240  std::vector<SDValue> Ops;
3241  Ops.push_back(Chain);
3242  Ops.push_back(Callee);
3243  // Add a redundant copy of the callee global which will not be legalized, as
3244  // we need direct access to the callee later.
3245  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3246  const GlobalValue *GV = GSD->getGlobal();
3247  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3248  } else {
3249  Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3250  }
3251 
3252  if (IsTailCall) {
3253  // Each tail call may have to adjust the stack by a different amount, so
3254  // this information must travel along with the operation for eventual
3255  // consumption by emitEpilogue.
3256  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3257 
3258  Ops.push_back(PhysReturnAddrReg);
3259  }
3260 
3261  // Add argument registers to the end of the list so that they are known live
3262  // into the call.
3263  for (auto &RegToPass : RegsToPass) {
3264  Ops.push_back(DAG.getRegister(RegToPass.first,
3265  RegToPass.second.getValueType()));
3266  }
3267 
3268  // Add a register mask operand representing the call-preserved registers.
3269 
3270  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3271  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3272  assert(Mask && "Missing call preserved mask for calling convention");
3273  Ops.push_back(DAG.getRegisterMask(Mask));
3274 
3275  if (InFlag.getNode())
3276  Ops.push_back(InFlag);
3277 
3278  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3279 
3280  // If we're doing a tail call, use a TC_RETURN here rather than an
3281  // actual call instruction.
3282  if (IsTailCall) {
3283  MFI.setHasTailCall();
3284  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3285  }
3286 
3287  // Returns a chain and a flag for retval copy to use.
3288  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3289  Chain = Call.getValue(0);
3290  InFlag = Call.getValue(1);
3291 
3292  uint64_t CalleePopBytes = NumBytes;
3293  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
3294  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
3295  InFlag, DL);
3296  if (!Ins.empty())
3297  InFlag = Chain.getValue(1);
3298 
3299  // Handle result values, copying them out of physregs into vregs that we
3300  // return.
3301  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3302  InVals, IsThisReturn,
3303  IsThisReturn ? OutVals[0] : SDValue());
3304 }
3305 
3306 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3307 // except for applying the wave size scale to the increment amount.
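// A numeric sketch of that scaling: the stack pointer here tracks the whole
// wave's scratch in bytes, so the per-lane allocation size is shifted left by
// log2(wavefront size) before it is added to or subtracted from SP:
//
//   // e.g. 16 bytes per lane on a wave64 subtarget:
//   uint64_t PerLaneBytes = 16;
//   uint64_t SPIncrement  = PerLaneBytes << 6;  // 16 << 6 == 1024 bytes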
3309  SDValue Op, SelectionDAG &DAG) const {
3310  const MachineFunction &MF = DAG.getMachineFunction();
3312 
3313  SDLoc dl(Op);
3314  EVT VT = Op.getValueType();
3315  SDValue Tmp1 = Op;
3316  SDValue Tmp2 = Op.getValue(1);
3317  SDValue Tmp3 = Op.getOperand(2);
3318  SDValue Chain = Tmp1.getOperand(0);
3319 
3320  Register SPReg = Info->getStackPtrOffsetReg();
3321 
3322  // Chain the dynamic stack allocation so that it doesn't modify the stack
3323  // pointer when other instructions are using the stack.
3324  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3325 
3326  SDValue Size = Tmp2.getOperand(1);
3327  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3328  Chain = SP.getValue(1);
3329  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3330  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3331  const TargetFrameLowering *TFL = ST.getFrameLowering();
3332  unsigned Opc =
3333  TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3334  ISD::ADD : ISD::SUB;
3335 
3336  SDValue ScaledSize = DAG.getNode(
3337  ISD::SHL, dl, VT, Size,
3338  DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3339 
3340  Align StackAlign = TFL->getStackAlign();
3341  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3342  if (Alignment && *Alignment > StackAlign) {
3343  Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3344  DAG.getConstant(-(uint64_t)Alignment->value()
3345  << ST.getWavefrontSizeLog2(),
3346  dl, VT));
3347  }
3348 
3349  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3350  Tmp2 = DAG.getCALLSEQ_END(
3351  Chain, DAG.getIntPtrConstant(0, dl, true),
3352  DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
3353 
3354  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3355 }
3356 
3358  SelectionDAG &DAG) const {
3359  // We only handle constant sizes here to allow non-entry block, static sized
3360  // allocas. A truly dynamic value is more difficult to support because we
3361  // don't know if the size value is uniform or not. If the size isn't uniform,
3362  // we would need to do a wave reduction to get the maximum size to know how
3363  // much to increment the uniform stack pointer.
3364  SDValue Size = Op.getOperand(1);
3365  if (isa<ConstantSDNode>(Size))
3366  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3367 
3369 }
3370 
3372  const MachineFunction &MF) const {
3374  .Case("m0", AMDGPU::M0)
3375  .Case("exec", AMDGPU::EXEC)
3376  .Case("exec_lo", AMDGPU::EXEC_LO)
3377  .Case("exec_hi", AMDGPU::EXEC_HI)
3378  .Case("flat_scratch", AMDGPU::FLAT_SCR)
3379  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3380  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3381  .Default(Register());
3382 
3383  if (Reg == AMDGPU::NoRegister) {
3384  report_fatal_error(Twine("invalid register name \""
3385  + StringRef(RegName) + "\"."));
3386 
3387  }
3388 
3389  if (!Subtarget->hasFlatScrRegister() &&
3390  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3391  report_fatal_error(Twine("invalid register \""
3392  + StringRef(RegName) + "\" for subtarget."));
3393  }
3394 
3395  switch (Reg) {
3396  case AMDGPU::M0:
3397  case AMDGPU::EXEC_LO:
3398  case AMDGPU::EXEC_HI:
3399  case AMDGPU::FLAT_SCR_LO:
3400  case AMDGPU::FLAT_SCR_HI:
3401  if (VT.getSizeInBits() == 32)
3402  return Reg;
3403  break;
3404  case AMDGPU::EXEC:
3405  case AMDGPU::FLAT_SCR:
3406  if (VT.getSizeInBits() == 64)
3407  return Reg;
3408  break;
3409  default:
3410  llvm_unreachable("missing register type checking");
3411  }
3412 
3413  report_fatal_error(Twine("invalid type for register \""
3414  + StringRef(RegName) + "\"."));
3415 }
3416 
3417 // If kill is not the last instruction, split the block so kill is always a
3418 // proper terminator.
3421  MachineBasicBlock *BB) const {
3422  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3423  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3424  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3425  return SplitBB;
3426 }
3427 
3428 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3429 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3430 // be the first instruction in the remainder block.
3431 //
3432 /// \returns { LoopBody, Remainder }
3433 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3435  MachineFunction *MF = MBB.getParent();
3437 
3438  // To insert the loop we need to split the block. Move everything after this
3439  // point to a new block, and insert a new empty block between the two.
3441  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3443  ++MBBI;
3444 
3445  MF->insert(MBBI, LoopBB);
3446  MF->insert(MBBI, RemainderBB);
3447 
3448  LoopBB->addSuccessor(LoopBB);
3449  LoopBB->addSuccessor(RemainderBB);
3450 
3451  // Move the rest of the block into a new block.
3452  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3453 
3454  if (InstInLoop) {
3455  auto Next = std::next(I);
3456 
3457  // Move instruction to loop body.
3458  LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3459 
3460  // Move the rest of the block.
3461  RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3462  } else {
3463  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3464  }
3465 
3466  MBB.addSuccessor(LoopBB);
3467 
3468  return std::make_pair(LoopBB, RemainderBB);
3469 }
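// Resulting control flow from splitBlockForLoop (sketch), with InstInLoop set:
//
//   MBB (everything before MI) --> LoopBB (just MI) --> RemainderBB (the rest)
//                                    ^       |
//                                    +-------+   LoopBB also loops back to itself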
3470 
3471 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3473  MachineBasicBlock *MBB = MI.getParent();
3474  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3475  auto I = MI.getIterator();
3476  auto E = std::next(I);
3477 
3478  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3479  .addImm(0);
3480 
3481  MIBundleBuilder Bundler(*MBB, I, E);
3482  finalizeBundle(*MBB, Bundler.begin());
3483 }
3484 
3487  MachineBasicBlock *BB) const {
3488  const DebugLoc &DL = MI.getDebugLoc();
3489 
3490  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3491 
3492  MachineBasicBlock *LoopBB;
3493  MachineBasicBlock *RemainderBB;
3494  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3495 
3496  // Apparently kill flags are only valid if the def is in the same block?
3497  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3498  Src->setIsKill(false);
3499 
3500  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3501 
3502  MachineBasicBlock::iterator I = LoopBB->end();
3503 
3504  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3506 
3507  // Clear TRAP_STS.MEM_VIOL
3508  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3509  .addImm(0)
3510  .addImm(EncodedReg);
3511 
3513 
3514  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3515 
3516  // Load and check TRAP_STS.MEM_VIOL
3517  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3518  .addImm(EncodedReg);
3519 
3520  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3521  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3523  .addImm(0);
3524  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3525  .addMBB(LoopBB);
3526 
3527  return RemainderBB;
3528 }
3529 
3530 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3531 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3532 // will only do one iteration. In the worst case, this will loop 64 times.
3533 //
3534 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
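//
// Pseudocode sketch of the waterfall loop emitted below:
//
//   do {
//     s_idx   = readfirstlane(v_idx);   // pick one lane's index value
//     matches = (v_idx == s_idx);       // every lane that uses that index
//     saved   = exec;
//     exec   &= matches;                // run only the matching lanes
//     ... indirect move using s_idx (M0 or GPR-idx mode) ...
//     exec    = saved & ~matches;       // retire the lanes just handled
//   } while (exec != 0);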
3537  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3538  const DebugLoc &DL, const MachineOperand &Idx,
3539  unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3540  unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3541  Register &SGPRIdxReg) {
3542 
3543  MachineFunction *MF = OrigBB.getParent();
3544  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3545  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3546  MachineBasicBlock::iterator I = LoopBB.begin();
3547 
3548  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3549  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3550  Register NewExec = MRI.createVirtualRegister(BoolRC);
3551  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3552  Register CondReg = MRI.createVirtualRegister(BoolRC);
3553 
3554  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3555  .addReg(InitReg)
3556  .addMBB(&OrigBB)
3557  .addReg(ResultReg)
3558  .addMBB(&LoopBB);
3559 
3560  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3561  .addReg(InitSaveExecReg)
3562  .addMBB(&OrigBB)
3563  .addReg(NewExec)
3564  .addMBB(&LoopBB);
3565 
3566  // Read the next variant <- also loop target.
3567  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3568  .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3569 
3570  // Compare the just read M0 value to all possible Idx values.
3571  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3572  .addReg(CurrentIdxReg)
3573  .addReg(Idx.getReg(), 0, Idx.getSubReg());
3574 
3575  // Update EXEC, save the original EXEC value to VCC.
3576  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3577  : AMDGPU::S_AND_SAVEEXEC_B64),
3578  NewExec)
3579  .addReg(CondReg, RegState::Kill);
3580 
3581  MRI.setSimpleHint(NewExec, CondReg);
3582 
3583  if (UseGPRIdxMode) {
3584  if (Offset == 0) {
3585  SGPRIdxReg = CurrentIdxReg;
3586  } else {
3587  SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3588  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3589  .addReg(CurrentIdxReg, RegState::Kill)
3590  .addImm(Offset);
3591  }
3592  } else {
3593  // Move index from VCC into M0
3594  if (Offset == 0) {
3595  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3596  .addReg(CurrentIdxReg, RegState::Kill);
3597  } else {
3598  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3599  .addReg(CurrentIdxReg, RegState::Kill)
3600  .addImm(Offset);
3601  }
3602  }
3603 
3604  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3605  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3606  MachineInstr *InsertPt =
3607  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3608  : AMDGPU::S_XOR_B64_term), Exec)
3609  .addReg(Exec)
3610  .addReg(NewExec);
3611 
3612  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3613  // s_cbranch_scc0?
3614 
3615  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3616  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3617  .addMBB(&LoopBB);
3618 
3619  return InsertPt->getIterator();
3620 }
3621 
3622 // This has slightly sub-optimal register allocation when the source vector is
3623 // killed by the read. The register allocator does not understand that the kill
3624 // is per-workitem, so the vector is kept alive for the whole loop, and we end
3625 // up not re-using a subregister from it, using one more VGPR than necessary.
3626 // That extra VGPR was saved back when this was expanded after register allocation.
3629  unsigned InitResultReg, unsigned PhiReg, int Offset,
3630  bool UseGPRIdxMode, Register &SGPRIdxReg) {
3631  MachineFunction *MF = MBB.getParent();
3632  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3633  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3635  const DebugLoc &DL = MI.getDebugLoc();
3637 
3638  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3639  Register DstReg = MI.getOperand(0).getReg();
3640  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3641  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3642  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3643  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3644 
3645  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3646 
3647  // Save the EXEC mask
3648  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3649  .addReg(Exec);
3650 
3651  MachineBasicBlock *LoopBB;
3652  MachineBasicBlock *RemainderBB;
3653  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3654 
3655  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3656 
3657  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3658  InitResultReg, DstReg, PhiReg, TmpExec,
3659  Offset, UseGPRIdxMode, SGPRIdxReg);
3660 
3661  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3663  ++MBBI;
3664  MF->insert(MBBI, LandingPad);
3665  LoopBB->removeSuccessor(RemainderBB);
3666  LandingPad->addSuccessor(RemainderBB);
3667  LoopBB->addSuccessor(LandingPad);
3668  MachineBasicBlock::iterator First = LandingPad->begin();
3669  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3670  .addReg(SaveExec);
3671 
3672  return InsPt;
3673 }
3674 
3675 // Returns subreg index, offset
3676 static std::pair<unsigned, int>
3678  const TargetRegisterClass *SuperRC,
3679  unsigned VecReg,
3680  int Offset) {
3681  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3682 
3683  // Skip out of bounds offsets, or else we would end up using an undefined
3684  // register.
3685  if (Offset >= NumElts || Offset < 0)
3686  return std::make_pair(AMDGPU::sub0, Offset);
3687 
3688  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3689 }
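// Usage sketch for computeIndirectRegAndOffset (results follow directly from
// the logic above; the register and class names are illustrative). For a
// 128-bit (4 x 32-bit) vector register class:
//
//   computeIndirectRegAndOffset(TRI, VecRC, Reg, 2)  -> { AMDGPU::sub2, 0 }
//   computeIndirectRegAndOffset(TRI, VecRC, Reg, 7)  -> { AMDGPU::sub0, 7 }   // out of bounds
//   computeIndirectRegAndOffset(TRI, VecRC, Reg, -1) -> { AMDGPU::sub0, -1 }  // out of bounds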
3690 
3693  int Offset) {
3694  MachineBasicBlock *MBB = MI.getParent();
3695  const DebugLoc &DL = MI.getDebugLoc();
3697 
3698  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3699 
3700  assert(Idx->getReg() != AMDGPU::NoRegister);
3701 
3702  if (Offset == 0) {
3703  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3704  } else {
3705  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3706  .add(*Idx)
3707  .addImm(Offset);
3708  }
3709 }
3710 
3713  int Offset) {
3714  MachineBasicBlock *MBB = MI.getParent();
3715  const DebugLoc &DL = MI.getDebugLoc();
3717 
3718  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3719 
3720  if (Offset == 0)
3721  return Idx->getReg();
3722 
3723  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3724  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3725  .add(*Idx)
3726  .addImm(Offset);
3727  return Tmp;
3728 }
3729 
3732  const GCNSubtarget &ST) {
3733  const SIInstrInfo *TII = ST.getInstrInfo();
3734  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3735  MachineFunction *MF = MBB.getParent();
3737 
3738  Register Dst = MI.getOperand(0).getReg();
3739  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3740  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3741  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3742 
3743  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3744  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3745 
3746  unsigned SubReg;
3747  std::tie(SubReg, Offset)
3748  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3749 
3750  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3751 
3752  // Check for a SGPR index.
3753  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3755  const DebugLoc &DL = MI.getDebugLoc();
3756 
3757  if (UseGPRIdxMode) {
3758  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3759  // to avoid interfering with other uses, so probably requires a new
3760  // optimization pass.
3762 
3763  const MCInstrDesc &GPRIDXDesc =
3764  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3765  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3766  .addReg(SrcReg)
3767  .addReg(Idx)
3768  .addImm(SubReg);
3769  } else {
3771 
3772  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3773  .addReg(SrcReg, 0, SubReg)
3774  .addReg(SrcReg, RegState::Implicit);
3775  }
3776 
3777  MI.eraseFromParent();
3778 
3779  return &MBB;
3780  }
3781 
3782  // Control flow needs to be inserted if indexing with a VGPR.
3783  const DebugLoc &DL = MI.getDebugLoc();
3785 
3786  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3787  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3788 
3789  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3790 
3791  Register SGPRIdxReg;
3792  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3793  UseGPRIdxMode, SGPRIdxReg);
3794 
3795  MachineBasicBlock *LoopBB = InsPt->getParent();
3796 
3797  if (UseGPRIdxMode) {
3798  const MCInstrDesc &GPRIDXDesc =
3799  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3800 
3801  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3802  .addReg(SrcReg)
3803  .addReg(SGPRIdxReg)
3804  .addImm(SubReg);
3805  } else {
3806  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3807  .addReg(SrcReg, 0, SubReg)
3808  .addReg(SrcReg, RegState::Implicit);
3809  }
3810 
3811  MI.eraseFromParent();
3812 
3813  return LoopBB;
3814 }
3815 
3818  const GCNSubtarget &ST) {
3819  const SIInstrInfo *TII = ST.getInstrInfo();
3820  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3821  MachineFunction *MF = MBB.getParent();
3823 
3824  Register Dst = MI.getOperand(0).getReg();
3825  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3826  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3827  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3828  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3829  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3830  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3831 
3832  // This can be an immediate, but will be folded later.
3833  assert(Val->getReg());
3834 
3835  unsigned SubReg;
3836  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3837  SrcVec->getReg(),
3838  Offset);
3839  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3840 
3841  if (Idx->getReg() == AMDGPU::NoRegister) {
3843  const DebugLoc &DL = MI.getDebugLoc();
3844 
3845  assert(Offset == 0);
3846 
3847  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3848  .add(*SrcVec)
3849  .add(*Val)
3850  .addImm(SubReg);
3851 
3852  MI.eraseFromParent();
3853  return &MBB;
3854  }
3855 
3856  // Check for a SGPR index.
3857  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3859  const DebugLoc &DL = MI.getDebugLoc();
3860 
3861  if (UseGPRIdxMode) {
3863 
3864  const MCInstrDesc &GPRIDXDesc =
3865  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3866  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3867  .addReg(SrcVec->getReg())
3868  .add(*Val)
3869  .addReg(Idx)
3870  .addImm(SubReg);
3871  } else {
3873 
3874  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3875  TRI.getRegSizeInBits(*VecRC), 32, false);
3876  BuildMI(MBB, I, DL, MovRelDesc, Dst)
3877  .addReg(SrcVec->getReg())
3878  .add(*Val)
3879  .addImm(SubReg);
3880  }
3881  MI.eraseFromParent();
3882  return &MBB;
3883  }
3884 
3885  // Control flow needs to be inserted if indexing with a VGPR.
3886  if (Val->isReg())
3887  MRI.clearKillFlags(Val->getReg());
3888 
3889  const DebugLoc &DL = MI.getDebugLoc();
3890 
3891  Register PhiReg = MRI.createVirtualRegister(VecRC);
3892 
3893  Register SGPRIdxReg;
3894  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3895  UseGPRIdxMode, SGPRIdxReg);
3896  MachineBasicBlock *LoopBB = InsPt->getParent();
3897 
3898  if (UseGPRIdxMode) {
3899  const MCInstrDesc &GPRIDXDesc =
3900  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3901 
3902  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3903  .addReg(PhiReg)
3904  .add(*Val)
3905  .addReg(SGPRIdxReg)
3906  .addImm(AMDGPU::sub0);
3907  } else {
3908  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3909  TRI.getRegSizeInBits(*VecRC), 32, false);
3910  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3911  .addReg(PhiReg)
3912  .add(*Val)
3913  .addImm(AMDGPU::sub0);
3914  }
3915 
3916  MI.eraseFromParent();
3917  return LoopBB;
3918 }
3919 
3921  MachineInstr &MI, MachineBasicBlock *BB) const {
3922 
3923  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3924  MachineFunction *MF = BB->getParent();
3926 
3927  switch (MI.getOpcode()) {
3928  case AMDGPU::S_UADDO_PSEUDO:
3929  case AMDGPU::S_USUBO_PSEUDO: {
3930  const DebugLoc &DL = MI.getDebugLoc();
3931  MachineOperand &Dest0 = MI.getOperand(0);
3932  MachineOperand &Dest1 = MI.getOperand(1);
3933  MachineOperand &Src0 = MI.getOperand(2);
3934  MachineOperand &Src1 = MI.getOperand(3);
3935 
3936  unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3937  ? AMDGPU::S_ADD_I32
3938  : AMDGPU::S_SUB_I32;
3939  BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3940 
3941  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
3942  .addImm(1)
3943  .addImm(0);
3944 
3945  MI.eraseFromParent();
3946  return BB;
3947  }
3948  case AMDGPU::S_ADD_U64_PSEUDO:
3949  case AMDGPU::S_SUB_U64_PSEUDO: {
3950  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3951  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3952  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3953  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3954  const DebugLoc &DL = MI.getDebugLoc();
3955 
3956  MachineOperand &Dest = MI.getOperand(0);
3957  MachineOperand &Src0 = MI.getOperand(1);
3958  MachineOperand &Src1 = MI.getOperand(2);
3959 
3960  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3961  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3962 
3963  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
3964  MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3965  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
3966  MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3967 
3968  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
3969  MI<