SIISelLowering.cpp
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/Statistic.h"
22 #include "llvm/BinaryFormat/ELF.h"
23 #include "llvm/CodeGen/Analysis.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
31 #include "llvm/Support/KnownBits.h"
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "si-lower"
36 
37 STATISTIC(NumTailCalls, "Number of tail calls");
38 
40  "amdgpu-disable-loop-alignment",
41  cl::desc("Do not align and prefetch loops"),
42  cl::init(false));
43 
45  "amdgpu-reserve-vgpr-for-sgpr-spill",
46  cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
47 
49  "amdgpu-use-divergent-register-indexing",
50  cl::Hidden,
51  cl::desc("Use indirect register addressing for divergent indexes"),
52  cl::init(false));
53 
54 static bool hasFP32Denormals(const MachineFunction &MF) {
56  return Info->getMode().allFP32Denormals();
57 }
58 
59 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
61  return Info->getMode().allFP64FP16Denormals();
62 }
63 
64 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
65  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
66  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
67  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
68  return AMDGPU::SGPR0 + Reg;
69  }
70  }
71  llvm_unreachable("Cannot allocate sgpr");
72 }
73 
75  const GCNSubtarget &STI)
76  : AMDGPUTargetLowering(TM, STI),
77  Subtarget(&STI) {
78  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
79  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
80 
81  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
82  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
83 
84  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
85 
86  const SIRegisterInfo *TRI = STI.getRegisterInfo();
87  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
88 
89  addRegisterClass(MVT::f64, V64RegClass);
90  addRegisterClass(MVT::v2f32, V64RegClass);
91 
92  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
93  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
94 
95  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
96  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
97 
98  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
99  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
100 
101  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
102  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
103 
104  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
105  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
106 
107  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
108  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
109 
110  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
111  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
112 
113  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
114  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
115 
116  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
117  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
118 
119  if (Subtarget->has16BitInsts()) {
120  addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
121  addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
122 
123  // Unless there are also VOP3P operations, no operations are really legal.
124  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
125  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
126  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
127  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
128  }
129 
130  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
131  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
132 
134 
135  // The boolean content concept here is too inflexible. Compares only ever
136  // really produce a 1-bit result. Any copy/extend from these will turn into a
137  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
138  // it's what most targets use.
141 
142  // We need to custom lower vector stores from local memory
151 
160 
177 
183 
186 
191 
197 
202 
211 
220 
227 
230 
233 
237 
238 #if 0
241 #endif
242 
243  // We only support LOAD/STORE and vector manipulation ops for vectors
244  // with > 4 elements.
249  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
250  switch (Op) {
251  case ISD::LOAD:
252  case ISD::STORE:
253  case ISD::BUILD_VECTOR:
254  case ISD::BITCAST:
260  break;
261  case ISD::CONCAT_VECTORS:
263  break;
264  default:
266  break;
267  }
268  }
269  }
270 
272 
273  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
274  // is expanded to avoid having two separate loops in case the index is a VGPR.
275 
276  // Most operations are naturally 32-bit vector operations. We only support
277  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
278  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
281 
284 
287 
290  }
291 
292  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
295 
298 
301 
304  }
305 
306  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
309 
312 
315 
318  }
319 
320  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
323 
326 
329 
332  }
333 
338 
341 
342  // Avoid stack access for these.
343  // TODO: Generalize to more vector types.
348 
354 
358 
363 
364  // Deal with vec3 vector operations when widened to vec4.
369 
370  // Deal with vec5 vector operations when widened to vec8.
375 
376  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
377  // and output demarshalling
380 
381  // We can't return success/failure, only the old value,
382  // let LLVM add the comparison
385 
386  if (Subtarget->hasFlatAddressSpace()) {
389  }
390 
393 
394  // FIXME: This should be narrowed to i32, but that only happens if i64 is
395  // illegal.
396  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
399 
400  // On SI this is s_memtime; on VI it is s_memrealtime.
404 
405  if (Subtarget->has16BitInsts()) {
411  }
412 
413  if (Subtarget->hasMadMacF32Insts())
415 
416  if (!Subtarget->hasBFI()) {
417  // fcopysign can be done in a single instruction with BFI.
420  }
421 
422  if (!Subtarget->hasBCNT(32))
424 
425  if (!Subtarget->hasBCNT(64))
427 
428  if (Subtarget->hasFFBH())
430 
431  if (Subtarget->hasFFBL())
433 
434  // We only really have 32-bit BFE instructions (and 16-bit on VI).
435  //
436  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
437  // effort to match them now. We want this to be false for i64 cases when the
438  // extraction isn't restricted to the upper or lower half. Ideally we would
439  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
440  // span the midpoint are probably relatively rare, so don't worry about them
441  // for now.
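  // As an illustration of the midpoint case: extracting bits [8, 24) of an i64
  // stays within the low dword and maps to a single 32-bit BFE, while
  // extracting bits [24, 40) crosses the 32-bit boundary and would need shifts
  // of both halves instead.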
442  if (Subtarget->hasBFE())
443  setHasExtractBitsInsn(true);
444 
445  // Clamp modifier on add/sub
446  if (Subtarget->hasIntClamp()) {
449  }
450 
451  if (Subtarget->hasAddNoCarry()) {
456  }
457 
462 
463 
464  // These are really only legal for ieee_mode functions. We should be avoiding
465  // them for functions that don't have ieee_mode enabled, so just say they are
466  // legal.
471 
472 
473  if (Subtarget->haveRoundOpsF64()) {
477  } else {
482  }
483 
485 
490 
491  if (Subtarget->has16BitInsts()) {
493 
496 
499 
502 
505 
512 
514 
520 
522 
524 
526 
528 
533 
536 
537  // F16 - Constant Actions.
539 
540  // F16 - Load/Store Actions.
545 
546  // F16 - VOP1 Actions.
550 
553 
559 
560  // F16 - VOP2 Actions.
563 
565 
566  // F16 - VOP3 Actions.
568  if (STI.hasMadF16())
570 
571  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
572  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
573  switch (Op) {
574  case ISD::LOAD:
575  case ISD::STORE:
576  case ISD::BUILD_VECTOR:
577  case ISD::BITCAST:
583  break;
584  case ISD::CONCAT_VECTORS:
586  break;
587  default:
589  break;
590  }
591  }
592  }
593 
594  // v_perm_b32 can handle either of these.
598 
599  // XXX - Do these do anything? Vector constants turn into build_vector.
602 
605 
610 
615 
622 
627 
632 
637 
641 
642  if (!Subtarget->hasVOP3PInsts()) {
645  }
646 
648  // This isn't really legal, but this avoids the legalizer unrolling it (and
649  // allows matching fneg (fabs x) patterns)
651 
656 
659 
662  }
663 
664  if (Subtarget->hasVOP3PInsts()) {
675 
680 
684 
687 
689 
692 
695 
702 
707 
712 
716 
719 
723 
727 
728  if (Subtarget->hasPackedFP32Ops()) {
733 
734  for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
738  }
739  }
740  }
741 
744 
745  if (Subtarget->has16BitInsts()) {
750  } else {
751  // Legalization hack.
754 
757  }
758 
761  }
762 
765 
773 
785 
796 
824 
825  // All memory operations. Some folding on the pointer operand is done to help
826  // matching the constant offsets in the addressing modes.
847 
848  // FIXME: In other contexts we pretend this is a per-function property.
849  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
850 
852 }
853 
855  return Subtarget;
856 }
857 
858 //===----------------------------------------------------------------------===//
859 // TargetLowering queries
860 //===----------------------------------------------------------------------===//
861 
862 // v_mad_mix* support a conversion from f16 to f32.
863 //
864 // There is one special case where this is still OK to use when denormals are
865 // enabled, but we don't currently handle it.
866 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
867  EVT DestVT, EVT SrcVT) const {
868  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
869  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
870  DestVT.getScalarType() == MVT::f32 &&
871  SrcVT.getScalarType() == MVT::f16 &&
872  // TODO: This probably only requires no input flushing?
874 }
875 
877  // SI has some legal vector types, but no legal vector operations. Say no
878  // shuffles are legal in order to prefer scalarizing some vector operations.
879  return false;
880 }
881 
883  CallingConv::ID CC,
884  EVT VT) const {
885  if (CC == CallingConv::AMDGPU_KERNEL)
887 
888  if (VT.isVector()) {
889  EVT ScalarVT = VT.getScalarType();
890  unsigned Size = ScalarVT.getSizeInBits();
891  if (Size == 16) {
892  if (Subtarget->has16BitInsts())
893  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
894  return VT.isInteger() ? MVT::i32 : MVT::f32;
895  }
896 
897  if (Size < 16)
898  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
899  return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
900  }
901 
902  if (VT.getSizeInBits() > 32)
903  return MVT::i32;
904 
906 }
907 
909  CallingConv::ID CC,
910  EVT VT) const {
911  if (CC == CallingConv::AMDGPU_KERNEL)
913 
914  if (VT.isVector()) {
915  unsigned NumElts = VT.getVectorNumElements();
916  EVT ScalarVT = VT.getScalarType();
917  unsigned Size = ScalarVT.getSizeInBits();
918 
919  // FIXME: Should probably promote 8-bit vectors to i16.
920  if (Size == 16 && Subtarget->has16BitInsts())
921  return (NumElts + 1) / 2;
922 
923  if (Size <= 32)
924  return NumElts;
925 
926  if (Size > 32)
927  return NumElts * ((Size + 31) / 32);
928  } else if (VT.getSizeInBits() > 32)
929  return (VT.getSizeInBits() + 31) / 32;
930 
932 }
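// For example, with 16-bit instructions available a v7i16 argument packs into
// (7 + 1) / 2 = 4 registers, while a 128-bit scalar on the non-kernel path
// takes (128 + 31) / 32 = 4 i32 registers.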
933 
936  EVT VT, EVT &IntermediateVT,
937  unsigned &NumIntermediates, MVT &RegisterVT) const {
938  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
939  unsigned NumElts = VT.getVectorNumElements();
940  EVT ScalarVT = VT.getScalarType();
941  unsigned Size = ScalarVT.getSizeInBits();
942  // FIXME: We should fix the ABI to be the same on targets without 16-bit
943  // support, but unless we can properly handle 3-vectors, it will still be
944  // inconsistent.
945  if (Size == 16 && Subtarget->has16BitInsts()) {
946  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
947  IntermediateVT = RegisterVT;
948  NumIntermediates = (NumElts + 1) / 2;
949  return NumIntermediates;
950  }
951 
952  if (Size == 32) {
953  RegisterVT = ScalarVT.getSimpleVT();
954  IntermediateVT = RegisterVT;
955  NumIntermediates = NumElts;
956  return NumIntermediates;
957  }
958 
959  if (Size < 16 && Subtarget->has16BitInsts()) {
960  // FIXME: Should probably form v2i16 pieces
961  RegisterVT = MVT::i16;
962  IntermediateVT = ScalarVT;
963  NumIntermediates = NumElts;
964  return NumIntermediates;
965  }
966 
967 
968  if (Size != 16 && Size <= 32) {
969  RegisterVT = MVT::i32;
970  IntermediateVT = ScalarVT;
971  NumIntermediates = NumElts;
972  return NumIntermediates;
973  }
974 
975  if (Size > 32) {
976  RegisterVT = MVT::i32;
977  IntermediateVT = RegisterVT;
978  NumIntermediates = NumElts * ((Size + 31) / 32);
979  return NumIntermediates;
980  }
981  }
982 
984  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
985 }
986 
987 static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
988  assert(DMaskLanes != 0);
989 
990  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
991  unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
992  return EVT::getVectorVT(Ty->getContext(),
993  EVT::getEVT(VT->getElementType()),
994  NumElts);
995  }
996 
997  return EVT::getEVT(Ty);
998 }
999 
1000 // Peek through TFE struct returns to only use the data size.
1001 static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
1002  auto *ST = dyn_cast<StructType>(Ty);
1003  if (!ST)
1004  return memVTFromImageData(Ty, DMaskLanes);
1005 
1006  // Some intrinsics return an aggregate type - special case to work out the
1007  // correct memVT.
1008  //
1009  // Only limited forms of aggregate type currently expected.
1010  if (ST->getNumContainedTypes() != 2 ||
1011  !ST->getContainedType(1)->isIntegerTy(32))
1012  return EVT();
1013  return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
1014 }
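// For example, an image load returning { <4 x float>, i32 } (data plus TFE
// status word) with a dmask covering two channels is recorded as a v2f32
// memory access; only the data member contributes to the memVT.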
1015 
1017  const CallInst &CI,
1018  MachineFunction &MF,
1019  unsigned IntrID) const {
1020  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1021  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1023  (Intrinsic::ID)IntrID);
1024  if (Attr.hasFnAttribute(Attribute::ReadNone))
1025  return false;
1026 
1028 
1029  if (RsrcIntr->IsImage) {
1030  Info.ptrVal =
1032  Info.align.reset();
1033  } else {
1034  Info.ptrVal =
1036  }
1037 
1039  if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
1040  unsigned DMaskLanes = 4;
1041 
1042  if (RsrcIntr->IsImage) {
1045  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1046  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1047 
1048  if (!BaseOpcode->Gather4) {
1049  // If this isn't a gather, we may have excess loaded elements in the
1050  // IR type. Check the dmask for the real number of elements loaded.
1051  unsigned DMask
1052  = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1053  DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1054  }
1055 
1056  Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
1057  } else
1058  Info.memVT = EVT::getEVT(CI.getType());
1059 
1060  // FIXME: What does alignment mean for an image?
1063  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
1064  Info.opc = ISD::INTRINSIC_VOID;
1065 
1066  Type *DataTy = CI.getArgOperand(0)->getType();
1067  if (RsrcIntr->IsImage) {
1068  unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1069  unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1070  Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
1071  } else
1072  Info.memVT = EVT::getEVT(DataTy);
1073 
1075  } else {
1076  // Atomic
1077  Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1079  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1083 
1084  // XXX - Should this be volatile without known ordering?
1086  }
1087  return true;
1088  }
1089 
1090  switch (IntrID) {
1091  case Intrinsic::amdgcn_atomic_inc:
1092  case Intrinsic::amdgcn_atomic_dec:
1093  case Intrinsic::amdgcn_ds_ordered_add:
1094  case Intrinsic::amdgcn_ds_ordered_swap:
1095  case Intrinsic::amdgcn_ds_fadd:
1096  case Intrinsic::amdgcn_ds_fmin:
1097  case Intrinsic::amdgcn_ds_fmax: {
1099  Info.memVT = MVT::getVT(CI.getType());
1100  Info.ptrVal = CI.getOperand(0);
1101  Info.align.reset();
1103 
1104  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1105  if (!Vol->isZero())
1107 
1108  return true;
1109  }
1110  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1112 
1114  Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1115  Info.ptrVal =
1117  Info.align.reset();
1119 
1120  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1121  if (!Vol || !Vol->isZero())
1123 
1124  return true;
1125  }
1126  case Intrinsic::amdgcn_ds_append:
1127  case Intrinsic::amdgcn_ds_consume: {
1129  Info.memVT = MVT::getVT(CI.getType());
1130  Info.ptrVal = CI.getOperand(0);
1131  Info.align.reset();
1133 
1134  const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1135  if (!Vol->isZero())
1137 
1138  return true;
1139  }
1140  case Intrinsic::amdgcn_global_atomic_csub: {
1142  Info.memVT = MVT::getVT(CI.getType());
1143  Info.ptrVal = CI.getOperand(0);
1144  Info.align.reset();
1148  return true;
1149  }
1150  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1153  Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1154  Info.ptrVal =
1156  Info.align.reset();
1159  return true;
1160  }
1161  case Intrinsic::amdgcn_global_atomic_fadd:
1162  case Intrinsic::amdgcn_global_atomic_fmin:
1163  case Intrinsic::amdgcn_global_atomic_fmax:
1164  case Intrinsic::amdgcn_flat_atomic_fadd:
1165  case Intrinsic::amdgcn_flat_atomic_fmin:
1166  case Intrinsic::amdgcn_flat_atomic_fmax: {
1168  Info.memVT = MVT::getVT(CI.getType());
1169  Info.ptrVal = CI.getOperand(0);
1170  Info.align.reset();
1175  return true;
1176  }
1177  case Intrinsic::amdgcn_ds_gws_init:
1178  case Intrinsic::amdgcn_ds_gws_barrier:
1179  case Intrinsic::amdgcn_ds_gws_sema_v:
1180  case Intrinsic::amdgcn_ds_gws_sema_br:
1181  case Intrinsic::amdgcn_ds_gws_sema_p:
1182  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1183  Info.opc = ISD::INTRINSIC_VOID;
1184 
1186  Info.ptrVal =
1188 
1189  // This is an abstract access, but we need to specify a type and size.
1190  Info.memVT = MVT::i32;
1191  Info.size = 4;
1192  Info.align = Align(4);
1193 
1195  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1197  return true;
1198  }
1199  default:
1200  return false;
1201  }
1202 }
1203 
1206  Type *&AccessTy) const {
1207  switch (II->getIntrinsicID()) {
1208  case Intrinsic::amdgcn_atomic_inc:
1209  case Intrinsic::amdgcn_atomic_dec:
1210  case Intrinsic::amdgcn_ds_ordered_add:
1211  case Intrinsic::amdgcn_ds_ordered_swap:
1212  case Intrinsic::amdgcn_ds_append:
1213  case Intrinsic::amdgcn_ds_consume:
1214  case Intrinsic::amdgcn_ds_fadd:
1215  case Intrinsic::amdgcn_ds_fmin:
1216  case Intrinsic::amdgcn_ds_fmax:
1217  case Intrinsic::amdgcn_global_atomic_fadd:
1218  case Intrinsic::amdgcn_flat_atomic_fadd:
1219  case Intrinsic::amdgcn_flat_atomic_fmin:
1220  case Intrinsic::amdgcn_flat_atomic_fmax:
1221  case Intrinsic::amdgcn_global_atomic_csub: {
1222  Value *Ptr = II->getArgOperand(0);
1223  AccessTy = II->getType();
1224  Ops.push_back(Ptr);
1225  return true;
1226  }
1227  default:
1228  return false;
1229  }
1230 }
1231 
1232 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1233  if (!Subtarget->hasFlatInstOffsets()) {
1234  // Flat instructions do not have offsets, and only have the register
1235  // address.
1236  return AM.BaseOffs == 0 && AM.Scale == 0;
1237  }
1238 
1239  return AM.Scale == 0 &&
1240  (AM.BaseOffs == 0 ||
1241  Subtarget->getInstrInfo()->isLegalFLATOffset(
1243 }
1244 
1246  if (Subtarget->hasFlatGlobalInsts())
1247  return AM.Scale == 0 &&
1248  (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1251 
1252  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1253  // Assume that we will use FLAT for all global memory accesses
1254  // on VI.
1255  // FIXME: This assumption is currently wrong. On VI we still use
1256  // MUBUF instructions for the r + i addressing mode. As currently
1257  // implemented, the MUBUF instructions only work on buffer < 4GB.
1258  // It may be possible to support > 4GB buffers with MUBUF instructions,
1259  // by setting the stride value in the resource descriptor which would
1260  // increase the size limit to (stride * 4GB). However, this is risky,
1261  // because it has never been validated.
1262  return isLegalFlatAddressingMode(AM);
1263  }
1264 
1265  return isLegalMUBUFAddressingMode(AM);
1266 }
1267 
1268 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1269  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1270  // additionally can do r + r + i with addr64. 32-bit has more addressing
1271  // mode options. Depending on the resource constant, it can also do
1272  // (i64 r0) + (i32 r1) * (i14 i).
1273  //
1274  // Private arrays end up using a scratch buffer most of the time, so also
1275  // assume those use MUBUF instructions. Scratch loads / stores are currently
1276  // implemented as mubuf instructions with offen bit set, so slightly
1277  // different than the normal addr64.
1278  if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1279  return false;
1280 
1281  // FIXME: Since we can split immediate into soffset and immediate offset,
1282  // would it make sense to allow any immediate?
1283 
1284  switch (AM.Scale) {
1285  case 0: // r + i or just i, depending on HasBaseReg.
1286  return true;
1287  case 1:
1288  return true; // We have r + r or r + i.
1289  case 2:
1290  if (AM.HasBaseReg) {
1291  // Reject 2 * r + r.
1292  return false;
1293  }
1294 
1295  // Allow 2 * r as r + r
1296  // Or 2 * r + i is allowed as r + r + i.
1297  return true;
1298  default: // Don't allow n * r
1299  return false;
1300  }
1301 }
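// For example, an addressing mode of 2 * r + 16 with no extra base register is
// accepted here, since 2 * r can be rewritten as r + r to give the r + r + i
// form addr64 supports; 2 * r combined with another base register, or any
// 3 * r form, is rejected.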
1302 
1304  const AddrMode &AM, Type *Ty,
1305  unsigned AS, Instruction *I) const {
1306  // No global is ever allowed as a base.
1307  if (AM.BaseGV)
1308  return false;
1309 
1310  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1311  return isLegalGlobalAddressingMode(AM);
1312 
1313  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1316  // If the offset isn't a multiple of 4, it probably isn't going to be
1317  // correctly aligned.
1318  // FIXME: Can we get the real alignment here?
1319  if (AM.BaseOffs % 4 != 0)
1320  return isLegalMUBUFAddressingMode(AM);
1321 
1322  // There are no SMRD extloads, so if we have to do a small type access we
1323  // will use a MUBUF load.
1324  // FIXME?: We also need to do this if unaligned, but we don't know the
1325  // alignment here.
1326  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1327  return isLegalGlobalAddressingMode(AM);
1328 
1329  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1330  // SMRD instructions have an 8-bit, dword offset on SI.
1331  if (!isUInt<8>(AM.BaseOffs / 4))
1332  return false;
1333  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1334  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1335  // in 8-bits, it can use a smaller encoding.
1336  if (!isUInt<32>(AM.BaseOffs / 4))
1337  return false;
1338  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1339  // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1340  if (!isUInt<20>(AM.BaseOffs))
1341  return false;
1342  } else
1343  llvm_unreachable("unhandled generation");
1344 
1345  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1346  return true;
1347 
1348  if (AM.Scale == 1 && AM.HasBaseReg)
1349  return true;
1350 
1351  return false;
1352 
1353  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1354  return isLegalMUBUFAddressingMode(AM);
1355  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1356  AS == AMDGPUAS::REGION_ADDRESS) {
1357  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1358  // field.
1359  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1360  // an 8-bit dword offset but we don't know the alignment here.
1361  if (!isUInt<16>(AM.BaseOffs))
1362  return false;
1363 
1364  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1365  return true;
1366 
1367  if (AM.Scale == 1 && AM.HasBaseReg)
1368  return true;
1369 
1370  return false;
1371  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1373  // For an unknown address space, this usually means that this is for some
1374  // reason being used for pure arithmetic, and not based on some addressing
1375  // computation. We don't have instructions that compute pointers with any
1376  // addressing modes, so treat them as having no offset like flat
1377  // instructions.
1378  return isLegalFlatAddressingMode(AM);
1379  }
1380 
1381  // Assume a user alias of global for unknown address spaces.
1382  return isLegalGlobalAddressingMode(AM);
1383 }
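// For example, a constant-address access with a 1024-byte offset is legal on
// CI and VI (it fits the 32-bit literal and 20-bit byte offsets respectively)
// but not on SI, where the 8-bit dword offset tops out at 255 * 4 = 1020 bytes.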
1384 
1385 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1386  const SelectionDAG &DAG) const {
1387  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1388  return (MemVT.getSizeInBits() <= 4 * 32);
1389  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1390  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1391  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1392  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1393  return (MemVT.getSizeInBits() <= 2 * 32);
1394  }
1395  return true;
1396 }
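// For example, four adjacent i32 stores to global or flat memory (128 bits
// total) may be merged, but LDS and region stores are only merged up to
// 2 * 32 bits here.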
1397 
1399  unsigned Size, unsigned AddrSpace, Align Alignment,
1400  MachineMemOperand::Flags Flags, bool *IsFast) const {
1401  if (IsFast)
1402  *IsFast = false;
1403 
1404  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1405  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1406  // Check if alignment requirements for ds_read/write instructions are
1407  // disabled.
1408  if (Subtarget->hasUnalignedDSAccessEnabled() &&
1409  !Subtarget->hasLDSMisalignedBug()) {
1410  if (IsFast)
1411  *IsFast = Alignment != Align(2);
1412  return true;
1413  }
1414 
1415  // Either the alignment requirements are "enabled", or there is an
1416  // unaligned LDS access related hardware bug even though alignment
1417  // requirements are "disabled". In either case, we need to check for proper
1418  // alignment requirements.
1419  //
1420  if (Size == 64) {
1421  // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1422  // can do a 4-byte aligned, 8-byte access in a single operation using
1423  // ds_read2/write2_b32 with adjacent offsets.
1424  bool AlignedBy4 = Alignment >= Align(4);
1425  if (IsFast)
1426  *IsFast = AlignedBy4;
1427 
1428  return AlignedBy4;
1429  }
1430  if (Size == 96) {
1431  // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1432  // gfx8 and older.
1433  bool AlignedBy16 = Alignment >= Align(16);
1434  if (IsFast)
1435  *IsFast = AlignedBy16;
1436 
1437  return AlignedBy16;
1438  }
1439  if (Size == 128) {
1440  // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1441  // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1442  // single operation using ds_read2/write2_b64.
1443  bool AlignedBy8 = Alignment >= Align(8);
1444  if (IsFast)
1445  *IsFast = AlignedBy8;
1446 
1447  return AlignedBy8;
1448  }
1449  }
1450 
1451  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1452  bool AlignedBy4 = Alignment >= Align(4);
1453  if (IsFast)
1454  *IsFast = AlignedBy4;
1455 
1456  return AlignedBy4 ||
1457  Subtarget->enableFlatScratch() ||
1458  Subtarget->hasUnalignedScratchAccess();
1459  }
1460 
1461  // FIXME: We have to be conservative here and assume that flat operations
1462  // will access scratch. If we had access to the IR function, then we
1463  // could determine if any private memory was used in the function.
1464  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1465  !Subtarget->hasUnalignedScratchAccess()) {
1466  bool AlignedBy4 = Alignment >= Align(4);
1467  if (IsFast)
1468  *IsFast = AlignedBy4;
1469 
1470  return AlignedBy4;
1471  }
1472 
1473  if (Subtarget->hasUnalignedBufferAccessEnabled() &&
1474  !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1475  AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1476  // If we have a uniform constant load, it still requires using a slow
1477  // buffer instruction if unaligned.
1478  if (IsFast) {
1479  // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1480  // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1481  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1482  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1483  Alignment >= Align(4) : Alignment != Align(2);
1484  }
1485 
1486  return true;
1487  }
1488 
1489  // Values smaller than a dword must be aligned.
1490  if (Size < 32)
1491  return false;
1492 
1493  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1494  // byte-address are ignored, thus forcing Dword alignment.
1495  // This applies to private, global, and constant memory.
1496  if (IsFast)
1497  *IsFast = true;
1498 
1499  return Size >= 32 && Alignment >= Align(4);
1500 }
1501 
1503  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1504  bool *IsFast) const {
1505  if (IsFast)
1506  *IsFast = false;
1507 
1508  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1509  // which isn't a simple VT.
1510  // Until MVT is extended to handle this, simply check for the size and
1511  // rely on the condition below: allow accesses if the size is a multiple of 4.
1512  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1513  VT.getStoreSize() > 16)) {
1514  return false;
1515  }
1516 
1517  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1518  Alignment, Flags, IsFast);
1519 }
1520 
1522  const MemOp &Op, const AttributeList &FuncAttributes) const {
1523  // FIXME: Should account for address space here.
1524 
1525  // The default fallback uses the private pointer size as a guess for a type to
1526  // use. Make sure we switch these to 64-bit accesses.
1527 
1528  if (Op.size() >= 16 &&
1529  Op.isDstAligned(Align(4))) // XXX: Should only do for global
1530  return MVT::v4i32;
1531 
1532  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1533  return MVT::v2i32;
1534 
1535  // Use the default.
1536  return MVT::Other;
1537 }
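// For example, a 32-byte memcpy whose destination is known to be 4-byte
// aligned is lowered with v4i32 (dwordx4) accesses instead of the narrower
// default based on the private pointer size.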
1538 
1540  const MemSDNode *MemNode = cast<MemSDNode>(N);
1541  const Value *Ptr = MemNode->getMemOperand()->getValue();
1542  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1543  return I && I->getMetadata("amdgpu.noclobber");
1544 }
1545 
1547  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1549 }
1550 
1552  unsigned DestAS) const {
1553  // Flat -> private/local is a simple truncate.
1554  // Flat -> global is no-op
1555  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1556  return true;
1557 
1558  const GCNTargetMachine &TM =
1559  static_cast<const GCNTargetMachine &>(getTargetMachine());
1560  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1561 }
1562 
1564  const MemSDNode *MemNode = cast<MemSDNode>(N);
1565 
1566  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1567 }
1568 
1571  int NumElts = VT.getVectorNumElements();
1572  if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
1575 }
1576 
1578  Type *Ty) const {
1579  // FIXME: Could be smarter if called for vector constants.
1580  return true;
1581 }
1582 
1584  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1585  switch (Op) {
1586  case ISD::LOAD:
1587  case ISD::STORE:
1588 
1589  // These operations are done with 32-bit instructions anyway.
1590  case ISD::AND:
1591  case ISD::OR:
1592  case ISD::XOR:
1593  case ISD::SELECT:
1594  // TODO: Extensions?
1595  return true;
1596  default:
1597  return false;
1598  }
1599  }
1600 
1601  // SimplifySetCC uses this function to determine whether or not it should
1602  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1603  if (VT == MVT::i1 && Op == ISD::SETCC)
1604  return false;
1605 
1607 }
1608 
1609 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1610  const SDLoc &SL,
1611  SDValue Chain,
1612  uint64_t Offset) const {
1613  const DataLayout &DL = DAG.getDataLayout();
1614  MachineFunction &MF = DAG.getMachineFunction();
1616 
1617  const ArgDescriptor *InputPtrReg;
1618  const TargetRegisterClass *RC;
1619  LLT ArgTy;
1620 
1621  std::tie(InputPtrReg, RC, ArgTy) =
1623 
1626  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1627  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1628 
1629  return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1630 }
1631 
1632 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1633  const SDLoc &SL) const {
1635  FIRST_IMPLICIT);
1636  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1637 }
1638 
1639 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1640  const SDLoc &SL, SDValue Val,
1641  bool Signed,
1642  const ISD::InputArg *Arg) const {
1643  // First, if it is a widened vector, narrow it.
1644  if (VT.isVector() &&
1645  VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1646  EVT NarrowedVT =
1648  VT.getVectorNumElements());
1649  Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1650  DAG.getConstant(0, SL, MVT::i32));
1651  }
1652 
1653  // Then convert the vector elements or scalar value.
1654  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1655  VT.bitsLT(MemVT)) {
1656  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1657  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1658  }
1659 
1660  if (MemVT.isFloatingPoint())
1661  Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1662  else if (Signed)
1663  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1664  else
1665  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1666 
1667  return Val;
1668 }
1669 
1670 SDValue SITargetLowering::lowerKernargMemParameter(
1671  SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1672  uint64_t Offset, Align Alignment, bool Signed,
1673  const ISD::InputArg *Arg) const {
1675 
1676  // Try to avoid using an extload by loading earlier than the argument address,
1677  // and extracting the relevant bits. The load should hopefully be merged with
1678  // the previous argument.
1679  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1680  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1681  int64_t AlignDownOffset = alignDown(Offset, 4);
1682  int64_t OffsetDiff = Offset - AlignDownOffset;
1683 
1684  EVT IntVT = MemVT.changeTypeToInteger();
1685 
1686  // TODO: If we passed in the base kernel offset we could have a better
1687  // alignment than 4, but we don't really need it.
1688  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1689  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1692 
1693  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1694  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1695 
1696  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1697  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1698  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1699 
1700 
1701  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1702  }
1703 
1704  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1705  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1708 
1709  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1710  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1711 }
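// For example, an i16 kernel argument at byte offset 6 is loaded as the
// aligned i32 at offset 4 (AlignDownOffset), shifted right by
// OffsetDiff * 8 = 16 bits and truncated, avoiding a sub-dword extload and
// allowing the load to merge with an argument at offset 4.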
1712 
1713 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1714  const SDLoc &SL, SDValue Chain,
1715  const ISD::InputArg &Arg) const {
1716  MachineFunction &MF = DAG.getMachineFunction();
1717  MachineFrameInfo &MFI = MF.getFrameInfo();
1718 
1719  if (Arg.Flags.isByVal()) {
1720  unsigned Size = Arg.Flags.getByValSize();
1721  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1722  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1723  }
1724 
1725  unsigned ArgOffset = VA.getLocMemOffset();
1726  unsigned ArgSize = VA.getValVT().getStoreSize();
1727 
1728  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1729 
1730  // Create load nodes to retrieve arguments from the stack.
1731  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1732  SDValue ArgValue;
1733 
1734  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1736  MVT MemVT = VA.getValVT();
1737 
1738  switch (VA.getLocInfo()) {
1739  default:
1740  break;
1741  case CCValAssign::BCvt:
1742  MemVT = VA.getLocVT();
1743  break;
1744  case CCValAssign::SExt:
1745  ExtType = ISD::SEXTLOAD;
1746  break;
1747  case CCValAssign::ZExt:
1748  ExtType = ISD::ZEXTLOAD;
1749  break;
1750  case CCValAssign::AExt:
1751  ExtType = ISD::EXTLOAD;
1752  break;
1753  }
1754 
1755  ArgValue = DAG.getExtLoad(
1756  ExtType, SL, VA.getLocVT(), Chain, FIN,
1758  MemVT);
1759  return ArgValue;
1760 }
1761 
1762 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1763  const SIMachineFunctionInfo &MFI,
1764  EVT VT,
1766  const ArgDescriptor *Reg;
1767  const TargetRegisterClass *RC;
1768  LLT Ty;
1769 
1770  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1771  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1772 }
1773 
1775  CallingConv::ID CallConv,
1777  FunctionType *FType,
1779  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1780  const ISD::InputArg *Arg = &Ins[I];
1781 
1782  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1783  "vector type argument should have been split");
1784 
1785  // First check if it's a PS input addr.
1786  if (CallConv == CallingConv::AMDGPU_PS &&
1787  !Arg->Flags.isInReg() && PSInputNum <= 15) {
1788  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1789 
1790  // Inconveniently only the first part of the split is marked as isSplit,
1791  // so skip to the end. We only want to increment PSInputNum once for the
1792  // entire split argument.
1793  if (Arg->Flags.isSplit()) {
1794  while (!Arg->Flags.isSplitEnd()) {
1795  assert((!Arg->VT.isVector() ||
1796  Arg->VT.getScalarSizeInBits() == 16) &&
1797  "unexpected vector split in ps argument type");
1798  if (!SkipArg)
1799  Splits.push_back(*Arg);
1800  Arg = &Ins[++I];
1801  }
1802  }
1803 
1804  if (SkipArg) {
1805  // We can safely skip PS inputs.
1806  Skipped.set(Arg->getOrigArgIndex());
1807  ++PSInputNum;
1808  continue;
1809  }
1810 
1811  Info->markPSInputAllocated(PSInputNum);
1812  if (Arg->Used)
1813  Info->markPSInputEnabled(PSInputNum);
1814 
1815  ++PSInputNum;
1816  }
1817 
1818  Splits.push_back(*Arg);
1819  }
1820 }
1821 
1822 // Allocate special inputs passed in VGPRs.
1824  MachineFunction &MF,
1825  const SIRegisterInfo &TRI,
1826  SIMachineFunctionInfo &Info) const {
1827  const LLT S32 = LLT::scalar(32);
1829 
1830  if (Info.hasWorkItemIDX()) {
1831  Register Reg = AMDGPU::VGPR0;
1832  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1833 
1834  CCInfo.AllocateReg(Reg);
1835  unsigned Mask = (Subtarget->hasPackedTID() &&
1836  Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1837  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1838  }
1839 
1840  if (Info.hasWorkItemIDY()) {
1841  assert(Info.hasWorkItemIDX());
1842  if (Subtarget->hasPackedTID()) {
1843  Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1844  0x3ff << 10));
1845  } else {
1846  unsigned Reg = AMDGPU::VGPR1;
1847  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1848 
1849  CCInfo.AllocateReg(Reg);
1850  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1851  }
1852  }
1853 
1854  if (Info.hasWorkItemIDZ()) {
1855  assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1856  if (Subtarget->hasPackedTID()) {
1857  Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1858  0x3ff << 20));
1859  } else {
1860  unsigned Reg = AMDGPU::VGPR2;
1861  MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1862 
1863  CCInfo.AllocateReg(Reg);
1864  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1865  }
1866  }
1867 }
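// With packed TIDs, all three workitem IDs share VGPR0, laid out as
// (IDZ << 20) | (IDY << 10) | IDX with 10-bit fields (the 0x3ff masks above);
// otherwise IDX, IDY and IDZ arrive in VGPR0, VGPR1 and VGPR2 respectively.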
1868 
1869 // Try to allocate a VGPR at the end of the argument list, or if no argument
1870 // VGPRs are left, allocate a stack slot.
1871 // If \p Mask is given it indicates the bitfield position in the register.
1872 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
1873 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1875  if (Arg.isSet())
1877 
1878  ArrayRef<MCPhysReg> ArgVGPRs
1879  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1880  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1881  if (RegIdx == ArgVGPRs.size()) {
1882  // Spill to stack required.
1883  int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1884 
1886  }
1887 
1888  unsigned Reg = ArgVGPRs[RegIdx];
1889  Reg = CCInfo.AllocateReg(Reg);
1890  assert(Reg != AMDGPU::NoRegister);
1891 
1892  MachineFunction &MF = CCInfo.getMachineFunction();
1893  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1894  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1896 }
1897 
1899  const TargetRegisterClass *RC,
1900  unsigned NumArgRegs) {
1901  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1902  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1903  if (RegIdx == ArgSGPRs.size())
1904  report_fatal_error("ran out of SGPRs for arguments");
1905 
1906  unsigned Reg = ArgSGPRs[RegIdx];
1907  Reg = CCInfo.AllocateReg(Reg);
1908  assert(Reg != AMDGPU::NoRegister);
1909 
1910  MachineFunction &MF = CCInfo.getMachineFunction();
1911  MF.addLiveIn(Reg, RC);
1913 }
1914 
1915 // If this has a fixed position, we still should allocate the register in the
1916 // CCInfo state. Technically we could get away with this for values passed
1917 // outside of the normal argument range.
1919  const TargetRegisterClass *RC,
1920  MCRegister Reg) {
1921  Reg = CCInfo.AllocateReg(Reg);
1922  assert(Reg != AMDGPU::NoRegister);
1923  MachineFunction &MF = CCInfo.getMachineFunction();
1924  MF.addLiveIn(Reg, RC);
1925 }
1926 
1928  if (Arg) {
1929  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
1930  Arg.getRegister());
1931  } else
1932  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1933 }
1934 
1936  if (Arg) {
1937  allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
1938  Arg.getRegister());
1939  } else
1940  Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1941 }
1942 
1943 /// Allocate implicit function VGPR arguments at the end of allocated user
1944 /// arguments.
1946  CCState &CCInfo, MachineFunction &MF,
1947  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1948  const unsigned Mask = 0x3ff;
1950 
1951  if (Info.hasWorkItemIDX()) {
1952  Arg = allocateVGPR32Input(CCInfo, Mask);
1953  Info.setWorkItemIDX(Arg);
1954  }
1955 
1956  if (Info.hasWorkItemIDY()) {
1957  Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
1958  Info.setWorkItemIDY(Arg);
1959  }
1960 
1961  if (Info.hasWorkItemIDZ())
1962  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
1963 }
1964 
1965 /// Allocate implicit function VGPR arguments in fixed registers.
1967  CCState &CCInfo, MachineFunction &MF,
1968  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1969  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
1970  if (!Reg)
1971  report_fatal_error("failed to allocated VGPR for implicit arguments");
1972 
1973  const unsigned Mask = 0x3ff;
1974  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1975  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
1976  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
1977 }
1978 
1980  CCState &CCInfo,
1981  MachineFunction &MF,
1982  const SIRegisterInfo &TRI,
1983  SIMachineFunctionInfo &Info) const {
1984  auto &ArgInfo = Info.getArgInfo();
1985 
1986  // TODO: Unify handling with private memory pointers.
1987 
1988  if (Info.hasDispatchPtr())
1989  allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
1990 
1991  if (Info.hasQueuePtr())
1992  allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
1993 
1994  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
1995  // constant offset from the kernarg segment.
1996  if (Info.hasImplicitArgPtr())
1997  allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
1998 
1999  if (Info.hasDispatchID())
2000  allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2001 
2002  // flat_scratch_init is not applicable for non-kernel functions.
2003 
2004  if (Info.hasWorkGroupIDX())
2005  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2006 
2007  if (Info.hasWorkGroupIDY())
2008  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2009 
2010  if (Info.hasWorkGroupIDZ())
2011  allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2012 }
2013 
2014 // Allocate special inputs passed in user SGPRs.
2016  MachineFunction &MF,
2017  const SIRegisterInfo &TRI,
2018  SIMachineFunctionInfo &Info) const {
2019  if (Info.hasImplicitBufferPtr()) {
2020  Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2021  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2022  CCInfo.AllocateReg(ImplicitBufferPtrReg);
2023  }
2024 
2025  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2026  if (Info.hasPrivateSegmentBuffer()) {
2027  Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2028  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2029  CCInfo.AllocateReg(PrivateSegmentBufferReg);
2030  }
2031 
2032  if (Info.hasDispatchPtr()) {
2033  Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2034  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2035  CCInfo.AllocateReg(DispatchPtrReg);
2036  }
2037 
2038  if (Info.hasQueuePtr()) {
2039  Register QueuePtrReg = Info.addQueuePtr(TRI);
2040  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2041  CCInfo.AllocateReg(QueuePtrReg);
2042  }
2043 
2044  if (Info.hasKernargSegmentPtr()) {
2046  Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2047  CCInfo.AllocateReg(InputPtrReg);
2048 
2049  Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2051  }
2052 
2053  if (Info.hasDispatchID()) {
2054  Register DispatchIDReg = Info.addDispatchID(TRI);
2055  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2056  CCInfo.AllocateReg(DispatchIDReg);
2057  }
2058 
2059  if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2060  Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2061  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2062  CCInfo.AllocateReg(FlatScratchInitReg);
2063  }
2064 
2065  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2066  // these from the dispatch pointer.
2067 }
2068 
2069 // Allocate special input registers that are initialized per-wave.
2071  MachineFunction &MF,
2073  CallingConv::ID CallConv,
2074  bool IsShader) const {
2075  if (Info.hasWorkGroupIDX()) {
2076  Register Reg = Info.addWorkGroupIDX();
2077  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2078  CCInfo.AllocateReg(Reg);
2079  }
2080 
2081  if (Info.hasWorkGroupIDY()) {
2082  Register Reg = Info.addWorkGroupIDY();
2083  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2084  CCInfo.AllocateReg(Reg);
2085  }
2086 
2087  if (Info.hasWorkGroupIDZ()) {
2088  Register Reg = Info.addWorkGroupIDZ();
2089  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2090  CCInfo.AllocateReg(Reg);
2091  }
2092 
2093  if (Info.hasWorkGroupInfo()) {
2094  Register Reg = Info.addWorkGroupInfo();
2095  MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2096  CCInfo.AllocateReg(Reg);
2097  }
2098 
2099  if (Info.hasPrivateSegmentWaveByteOffset()) {
2100  // Scratch wave offset passed in system SGPR.
2101  unsigned PrivateSegmentWaveByteOffsetReg;
2102 
2103  if (IsShader) {
2104  PrivateSegmentWaveByteOffsetReg =
2105  Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2106 
2107  // This is true if the scratch wave byte offset doesn't have a fixed
2108  // location.
2109  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2110  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2111  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2112  }
2113  } else
2114  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2115 
2116  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2117  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2118  }
2119 }
2120 
2122  MachineFunction &MF,
2123  const SIRegisterInfo &TRI,
2125  // Now that we've figured out where the scratch register inputs are, see if
2126  // we should reserve the arguments and use them directly.
2127  MachineFrameInfo &MFI = MF.getFrameInfo();
2128  bool HasStackObjects = MFI.hasStackObjects();
2129  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2130 
2131  // Record that we know we have non-spill stack objects so we don't need to
2132  // check all stack objects later.
2133  if (HasStackObjects)
2134  Info.setHasNonSpillStackObjects(true);
2135 
2136  // Everything live out of a block is spilled with fast regalloc, so it's
2137  // almost certain that spilling will be required.
2138  if (TM.getOptLevel() == CodeGenOpt::None)
2139  HasStackObjects = true;
2140 
2141  // For now assume stack access is needed in any callee functions, so we need
2142  // the scratch registers to pass in.
2143  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2144 
2145  if (!ST.enableFlatScratch()) {
2146  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2147  // If we have stack objects, we unquestionably need the private buffer
2148  // resource. For the Code Object V2 ABI, this will be the first 4 user
2149  // SGPR inputs. We can reserve those and use them directly.
2150 
2151  Register PrivateSegmentBufferReg =
2153  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2154  } else {
2155  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2156  // We tentatively reserve the last registers (skipping the last registers
2157  // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2158  // we'll replace these with the ones immediately after those which were
2159  // really allocated. In the prologue copies will be inserted from the
2160  // argument to these reserved registers.
2161 
2162  // Without HSA, relocations are used for the scratch pointer and the
2163  // buffer resource setup is always inserted in the prologue. Scratch wave
2164  // offset is still in an input SGPR.
2165  Info.setScratchRSrcReg(ReservedBufferReg);
2166  }
2167  }
2168 
2170 
2171  // For entry functions we have to set up the stack pointer if we use it,
2172  // whereas non-entry functions get this "for free". This means there is no
2173  // intrinsic advantage to using S32 over S34 in cases where we do not have
2174  // calls but do need a frame pointer (i.e. if we are requested to have one
2175  // because frame pointer elimination is disabled). To keep things simple we
2176  // only ever use S32 as the call ABI stack pointer, and so using it does not
2177  // imply we need a separate frame pointer.
2178  //
2179  // Try to use s32 as the SP, but move it if it would interfere with input
2180  // arguments. This won't work with calls though.
2181  //
2182  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2183  // registers.
2184  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2185  Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2186  } else {
2188 
2189  if (MFI.hasCalls())
2190  report_fatal_error("call in graphics shader with too many input SGPRs");
2191 
2192  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2193  if (!MRI.isLiveIn(Reg)) {
2194  Info.setStackPtrOffsetReg(Reg);
2195  break;
2196  }
2197  }
2198 
2199  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2200  report_fatal_error("failed to find register for SP");
2201  }
2202 
2203  // hasFP should be accurate for entry functions even before the frame is
2204  // finalized, because it does not rely on the known stack size, only
2205  // properties like whether variable sized objects are present.
2206  if (ST.getFrameLowering()->hasFP(MF)) {
2207  Info.setFrameOffsetReg(AMDGPU::SGPR33);
2208  }
2209 }
2210 
2213  return !Info->isEntryFunction();
2214 }
2215 
2217 
2218 }
2219 
2221  MachineBasicBlock *Entry,
2222  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2224 
2225  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2226  if (!IStart)
2227  return;
2228 
2229  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2230  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2231  MachineBasicBlock::iterator MBBI = Entry->begin();
2232  for (const MCPhysReg *I = IStart; *I; ++I) {
2233  const TargetRegisterClass *RC = nullptr;
2234  if (AMDGPU::SReg_64RegClass.contains(*I))
2235  RC = &AMDGPU::SGPR_64RegClass;
2236  else if (AMDGPU::SReg_32RegClass.contains(*I))
2237  RC = &AMDGPU::SGPR_32RegClass;
2238  else
2239  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2240 
2241  Register NewVR = MRI->createVirtualRegister(RC);
2242  // Create copy from CSR to a virtual register.
2243  Entry->addLiveIn(*I);
2244  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2245  .addReg(*I);
2246 
2247  // Insert the copy-back instructions right before the terminator.
2248  for (auto *Exit : Exits)
2249  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2250  TII->get(TargetOpcode::COPY), *I)
2251  .addReg(NewVR);
2252  }
2253 }
2254 
2256  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2257  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2258  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2260 
2261  MachineFunction &MF = DAG.getMachineFunction();
2262  const Function &Fn = MF.getFunction();
2263  FunctionType *FType = MF.getFunction().getFunctionType();
2265 
2266  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2267  DiagnosticInfoUnsupported NoGraphicsHSA(
2268  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2269  DAG.getContext()->diagnose(NoGraphicsHSA);
2270  return DAG.getEntryNode();
2271  }
2272 
2273  Info->allocateModuleLDSGlobal(Fn.getParent());
2274 
2277  BitVector Skipped(Ins.size());
2278  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2279  *DAG.getContext());
2280 
2281  bool IsGraphics = AMDGPU::isGraphics(CallConv);
2282  bool IsKernel = AMDGPU::isKernel(CallConv);
2283  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2284 
2285  if (IsGraphics) {
2286  assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2287  (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2288  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2289  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2290  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2291  !Info->hasWorkItemIDZ());
2292  }
2293 
2294  if (CallConv == CallingConv::AMDGPU_PS) {
2295  processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2296 
2297  // At least one interpolation mode must be enabled or else the GPU will
2298  // hang.
2299  //
2300  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2301  // set PSInputAddr, the user wants to enable some bits after the compilation
2302  // based on run-time states. Since we can't know what the final PSInputEna
2303  // will look like, so we shouldn't do anything here and the user should take
2304  // responsibility for the correct programming.
2305  //
2306  // Otherwise, the following restrictions apply:
2307  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2308  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2309  // enabled too.
2310  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2311  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2312  CCInfo.AllocateReg(AMDGPU::VGPR0);
2313  CCInfo.AllocateReg(AMDGPU::VGPR1);
2314  Info->markPSInputAllocated(0);
2315  Info->markPSInputEnabled(0);
2316  }
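  // Worked example (hypothetical value; helper name is illustrative only, it
  // simply mirrors the condition above): a pixel shader whose PSInputAddr is
  // 0x800 has only POS_W_FLOAT (bit 11) set, so (PSInputAddr & 0x7F) == 0 and
  // the block above force-enables PERSP_SAMPLE (input 0) and reserves
  // VGPR0/VGPR1, giving the hardware at least one interpolant to load.
  //
  //   bool needsDummyInterpolant(unsigned PSInputAddr, bool PosWAllocated) {
  //     return (PSInputAddr & 0x7F) == 0 ||
  //            ((PSInputAddr & 0xF) == 0 && PosWAllocated);
  //   }
  //   // needsDummyInterpolant(0x800, /*PosWAllocated=*/true) == true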
2317  if (Subtarget->isAmdPalOS()) {
2318  // For isAmdPalOS, the user does not enable some bits after compilation
2319  // based on run-time states; the register values being generated here are
2320  // the final ones set in hardware. Therefore we need to apply the
2321  // workaround to PSInputAddr and PSInputEnable together. (The case where
2322  // a bit is set in PSInputAddr but not PSInputEnable is where the
2323  // frontend set up an input arg for a particular interpolation mode, but
2324  // nothing uses that input arg. Really we should have an earlier pass
2325  // that removes such an arg.)
2326  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2327  if ((PsInputBits & 0x7F) == 0 ||
2328  ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2329  Info->markPSInputEnabled(
2330  countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2331  }
2332  } else if (IsKernel) {
2333  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2334  } else {
2335  Splits.append(Ins.begin(), Ins.end());
2336  }
2337 
2338  if (IsEntryFunc) {
2339  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2340  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2341  } else {
2342  // For the fixed ABI, pass workitem IDs in the last argument register.
2344  allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2345  }
2346 
2347  if (IsKernel) {
2349  } else {
2350  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2351  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2352  }
2353 
2354  SmallVector<SDValue, 16> Chains;
2355 
2356  // FIXME: This is the minimum kernel argument alignment. We should improve
2357  // this to the maximum alignment of the arguments.
2358  //
2359  // FIXME: Alignment of explicit arguments is totally broken with a non-zero
2360  // explicit kernarg offset.
2361  const Align KernelArgBaseAlign = Align(16);
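  // Example of the commonAlignment() computation used below (offsets are
  // hypothetical): with KernelArgBaseAlign = 16, the result is the largest
  // power of two dividing both the base alignment and the argument offset.
  //
  //   commonAlignment(Align(16), 0)  == Align(16)
  //   commonAlignment(Align(16), 8)  == Align(8)
  //   commonAlignment(Align(16), 36) == Align(4)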
2362 
2363  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2364  const ISD::InputArg &Arg = Ins[i];
2365  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2366  InVals.push_back(DAG.getUNDEF(Arg.VT));
2367  continue;
2368  }
2369 
2370  CCValAssign &VA = ArgLocs[ArgIdx++];
2371  MVT VT = VA.getLocVT();
2372 
2373  if (IsEntryFunc && VA.isMemLoc()) {
2374  VT = Ins[i].VT;
2375  EVT MemVT = VA.getLocVT();
2376 
2377  const uint64_t Offset = VA.getLocMemOffset();
2378  Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2379 
2380  if (Arg.Flags.isByRef()) {
2381  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2382 
2383  const GCNTargetMachine &TM =
2384  static_cast<const GCNTargetMachine &>(getTargetMachine());
2385  if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2386  Arg.Flags.getPointerAddrSpace())) {
2387  Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2388  Arg.Flags.getPointerAddrSpace());
2389  }
2390 
2391  InVals.push_back(Ptr);
2392  continue;
2393  }
2394 
2395  SDValue Arg = lowerKernargMemParameter(
2396  DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2397  Chains.push_back(Arg.getValue(1));
2398 
2399  auto *ParamTy =
2400  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2401  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2402  ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2403  ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2404  // On SI local pointers are just offsets into LDS, so they are always
2405  // less than 16-bits. On CI and newer they could potentially be
2406  // real pointers, so we can't guarantee their size.
2407  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2408  DAG.getValueType(MVT::i16));
2409  }
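  // Illustration (hypothetical kernel argument, not from the source): for an
  // argument of type "i32 addrspace(3)* %lds.ptr" on SI, the loaded value is
  // wrapped roughly as
  //
  //   t2 = AssertZext t1, ValueType:i16
  //
  // letting later combines assume the upper bits are zero, since an LDS
  // "pointer" on SI is just a byte offset that fits in 16 bits.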
2410 
2411  InVals.push_back(Arg);
2412  continue;
2413  } else if (!IsEntryFunc && VA.isMemLoc()) {
2414  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2415  InVals.push_back(Val);
2416  if (!Arg.Flags.isByVal())
2417  Chains.push_back(Val.getValue(1));
2418  continue;
2419  }
2420 
2421  assert(VA.isRegLoc() && "Parameter must be in a register!");
2422 
2423  Register Reg = VA.getLocReg();
2425  EVT ValVT = VA.getValVT();
2426 
2427  Reg = MF.addLiveIn(Reg, RC);
2428  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2429 
2430  if (Arg.Flags.isSRet()) {
2431  // The return object should be reasonably addressable.
2432 
2433  // FIXME: This helps when the return is a real sret. If it is an
2434  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2435  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2436  unsigned NumBits
2438  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2439  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2440  }
2441 
2442  // If this is an 8 or 16-bit value, it is really passed promoted
2443  // to 32 bits. Insert an assert[sz]ext to capture this, then
2444  // truncate to the right size.
2445  switch (VA.getLocInfo()) {
2446  case CCValAssign::Full:
2447  break;
2448  case CCValAssign::BCvt:
2449  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2450  break;
2451  case CCValAssign::SExt:
2452  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2453  DAG.getValueType(ValVT));
2454  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2455  break;
2456  case CCValAssign::ZExt:
2457  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2458  DAG.getValueType(ValVT));
2459  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2460  break;
2461  case CCValAssign::AExt:
2462  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2463  break;
2464  default:
2465  llvm_unreachable("Unknown loc info!");
2466  }
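  // Worked example (hypothetical argument): an i16 parameter passed in a
  // 32-bit register with LocInfo == SExt lowers roughly to
  //
  //   t1 = CopyFromReg ..., i32
  //   t2 = AssertSext t1, ValueType:i16
  //   t3 = truncate t2 to i16
  //
  // so the consumer sees an i16 whose sign-extended-ness is still visible to
  // later DAG combines.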
2467 
2468  InVals.push_back(Val);
2469  }
2470 
2471  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
2472  // Special inputs come after user arguments.
2473  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2474  }
2475 
2476  // Start adding system SGPRs.
2477  if (IsEntryFunc) {
2478  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2479  } else {
2480  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2481  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2482  }
2483 
2484  auto &ArgUsageInfo =
2486  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2487 
2488  unsigned StackArgSize = CCInfo.getNextStackOffset();
2489  Info->setBytesInStackArgArea(StackArgSize);
2490 
2491  return Chains.empty() ? Chain :
2492  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2493 }
2494 
2495 // TODO: If return values can't fit in registers, we should return as many as
2496 // possible in registers before passing on stack.
2498  CallingConv::ID CallConv,
2499  MachineFunction &MF, bool IsVarArg,
2500  const SmallVectorImpl<ISD::OutputArg> &Outs,
2501  LLVMContext &Context) const {
2502  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2503  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2504  // for shaders. Vector types should be explicitly handled by CC.
2505  if (AMDGPU::isEntryFunctionCC(CallConv))
2506  return true;
2507 
2509  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2510  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2511 }
2512 
2513 SDValue
2515  bool isVarArg,
2516  const SmallVectorImpl<ISD::OutputArg> &Outs,
2517  const SmallVectorImpl<SDValue> &OutVals,
2518  const SDLoc &DL, SelectionDAG &DAG) const {
2519  MachineFunction &MF = DAG.getMachineFunction();
2521 
2522  if (AMDGPU::isKernel(CallConv)) {
2523  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2524  OutVals, DL, DAG);
2525  }
2526 
2527  bool IsShader = AMDGPU::isShader(CallConv);
2528 
2529  Info->setIfReturnsVoid(Outs.empty());
2530  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2531 
2532  // CCValAssign - represents the assignment of the return value to a location.
2535 
2536  // CCState - Info about the registers and stack slots.
2537  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2538  *DAG.getContext());
2539 
2540  // Analyze outgoing return values.
2541  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2542 
2543  SDValue Flag;
2544  SmallVector<SDValue, 48> RetOps;
2545  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2546 
2547  // Add return address for callable functions.
2548  if (!Info->isEntryFunction()) {
2550  SDValue ReturnAddrReg = CreateLiveInRegister(
2551  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2552 
2553  SDValue ReturnAddrVirtualReg = DAG.getRegister(
2554  MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
2555  MVT::i64);
2556  Chain =
2557  DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
2558  Flag = Chain.getValue(1);
2559  RetOps.push_back(ReturnAddrVirtualReg);
2560  }
2561 
2562  // Copy the result values into the output registers.
2563  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2564  ++I, ++RealRVLocIdx) {
2565  CCValAssign &VA = RVLocs[I];
2566  assert(VA.isRegLoc() && "Can only return in registers!");
2567  // TODO: Partially return in registers if return values don't fit.
2568  SDValue Arg = OutVals[RealRVLocIdx];
2569 
2570  // Copied from other backends.
2571  switch (VA.getLocInfo()) {
2572  case CCValAssign::Full:
2573  break;
2574  case CCValAssign::BCvt:
2575  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2576  break;
2577  case CCValAssign::SExt:
2578  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2579  break;
2580  case CCValAssign::ZExt:
2581  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2582  break;
2583  case CCValAssign::AExt:
2584  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2585  break;
2586  default:
2587  llvm_unreachable("Unknown loc info!");
2588  }
2589 
2590  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2591  Flag = Chain.getValue(1);
2592  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2593  }
2594 
2595  // FIXME: Does sret work properly?
2596  if (!Info->isEntryFunction()) {
2597  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2598  const MCPhysReg *I =
2599  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2600  if (I) {
2601  for (; *I; ++I) {
2602  if (AMDGPU::SReg_64RegClass.contains(*I))
2603  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2604  else if (AMDGPU::SReg_32RegClass.contains(*I))
2605  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2606  else
2607  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2608  }
2609  }
2610  }
2611 
2612  // Update chain and glue.
2613  RetOps[0] = Chain;
2614  if (Flag.getNode())
2615  RetOps.push_back(Flag);
2616 
2617  unsigned Opc = AMDGPUISD::ENDPGM;
2618  if (!IsWaveEnd)
2619  Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2620  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2621 }
2622 
2624  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2625  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2626  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2627  SDValue ThisVal) const {
2628  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2629 
2630  // Assign locations to each value returned by this call.
2632  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2633  *DAG.getContext());
2634  CCInfo.AnalyzeCallResult(Ins, RetCC);
2635 
2636  // Copy all of the result registers out of their specified physreg.
2637  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2638  CCValAssign VA = RVLocs[i];
2639  SDValue Val;
2640 
2641  if (VA.isRegLoc()) {
2642  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2643  Chain = Val.getValue(1);
2644  InFlag = Val.getValue(2);
2645  } else if (VA.isMemLoc()) {
2646  report_fatal_error("TODO: return values in memory");
2647  } else
2648  llvm_unreachable("unknown argument location type");
2649 
2650  switch (VA.getLocInfo()) {
2651  case CCValAssign::Full:
2652  break;
2653  case CCValAssign::BCvt:
2654  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2655  break;
2656  case CCValAssign::ZExt:
2657  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2658  DAG.getValueType(VA.getValVT()));
2659  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2660  break;
2661  case CCValAssign::SExt:
2662  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2663  DAG.getValueType(VA.getValVT()));
2664  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2665  break;
2666  case CCValAssign::AExt:
2667  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2668  break;
2669  default:
2670  llvm_unreachable("Unknown loc info!");
2671  }
2672 
2673  InVals.push_back(Val);
2674  }
2675 
2676  return Chain;
2677 }
2678 
2679 // Add code to pass special inputs required depending on used features separate
2680 // from the explicit user arguments present in the IR.
2682  CallLoweringInfo &CLI,
2683  CCState &CCInfo,
2684  const SIMachineFunctionInfo &Info,
2685  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2686  SmallVectorImpl<SDValue> &MemOpChains,
2687  SDValue Chain) const {
2688  // If we don't have a call site, this was a call inserted by
2689  // legalization. These can never use special inputs.
2690  if (!CLI.CB)
2691  return;
2692 
2693  SelectionDAG &DAG = CLI.DAG;
2694  const SDLoc &DL = CLI.DL;
2695 
2696  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2697  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2698 
2699  const AMDGPUFunctionArgInfo *CalleeArgInfo
2701  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2702  auto &ArgUsageInfo =
2704  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2705  }
2706 
2707  // TODO: Unify with private memory register handling. This is complicated by
2708  // the fact that at least in kernels, the input argument is not necessarily
2709  // in the same location as the input.
2718  };
2719 
2720  for (auto InputID : InputRegs) {
2721  const ArgDescriptor *OutgoingArg;
2722  const TargetRegisterClass *ArgRC;
2723  LLT ArgTy;
2724 
2725  std::tie(OutgoingArg, ArgRC, ArgTy) =
2726  CalleeArgInfo->getPreloadedValue(InputID);
2727  if (!OutgoingArg)
2728  continue;
2729 
2730  const ArgDescriptor *IncomingArg;
2731  const TargetRegisterClass *IncomingArgRC;
2732  LLT Ty;
2733  std::tie(IncomingArg, IncomingArgRC, Ty) =
2734  CallerArgInfo.getPreloadedValue(InputID);
2735  assert(IncomingArgRC == ArgRC);
2736 
2737  // All special arguments are ints for now.
2738  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2739  SDValue InputReg;
2740 
2741  if (IncomingArg) {
2742  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2743  } else {
2744  // The implicit arg ptr is special because it doesn't have a corresponding
2745  // input for kernels, and is computed from the kernarg segment pointer.
2747  InputReg = getImplicitArgPtr(DAG, DL);
2748  }
2749 
2750  if (OutgoingArg->isRegister()) {
2751  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2752  if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2753  report_fatal_error("failed to allocate implicit input argument");
2754  } else {
2755  unsigned SpecialArgOffset =
2756  CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2757  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2758  SpecialArgOffset);
2759  MemOpChains.push_back(ArgStore);
2760  }
2761  }
2762 
2763  // Pack workitem IDs into a single register, or pass them as-is if already
2764  // packed.
2765  const ArgDescriptor *OutgoingArg;
2766  const TargetRegisterClass *ArgRC;
2767  LLT Ty;
2768 
2769  std::tie(OutgoingArg, ArgRC, Ty) =
2771  if (!OutgoingArg)
2772  std::tie(OutgoingArg, ArgRC, Ty) =
2774  if (!OutgoingArg)
2775  std::tie(OutgoingArg, ArgRC, Ty) =
2777  if (!OutgoingArg)
2778  return;
2779 
2780  const ArgDescriptor *IncomingArgX = std::get<0>(
2782  const ArgDescriptor *IncomingArgY = std::get<0>(
2784  const ArgDescriptor *IncomingArgZ = std::get<0>(
2786 
2787  SDValue InputReg;
2788  SDLoc SL;
2789 
2790  // If the incoming IDs are not packed, we need to pack them.
2791  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
2792  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2793 
2794  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
2795  SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2796  Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2797  DAG.getShiftAmountConstant(10, MVT::i32, SL));
2798  InputReg = InputReg.getNode() ?
2799  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2800  }
2801 
2802  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
2803  SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2804  Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2805  DAG.getShiftAmountConstant(20, MVT::i32, SL));
2806  InputReg = InputReg.getNode() ?
2807  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2808  }
2809 
2810  if (!InputReg.getNode()) {
2811  // Workitem IDs are already packed; any present incoming argument will
2812  // carry all the required fields.
2814  IncomingArgX ? *IncomingArgX :
2815  IncomingArgY ? *IncomingArgY :
2816  *IncomingArgZ, ~0u);
2817  InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2818  }
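  // The packed layout assumed above (a minimal sketch; the helper name and
  // the defensive masks are illustrative only): X occupies bits 0-9, Y bits
  // 10-19 and Z bits 20-29 of a single 32-bit input.
  //
  //   static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  //     return (X & 0x3FF) | ((Y & 0x3FF) << 10) | ((Z & 0x3FF) << 20);
  //   }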
2819 
2820  if (OutgoingArg->isRegister()) {
2821  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2822  CCInfo.AllocateReg(OutgoingArg->getRegister());
2823  } else {
2824  unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2825  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2826  SpecialArgOffset);
2827  MemOpChains.push_back(ArgStore);
2828  }
2829 }
2830 
2832  return CC == CallingConv::Fast;
2833 }
2834 
2835 /// Return true if we might ever do TCO for calls with this calling convention.
2837  switch (CC) {
2838  case CallingConv::C:
2840  return true;
2841  default:
2842  return canGuaranteeTCO(CC);
2843  }
2844 }
2845 
2847  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2848  const SmallVectorImpl<ISD::OutputArg> &Outs,
2849  const SmallVectorImpl<SDValue> &OutVals,
2850  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2851  if (!mayTailCallThisCC(CalleeCC))
2852  return false;
2853 
2854  MachineFunction &MF = DAG.getMachineFunction();
2855  const Function &CallerF = MF.getFunction();
2856  CallingConv::ID CallerCC = CallerF.getCallingConv();
2858  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2859 
2860  // Kernels aren't callable and don't have a live-in return address, so it
2861  // doesn't make sense to do a tail call with entry functions.
2862  if (!CallerPreserved)
2863  return false;
2864 
2865  bool CCMatch = CallerCC == CalleeCC;
2866 
2868  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2869  return true;
2870  return false;
2871  }
2872 
2873  // TODO: Can we handle var args?
2874  if (IsVarArg)
2875  return false;
2876 
2877  for (const Argument &Arg : CallerF.args()) {
2878  if (Arg.hasByValAttr())
2879  return false;
2880  }
2881 
2882  LLVMContext &Ctx = *DAG.getContext();
2883 
2884  // Check that the call results are passed in the same way.
2885  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2886  CCAssignFnForCall(CalleeCC, IsVarArg),
2887  CCAssignFnForCall(CallerCC, IsVarArg)))
2888  return false;
2889 
2890  // The callee has to preserve all registers the caller needs to preserve.
2891  if (!CCMatch) {
2892  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2893  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2894  return false;
2895  }
2896 
2897  // Nothing more to check if the callee is taking no arguments.
2898  if (Outs.empty())
2899  return true;
2900 
2902  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2903 
2904  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2905 
2906  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2907  // If the stack arguments for this call do not fit into our own save area,
2908  // then the call cannot be made a tail call.
2909  // TODO: Is this really necessary?
2910  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2911  return false;
2912 
2913  const MachineRegisterInfo &MRI = MF.getRegInfo();
2914  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2915 }
2916 
2918  if (!CI->isTailCall())
2919  return false;
2920 
2921  const Function *ParentFn = CI->getParent()->getParent();
2922  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2923  return false;
2924  return true;
2925 }
2926 
2927 // The wave scratch offset register is used as the global base pointer.
2929  SmallVectorImpl<SDValue> &InVals) const {
2930  SelectionDAG &DAG = CLI.DAG;
2931  const SDLoc &DL = CLI.DL;
2933  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2935  SDValue Chain = CLI.Chain;
2936  SDValue Callee = CLI.Callee;
2937  bool &IsTailCall = CLI.IsTailCall;
2938  CallingConv::ID CallConv = CLI.CallConv;
2939  bool IsVarArg = CLI.IsVarArg;
2940  bool IsSibCall = false;
2941  bool IsThisReturn = false;
2942  MachineFunction &MF = DAG.getMachineFunction();
2943 
2944  if (Callee.isUndef() || isNullConstant(Callee)) {
2945  if (!CLI.IsTailCall) {
2946  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
2947  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
2948  }
2949 
2950  return Chain;
2951  }
2952 
2953  if (IsVarArg) {
2954  return lowerUnhandledCall(CLI, InVals,
2955  "unsupported call to variadic function ");
2956  }
2957 
2958  if (!CLI.CB)
2959  report_fatal_error("unsupported libcall legalization");
2960 
2961  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2962  return lowerUnhandledCall(CLI, InVals,
2963  "unsupported required tail call to function ");
2964  }
2965 
2966  if (AMDGPU::isShader(CallConv)) {
2967  // Note the issue is with the CC of the called function, not of the call
2968  // itself.
2969  return lowerUnhandledCall(CLI, InVals,
2970  "unsupported call to a shader function ");
2971  }
2972 
2974  CallConv != CallingConv::AMDGPU_Gfx) {
2975  // Only allow calls with specific calling conventions.
2976  return lowerUnhandledCall(CLI, InVals,
2977  "unsupported calling convention for call from "
2978  "graphics shader of function ");
2979  }
2980 
2981  if (IsTailCall) {
2982  IsTailCall = isEligibleForTailCallOptimization(
2983  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2984  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
2985  report_fatal_error("failed to perform tail call elimination on a call "
2986  "site marked musttail");
2987  }
2988 
2989  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2990 
2991  // A sibling call is one where we're under the usual C ABI and not planning
2992  // to change that, but can still do a tail call.
2993  if (!TailCallOpt && IsTailCall)
2994  IsSibCall = true;
2995 
2996  if (IsTailCall)
2997  ++NumTailCalls;
2998  }
2999 
3002  SmallVector<SDValue, 8> MemOpChains;
3003 
3004  // Analyze operands of the call, assigning locations to each operand.
3006  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3007  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3008 
3010  CallConv != CallingConv::AMDGPU_Gfx) {
3011  // With a fixed ABI, allocate fixed registers before user arguments.
3012  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3013  }
3014 
3015  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3016 
3017  // Get a count of how many bytes are to be pushed on the stack.
3018  unsigned NumBytes = CCInfo.getNextStackOffset();
3019 
3020  if (IsSibCall) {
3021  // Since we're not changing the ABI to make this a tail call, the memory
3022  // operands are already available in the caller's incoming argument space.
3023  NumBytes = 0;
3024  }
3025 
3026  // FPDiff is the byte offset of the call's argument area from the callee's.
3027  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3028  // by this amount for a tail call. In a sibling call it must be 0 because the
3029  // caller will deallocate the entire stack and the callee still expects its
3030  // arguments to begin at SP+0. Completely unused for non-tail calls.
3031  int32_t FPDiff = 0;
3032  MachineFrameInfo &MFI = MF.getFrameInfo();
3033 
3034  // Adjust the stack pointer for the new arguments...
3035  // These operations are automatically eliminated by the prolog/epilog pass
3036  if (!IsSibCall) {
3037  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3038 
3039  if (!Subtarget->enableFlatScratch()) {
3040  SmallVector<SDValue, 4> CopyFromChains;
3041 
3042  // In the HSA case, this should be an identity copy.
3043  SDValue ScratchRSrcReg
3044  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3045  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3046  CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3047  Chain = DAG.getTokenFactor(DL, CopyFromChains);
3048  }
3049  }
3050 
3051  MVT PtrVT = MVT::i32;
3052 
3053  // Walk the register/memloc assignments, inserting copies/loads.
3054  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3055  CCValAssign &VA = ArgLocs[i];
3056  SDValue Arg = OutVals[i];
3057 
3058  // Promote the value if needed.
3059  switch (VA.getLocInfo()) {
3060  case CCValAssign::Full:
3061  break;
3062  case CCValAssign::BCvt:
3063  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3064  break;
3065  case CCValAssign::ZExt:
3066  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3067  break;
3068  case CCValAssign::SExt:
3069  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3070  break;
3071  case CCValAssign::AExt:
3072  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3073  break;
3074  case CCValAssign::FPExt:
3075  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3076  break;
3077  default:
3078  llvm_unreachable("Unknown loc info!");
3079  }
3080 
3081  if (VA.isRegLoc()) {
3082  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3083  } else {
3084  assert(VA.isMemLoc());
3085 
3086  SDValue DstAddr;
3087  MachinePointerInfo DstInfo;
3088 
3089  unsigned LocMemOffset = VA.getLocMemOffset();
3090  int32_t Offset = LocMemOffset;
3091 
3092  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3093  MaybeAlign Alignment;
3094 
3095  if (IsTailCall) {
3096  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3097  unsigned OpSize = Flags.isByVal() ?
3098  Flags.getByValSize() : VA.getValVT().getStoreSize();
3099 
3100  // FIXME: We can have better than the minimum byval required alignment.
3101  Alignment =
3102  Flags.isByVal()
3103  ? Flags.getNonZeroByValAlign()
3104  : commonAlignment(Subtarget->getStackAlignment(), Offset);
3105 
3106  Offset = Offset + FPDiff;
3107  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3108 
3109  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3110  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3111 
3112  // Make sure any stack arguments overlapping with where we're storing
3113  // are loaded before this eventual operation. Otherwise they'll be
3114  // clobbered.
3115 
3116  // FIXME: Why is this really necessary? This seems to just result in a
3117  // lot of code to copy the stack arguments and write them back to the same
3118  // locations, which are supposed to be immutable?
3119  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3120  } else {
3121  DstAddr = PtrOff;
3122  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3123  Alignment =
3124  commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3125  }
3126 
3127  if (Outs[i].Flags.isByVal()) {
3128  SDValue SizeNode =
3129  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3130  SDValue Cpy =
3131  DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3132  Outs[i].Flags.getNonZeroByValAlign(),
3133  /*isVol = */ false, /*AlwaysInline = */ true,
3134  /*isTailCall = */ false, DstInfo,
3136 
3137  MemOpChains.push_back(Cpy);
3138  } else {
3139  SDValue Store =
3140  DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3141  MemOpChains.push_back(Store);
3142  }
3143  }
3144  }
3145 
3147  CallConv != CallingConv::AMDGPU_Gfx) {
3148  // Copy special input registers after user input arguments.
3149  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3150  }
3151 
3152  if (!MemOpChains.empty())
3153  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3154 
3155  // Build a sequence of copy-to-reg nodes chained together with token chain
3156  // and flag operands which copy the outgoing args into the appropriate regs.
3157  SDValue InFlag;
3158  for (auto &RegToPass : RegsToPass) {
3159  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3160  RegToPass.second, InFlag);
3161  InFlag = Chain.getValue(1);
3162  }
3163 
3164 
3165  SDValue PhysReturnAddrReg;
3166  if (IsTailCall) {
3167  // Since the return is being combined with the call, we need to pass on the
3168  // return address.
3169 
3171  SDValue ReturnAddrReg = CreateLiveInRegister(
3172  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
3173 
3174  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
3175  MVT::i64);
3176  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
3177  InFlag = Chain.getValue(1);
3178  }
3179 
3180  // We don't usually want to end the call-sequence here because we would tidy
3181  // the frame up *after* the call; however, in the ABI-changing tail-call case
3182  // we've carefully laid out the parameters so that when sp is reset they'll be
3183  // in the correct location.
3184  if (IsTailCall && !IsSibCall) {
3185  Chain = DAG.getCALLSEQ_END(Chain,
3186  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
3187  DAG.getTargetConstant(0, DL, MVT::i32),
3188  InFlag, DL);
3189  InFlag = Chain.getValue(1);
3190  }
3191 
3192  std::vector<SDValue> Ops;
3193  Ops.push_back(Chain);
3194  Ops.push_back(Callee);
3195  // Add a redundant copy of the callee global which will not be legalized, as
3196  // we need direct access to the callee later.
3197  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3198  const GlobalValue *GV = GSD->getGlobal();
3199  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3200  } else {
3201  Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3202  }
3203 
3204  if (IsTailCall) {
3205  // Each tail call may have to adjust the stack by a different amount, so
3206  // this information must travel along with the operation for eventual
3207  // consumption by emitEpilogue.
3208  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3209 
3210  Ops.push_back(PhysReturnAddrReg);
3211  }
3212 
3213  // Add argument registers to the end of the list so that they are known live
3214  // into the call.
3215  for (auto &RegToPass : RegsToPass) {
3216  Ops.push_back(DAG.getRegister(RegToPass.first,
3217  RegToPass.second.getValueType()));
3218  }
3219 
3220  // Add a register mask operand representing the call-preserved registers.
3221 
3222  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3223  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3224  assert(Mask && "Missing call preserved mask for calling convention");
3225  Ops.push_back(DAG.getRegisterMask(Mask));
3226 
3227  if (InFlag.getNode())
3228  Ops.push_back(InFlag);
3229 
3230  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3231 
3232  // If we're doing a tail call, use a TC_RETURN here rather than an
3233  // actual call instruction.
3234  if (IsTailCall) {
3235  MFI.setHasTailCall();
3236  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3237  }
3238 
3239  // Returns a chain and a flag for retval copy to use.
3240  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3241  Chain = Call.getValue(0);
3242  InFlag = Call.getValue(1);
3243 
3244  uint64_t CalleePopBytes = NumBytes;
3245  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
3246  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
3247  InFlag, DL);
3248  if (!Ins.empty())
3249  InFlag = Chain.getValue(1);
3250 
3251  // Handle result values, copying them out of physregs into vregs that we
3252  // return.
3253  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3254  InVals, IsThisReturn,
3255  IsThisReturn ? OutVals[0] : SDValue());
3256 }
3257 
3258 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3259 // except for applying the wave size scale to the increment amount.
3261  SDValue Op, SelectionDAG &DAG) const {
3262  const MachineFunction &MF = DAG.getMachineFunction();
3264 
3265  SDLoc dl(Op);
3266  EVT VT = Op.getValueType();
3267  SDValue Tmp1 = Op;
3268  SDValue Tmp2 = Op.getValue(1);
3269  SDValue Tmp3 = Op.getOperand(2);
3270  SDValue Chain = Tmp1.getOperand(0);
3271 
3272  Register SPReg = Info->getStackPtrOffsetReg();
3273 
3274  // Chain the dynamic stack allocation so that it doesn't modify the stack
3275  // pointer when other instructions are using the stack.
3276  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3277 
3278  SDValue Size = Tmp2.getOperand(1);
3279  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3280  Chain = SP.getValue(1);
3281  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3282  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3283  const TargetFrameLowering *TFL = ST.getFrameLowering();
3284  unsigned Opc =
3285  TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3286  ISD::ADD : ISD::SUB;
3287 
3288  SDValue ScaledSize = DAG.getNode(
3289  ISD::SHL, dl, VT, Size,
3290  DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3291 
3292  Align StackAlign = TFL->getStackAlign();
3293  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3294  if (Alignment && *Alignment > StackAlign) {
3295  Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3296  DAG.getConstant(-(uint64_t)Alignment->value()
3297  << ST.getWavefrontSizeLog2(),
3298  dl, VT));
3299  }
3300 
3301  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3302  Tmp2 = DAG.getCALLSEQ_END(
3303  Chain, DAG.getIntPtrConstant(0, dl, true),
3304  DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
3305 
3306  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3307 }
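// Worked example (hypothetical numbers): on a wave64 subtarget,
// getWavefrontSizeLog2() == 6, so a per-lane "alloca i8, i32 16" requests
// 16 << 6 == 1024 bytes of scratch; both the SP bump and the optional
// over-alignment mask above operate on this wave-scaled value.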
3308 
3310  SelectionDAG &DAG) const {
3311  // We only handle constant sizes here to allow non-entry block, static sized
3312  // allocas. A truly dynamic value is more difficult to support because we
3313  // don't know if the size value is uniform or not. If the size isn't uniform,
3314  // we would need to do a wave reduction to get the maximum size to know how
3315  // much to increment the uniform stack pointer.
3316  SDValue Size = Op.getOperand(1);
3317  if (isa<ConstantSDNode>(Size))
3318  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3319 
3321 }
3322 
3324  const MachineFunction &MF) const {
3326  .Case("m0", AMDGPU::M0)
3327  .Case("exec", AMDGPU::EXEC)
3328  .Case("exec_lo", AMDGPU::EXEC_LO)
3329  .Case("exec_hi", AMDGPU::EXEC_HI)
3330  .Case("flat_scratch", AMDGPU::FLAT_SCR)
3331  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3332  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3333  .Default(Register());
3334 
3335  if (Reg == AMDGPU::NoRegister) {
3336  report_fatal_error(Twine("invalid register name \""
3337  + StringRef(RegName) + "\"."));
3338 
3339  }
3340 
3341  if (!Subtarget->hasFlatScrRegister() &&
3342  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3343  report_fatal_error(Twine("invalid register \""
3344  + StringRef(RegName) + "\" for subtarget."));
3345  }
3346 
3347  switch (Reg) {
3348  case AMDGPU::M0:
3349  case AMDGPU::EXEC_LO:
3350  case AMDGPU::EXEC_HI:
3351  case AMDGPU::FLAT_SCR_LO:
3352  case AMDGPU::FLAT_SCR_HI:
3353  if (VT.getSizeInBits() == 32)
3354  return Reg;
3355  break;
3356  case AMDGPU::EXEC:
3357  case AMDGPU::FLAT_SCR:
3358  if (VT.getSizeInBits() == 64)
3359  return Reg;
3360  break;
3361  default:
3362  llvm_unreachable("missing register type checking");
3363  }
3364 
3365  report_fatal_error(Twine("invalid type for register \""
3366  + StringRef(RegName) + "\"."));
3367 }
3368 
3369 // If kill is not the last instruction, split the block so kill is always a
3370 // proper terminator.
3373  MachineBasicBlock *BB) const {
3374  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3375  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3376  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3377  return SplitBB;
3378 }
3379 
3380 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3381 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3382 // be the first instruction in the remainder block.
3383 //
3384 /// \returns { LoopBody, Remainder }
3385 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3387  MachineFunction *MF = MBB.getParent();
3389 
3390  // To insert the loop we need to split the block. Move everything after this
3391  // point to a new block, and insert a new empty block between the two.
3393  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3395  ++MBBI;
3396 
3397  MF->insert(MBBI, LoopBB);
3398  MF->insert(MBBI, RemainderBB);
3399 
3400  LoopBB->addSuccessor(LoopBB);
3401  LoopBB->addSuccessor(RemainderBB);
3402 
3403  // Move the rest of the block into a new block.
3404  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3405 
3406  if (InstInLoop) {
3407  auto Next = std::next(I);
3408 
3409  // Move instruction to loop body.
3410  LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3411 
3412  // Move the rest of the block.
3413  RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3414  } else {
3415  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3416  }
3417 
3418  MBB.addSuccessor(LoopBB);
3419 
3420  return std::make_pair(LoopBB, RemainderBB);
3421 }
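// Typical use (a sketch mirroring the callers below):
//
//   MachineBasicBlock *LoopBB, *RemainderBB;
//   std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, /*InstInLoop=*/true);
//   // ... build the loop body in LoopBB, ending with a conditional branch
//   // back to LoopBB; execution continues in RemainderBB afterwards.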
3422 
3423 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3425  MachineBasicBlock *MBB = MI.getParent();
3426  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3427  auto I = MI.getIterator();
3428  auto E = std::next(I);
3429 
3430  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3431  .addImm(0);
3432 
3433  MIBundleBuilder Bundler(*MBB, I, E);
3434  finalizeBundle(*MBB, Bundler.begin());
3435 }
3436 
3439  MachineBasicBlock *BB) const {
3440  const DebugLoc &DL = MI.getDebugLoc();
3441 
3442  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3443 
3444  MachineBasicBlock *LoopBB;
3445  MachineBasicBlock *RemainderBB;
3446  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3447 
3448  // Apparently kill flags are only valid if the def is in the same block?
3449  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3450  Src->setIsKill(false);
3451 
3452  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3453 
3454  MachineBasicBlock::iterator I = LoopBB->end();
3455 
3456  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3458 
3459  // Clear TRAP_STS.MEM_VIOL
3460  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3461  .addImm(0)
3462  .addImm(EncodedReg);
3463 
3465 
3466  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3467 
3468  // Load and check TRAP_STS.MEM_VIOL
3469  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3470  .addImm(EncodedReg);
3471 
3472  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3473  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3475  .addImm(0);
3476  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3477  .addMBB(LoopBB);
3478 
3479  return RemainderBB;
3480 }
3481 
3482 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3483 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3484 // will only do one iteration. In the worst case, this will loop 64 times.
3485 //
3486 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
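// In scalar pseudocode, the loop emitted below behaves roughly as follows
// (a sketch only; names are illustrative):
//
//   savedExec = EXEC;
//   do {
//     idx   = readfirstlane(IdxReg);   // pick one candidate index (uniform)
//     live  = EXEC;
//     EXEC &= (IdxReg == idx);         // lanes that use this index
//     M0/SGPRIdxReg = idx + Offset;
//     <indexed V_MOVREL for the active lanes>
//     EXEC = live ^ EXEC;              // retire the lanes just handled
//   } while (EXEC != 0);
//   EXEC = savedExec;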
3489  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3490  const DebugLoc &DL, const MachineOperand &Idx,
3491  unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3492  unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3493  Register &SGPRIdxReg) {
3494 
3495  MachineFunction *MF = OrigBB.getParent();
3496  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3497  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3498  MachineBasicBlock::iterator I = LoopBB.begin();
3499 
3500  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3501  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3502  Register NewExec = MRI.createVirtualRegister(BoolRC);
3503  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3504  Register CondReg = MRI.createVirtualRegister(BoolRC);
3505 
3506  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3507  .addReg(InitReg)
3508  .addMBB(&OrigBB)
3509  .addReg(ResultReg)
3510  .addMBB(&LoopBB);
3511 
3512  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3513  .addReg(InitSaveExecReg)
3514  .addMBB(&OrigBB)
3515  .addReg(NewExec)
3516  .addMBB(&LoopBB);
3517 
3518  // Read the next variant <- also loop target.
3519  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3520  .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3521 
3522  // Compare the just read M0 value to all possible Idx values.
3523  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3524  .addReg(CurrentIdxReg)
3525  .addReg(Idx.getReg(), 0, Idx.getSubReg());
3526 
3527  // Update EXEC, save the original EXEC value to VCC.
3528  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3529  : AMDGPU::S_AND_SAVEEXEC_B64),
3530  NewExec)
3531  .addReg(CondReg, RegState::Kill);
3532 
3533  MRI.setSimpleHint(NewExec, CondReg);
3534 
3535  if (UseGPRIdxMode) {
3536  if (Offset == 0) {
3537  SGPRIdxReg = CurrentIdxReg;
3538  } else {
3539  SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3540  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3541  .addReg(CurrentIdxReg, RegState::Kill)
3542  .addImm(Offset);
3543  }
3544  } else {
3545  // Move index from VCC into M0
3546  if (Offset == 0) {
3547  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3548  .addReg(CurrentIdxReg, RegState::Kill);
3549  } else {
3550  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3551  .addReg(CurrentIdxReg, RegState::Kill)
3552  .addImm(Offset);
3553  }
3554  }
3555 
3556  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3557  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3558  MachineInstr *InsertPt =
3559  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3560  : AMDGPU::S_XOR_B64_term), Exec)
3561  .addReg(Exec)
3562  .addReg(NewExec);
3563 
3564  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3565  // s_cbranch_scc0?
3566 
3567  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3568  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3569  .addMBB(&LoopBB);
3570 
3571  return InsertPt->getIterator();
3572 }
3573 
3574 // This has slightly sub-optimal regalloc when the source vector is killed by
3575 // the read. The register allocator does not understand that the kill is
3576 // per-workitem, so the source vector is kept alive for the whole loop and we
3577 // end up not reusing a subregister from it, using one more VGPR than necessary.
3578 // This extra VGPR was saved when this was expanded after register allocation.
3581  unsigned InitResultReg, unsigned PhiReg, int Offset,
3582  bool UseGPRIdxMode, Register &SGPRIdxReg) {
3583  MachineFunction *MF = MBB.getParent();
3584  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3585  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3587  const DebugLoc &DL = MI.getDebugLoc();
3589 
3590  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3591  Register DstReg = MI.getOperand(0).getReg();
3592  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3593  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3594  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3595  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3596 
3597  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3598 
3599  // Save the EXEC mask
3600  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3601  .addReg(Exec);
3602 
3603  MachineBasicBlock *LoopBB;
3604  MachineBasicBlock *RemainderBB;
3605  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3606 
3607  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3608 
3609  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3610  InitResultReg, DstReg, PhiReg, TmpExec,
3611  Offset, UseGPRIdxMode, SGPRIdxReg);
3612 
3613  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3615  ++MBBI;
3616  MF->insert(MBBI, LandingPad);
3617  LoopBB->removeSuccessor(RemainderBB);
3618  LandingPad->addSuccessor(RemainderBB);
3619  LoopBB->addSuccessor(LandingPad);
3620  MachineBasicBlock::iterator First = LandingPad->begin();
3621  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3622  .addReg(SaveExec);
3623 
3624  return InsPt;
3625 }
3626 
3627 // Returns subreg index, offset
3628 static std::pair<unsigned, int>
3630  const TargetRegisterClass *SuperRC,
3631  unsigned VecReg,
3632  int Offset) {
3633  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3634 
3635  // Skip out of bounds offsets, or else we would end up using an undefined
3636  // register.
3637  if (Offset >= NumElts || Offset < 0)
3638  return std::make_pair(AMDGPU::sub0, Offset);
3639 
3640  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3641 }
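// Illustrative results, assuming a 128-bit (4 x 32-bit) super-register class:
//
//   computeIndirectRegAndOffset(TRI, &AMDGPU::VReg_128RegClass, Reg, 2)
//       == {AMDGPU::sub2, 0}   // statically in range: fold into the subreg
//   computeIndirectRegAndOffset(TRI, &AMDGPU::VReg_128RegClass, Reg, 7)
//       == {AMDGPU::sub0, 7}   // out of range: keep the dynamic offset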
3642 
3645  int Offset) {
3646  MachineBasicBlock *MBB = MI.getParent();
3647  const DebugLoc &DL = MI.getDebugLoc();
3649 
3650  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3651 
3652  assert(Idx->getReg() != AMDGPU::NoRegister);
3653 
3654  if (Offset == 0) {
3655  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3656  } else {
3657  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3658  .add(*Idx)
3659  .addImm(Offset);
3660  }
3661 }
3662 
3665  int Offset) {
3666  MachineBasicBlock *MBB = MI.getParent();
3667  const DebugLoc &DL = MI.getDebugLoc();
3669 
3670  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3671 
3672  if (Offset == 0)
3673  return Idx->getReg();
3674 
3675  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3676  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3677  .add(*Idx)
3678  .addImm(Offset);
3679  return Tmp;
3680 }
3681 
3684  const GCNSubtarget &ST) {
3685  const SIInstrInfo *TII = ST.getInstrInfo();
3686  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3687  MachineFunction *MF = MBB.getParent();
3689 
3690  Register Dst = MI.getOperand(0).getReg();
3691  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3692  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3693  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3694 
3695  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3696  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3697 
3698  unsigned SubReg;
3699  std::tie(SubReg, Offset)
3700  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3701 
3702  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3703 
3704  // Check for a SGPR index.
3705  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3707  const DebugLoc &DL = MI.getDebugLoc();
3708 
3709  if (UseGPRIdxMode) {
3710  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3711  // to avoid interfering with other uses, so probably requires a new
3712  // optimization pass.
3714 
3715  const MCInstrDesc &GPRIDXDesc =
3716  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3717  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3718  .addReg(SrcReg)
3719  .addReg(Idx)
3720  .addImm(SubReg);
3721  } else {
3723 
3724  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3725  .addReg(SrcReg, 0, SubReg)
3726  .addReg(SrcReg, RegState::Implicit);
3727  }
3728 
3729  MI.eraseFromParent();
3730 
3731  return &MBB;
3732  }
3733 
3734  // Control flow needs to be inserted if indexing with a VGPR.
3735  const DebugLoc &DL = MI.getDebugLoc();
3737 
3738  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3739  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3740 
3741  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3742 
3743  Register SGPRIdxReg;
3744  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3745  UseGPRIdxMode, SGPRIdxReg);
3746 
3747  MachineBasicBlock *LoopBB = InsPt->getParent();
3748 
3749  if (UseGPRIdxMode) {
3750  const MCInstrDesc &GPRIDXDesc =
3751  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3752 
3753  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3754  .addReg(SrcReg)
3755  .addReg(SGPRIdxReg)
3756  .addImm(SubReg);
3757  } else {
3758  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3759  .addReg(SrcReg, 0, SubReg)
3760  .addReg(SrcReg, RegState::Implicit);
3761  }
3762 
3763  MI.eraseFromParent();
3764 
3765  return LoopBB;
3766 }
3767 
3770  const GCNSubtarget &ST) {
3771  const SIInstrInfo *TII = ST.getInstrInfo();
3772  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3773  MachineFunction *MF = MBB.getParent();
3775 
3776  Register Dst = MI.getOperand(0).getReg();
3777  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3778  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3779  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3780  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3781  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3782  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3783 
3784  // This can be an immediate, but will be folded later.
3785  assert(Val->getReg());
3786 
3787  unsigned SubReg;
3788  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3789  SrcVec->getReg(),
3790  Offset);
3791  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3792 
3793  if (Idx->getReg() == AMDGPU::NoRegister) {
3795  const DebugLoc &DL = MI.getDebugLoc();
3796 
3797  assert(Offset == 0);
3798 
3799  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3800  .add(*SrcVec)
3801  .add(*Val)
3802  .addImm(SubReg);
3803 
3804  MI.eraseFromParent();
3805  return &MBB;
3806  }
3807 
3808  // Check for a SGPR index.
3809  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3811  const DebugLoc &DL = MI.getDebugLoc();
3812 
3813  if (UseGPRIdxMode) {
3815 
3816  const MCInstrDesc &GPRIDXDesc =
3817  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3818  BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3819  .addReg(SrcVec->getReg())
3820  .add(*Val)
3821  .addReg(Idx)
3822  .addImm(SubReg);
3823  } else {
3825 
3826  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3827  TRI.getRegSizeInBits(*VecRC), 32, false);
3828  BuildMI(MBB, I, DL, MovRelDesc, Dst)
3829  .addReg(SrcVec->getReg())
3830  .add(*Val)
3831  .addImm(SubReg);
3832  }
3833  MI.eraseFromParent();
3834  return &MBB;
3835  }
3836 
3837  // Control flow needs to be inserted if indexing with a VGPR.
3838  if (Val->isReg())
3839  MRI.clearKillFlags(Val->getReg());
3840 
3841  const DebugLoc &DL = MI.getDebugLoc();
3842 
3843  Register PhiReg = MRI.createVirtualRegister(VecRC);
3844 
3845  Register SGPRIdxReg;
3846  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3847  UseGPRIdxMode, SGPRIdxReg);
3848  MachineBasicBlock *LoopBB = InsPt->getParent();
3849 
3850  if (UseGPRIdxMode) {
3851  const MCInstrDesc &GPRIDXDesc =
3852  TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3853 
3854  BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3855  .addReg(PhiReg)
3856  .add(*Val)
3857  .addReg(SGPRIdxReg)
3858  .addImm(AMDGPU::sub0);
3859  } else {
3860  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3861  TRI.getRegSizeInBits(*VecRC), 32, false);
3862  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3863  .addReg(PhiReg)
3864  .add(*Val)
3865  .addImm(AMDGPU::sub0);
3866  }
3867 
3868  MI.eraseFromParent();
3869  return LoopBB;
3870 }
3871 
3873  MachineInstr &MI, MachineBasicBlock *BB) const {
3874 
3875  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3876  MachineFunction *MF = BB->getParent();
3878 
3879  switch (MI.getOpcode()) {
3880  case AMDGPU::S_UADDO_PSEUDO:
3881  case AMDGPU::S_USUBO_PSEUDO: {
3882  const DebugLoc &DL = MI.getDebugLoc();
3883  MachineOperand &Dest0 = MI.getOperand(0);
3884  MachineOperand &Dest1 = MI.getOperand(1);
3885  MachineOperand &Src0 = MI.getOperand(2);
3886  MachineOperand &Src1 = MI.getOperand(3);
3887 
3888  unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3889  ? AMDGPU::S_ADD_I32
3890  : AMDGPU::S_SUB_I32;
3891  BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3892 
3893  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
3894  .addImm(1)
3895  .addImm(0);
3896 
3897  MI.eraseFromParent();
3898  return BB;
3899  }
3900  case AMDGPU::S_ADD_U64_PSEUDO:
3901  case AMDGPU::S_SUB_U64_PSEUDO: {
3902  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3903  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3904  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3905  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3906  const DebugLoc &DL = MI.getDebugLoc();
3907 
3908  MachineOperand &Dest = MI.getOperand(0);
3909  MachineOperand &Src0 = MI.getOperand(1);
3910  MachineOperand &Src1 = MI.getOperand(2);
3911 
3912  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3913  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3914 
3915  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
3916  MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3917  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
3918  MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3919 
3920  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
3921  MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3922  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
3923  MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3924 
3925  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3926 
3927  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3928  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3929  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
3930  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
3931  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3932  .addReg(DestSub0)
3933  .addImm(AMDGPU::sub0)
3934  .addReg(DestSub1)
3935  .addImm(AMDGPU::sub1);
3936  MI.eraseFromParent();
3937  return BB;
3938  }
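  // Worked example (hypothetical values): expanding a 64-bit add such as
  // 0x00000001_FFFFFFFF + 0x00000000_00000001 produces
  //
  //   s_add_u32  lo, 0xFFFFFFFF, 0x00000001   ; lo = 0, SCC (carry) = 1
  //   s_addc_u32 hi, 0x00000001, 0x00000000   ; hi = 1 + 0 + SCC = 2
  //
  // giving 0x00000002_00000000, with the two halves glued back together by
  // the REG_SEQUENCE above.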
3939  case AMDGPU::V_ADD_U64_PSEUDO:
3940  case AMDGPU::V_SUB_U64_PSEUDO: {
3941  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3942  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3943  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3944  const DebugLoc &DL = MI.getDebugLoc();
3945 
3946  bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
3947 
3948  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3949 
3950  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3951  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3952 
3953  Register CarryReg = MRI.createVirtualRegister(CarryRC);
3954  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
3955 
3956  MachineOperand &Dest = MI.getOperand(0);
3957  MachineOperand &Src0 = MI.getOperand(1);
3958  MachineOperand &Src1 = MI.getOperand(2);
3959 
3960  const TargetRegisterClass *Src0RC = Src0.isReg()
3961  ? MRI.getRegClass(Src0.getReg())
3962  : &AMDGPU::VReg_64RegClass;
3963  const TargetRegisterClass *Src1RC = Src1.isReg()
3964  ? MRI.getRegClass(Src1.getReg())
3965  : &AMDGPU::VReg_64RegClass;
3966 
3967  const TargetRegisterClass *Src0SubRC =
3968  TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
3969  const TargetRegisterClass *Src1SubRC =
3970  TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
3971 
3972  MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
3973  MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
3974  MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
3975  MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
3976 
3977  MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
3978  MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
3979  MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
3980  MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
3981 
3982  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
3983  MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3984  .addReg(CarryReg, RegState::Define)
3985  .add(SrcReg0Sub0)
3986  .add(SrcReg1Sub0)
3987  .addImm(0); // clamp bit
3988 
3989  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
3990  MachineInstr *HiHalf =
3991  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3992  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
3993  .add(SrcReg0Sub1)
3994  .