1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
228 // TODO: Could make these legal
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237
239 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
240 }
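// Illustrative sketch of the promotions above (an assumption, not stated in
// this file): an (fadd bf16 %a, %b) is legalized roughly as
//   %af = fp_extend %a to f32; %bf = fp_extend %b to f32
//   %rf = fadd f32 %af, %bf;   %r  = fp_round %rf to bf16
// i.e. bf16 arithmetic is carried out in f32 and rounded back afterwards.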
241
242 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
243 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
248 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
253 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
258
259 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
260 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
261 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
264 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
265 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
266
267 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
268
272 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
273
274 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
275
277 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
278
280 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
281 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
282
284 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
285 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
286 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 Expand);
289 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
290 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
291 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
292 Expand);
293
295 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
296 MVT::v3i16, MVT::v4i16, MVT::Other},
297 Custom);
298
301 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
302
304
306
308 Expand);
309
310#if 0
312#endif
313
314 // We only support LOAD/STORE and vector manipulation ops for vectors
315 // with > 4 elements.
316 for (MVT VT :
317 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
318 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
319 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
320 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
321 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
322 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
323 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
324 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
325 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
326 switch (Op) {
327 case ISD::LOAD:
328 case ISD::STORE:
330 case ISD::BITCAST:
331 case ISD::UNDEF:
335 case ISD::IS_FPCLASS:
336 break;
341 break;
342 default:
344 break;
345 }
346 }
347 }
348
350
351 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
352 // is expanded to avoid having two separate loops in case the index is a VGPR.
353
354 // Most operations are naturally 32-bit vector operations. We only support
355 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
356 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
358 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
368 }
369
370 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
382 }
383
384 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
396 }
397
398 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
410 }
411
412 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
424 }
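// Illustrative sketch (assumption): with the Promote actions above, e.g. a
// v2i64 BUILD_VECTOR is rewritten in terms of the bitcast-equivalent v4i32
// operation, so 64-bit element vectors only ever require 32-bit vector
// support from the rest of the backend.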
425
427 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
428 Expand);
429
430 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
431 Custom);
432
433 // Avoid stack access for these.
434 // TODO: Generalize to more vector types.
436 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
437 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
438 Custom);
439
440 // Deal with vec3 vector operations when widened to vec4.
442 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
443
444 // Deal with vec5/6/7 vector operations when widened to vec8.
446 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
452 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
453 // and output demarshalling
454 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
455
456 // We can't return success/failure, only the old value,
457 // let LLVM add the comparison
459 Expand);
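// Illustrative expansion (sketch): a compare-and-swap that also needs the
// success flag,
//   {%old, %ok} = atomic_cmp_swap_with_success %ptr, %cmp, %new
// is expanded to
//   %old = atomic_cmp_swap %ptr, %cmp, %new
//   %ok  = setcc eq %old, %cmp
// since the hardware instruction only returns the old value.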
460
461 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
462
463 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
464
465 // FIXME: This should be narrowed to i32, but that only happens if i64 is
466 // illegal.
467 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
468 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
469
470 // This is s_memtime on SI and s_memrealtime on VI.
472
473 if (Subtarget->hasSMemRealTime() ||
477
478 if (Subtarget->has16BitInsts()) {
481 } else {
483 }
484
485 if (Subtarget->hasMadMacF32Insts())
487
488 if (!Subtarget->hasBFI())
489 // fcopysign can be done in a single instruction with BFI.
490 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
491
492 if (!Subtarget->hasBCNT(32))
494
495 if (!Subtarget->hasBCNT(64))
497
498 if (Subtarget->hasFFBH())
500
501 if (Subtarget->hasFFBL())
503
504 // We only really have 32-bit BFE instructions (and 16-bit on VI).
505 //
506 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
507 // effort to match them now. We want this to be false for i64 cases when the
508 // extraction isn't restricted to the upper or lower half. Ideally we would
509 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
510 // span the midpoint are probably relatively rare, so don't worry about them
511 // for now.
512 if (Subtarget->hasBFE())
514
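// Illustrative example for the 32-bit BFE case above (assumed): a bitfield
// pattern such as (and (srl %x, 16), 0xff) can be selected to a single
// s_bfe_u32 / v_bfe_u32 instead of separate shift and mask instructions.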
515 // Clamp modifier on add/sub
516 if (Subtarget->hasIntClamp())
518
519 if (Subtarget->hasAddNoCarry())
520 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
521 Legal);
522
523 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
524 Custom);
525
526 // These are really only legal for ieee_mode functions. We should be avoiding
527 // them for functions that don't have ieee_mode enabled, so just say they are
528 // legal.
530 {MVT::f32, MVT::f64}, Legal);
531
532 if (Subtarget->haveRoundOpsF64())
534 Legal);
535 else
537 MVT::f64, Custom);
538
540 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
541 Legal);
542 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
543
546
547 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
548 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
549
550 // Custom lower these because we can't specify a rule based on an illegal
551 // source bf16.
554
555 if (Subtarget->has16BitInsts()) {
558 MVT::i16, Legal);
559
560 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
561
563 MVT::i16, Expand);
564
568 ISD::CTPOP},
569 MVT::i16, Promote);
570
572
573 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
574
576 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
578 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
579
583
585
586 // F16 - Constant Actions.
589
590 // F16 - Load/Store Actions.
592 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
594 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
595
596 // BF16 - Load/Store Actions.
598 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
600 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
601
602 // F16 - VOP1 Actions.
605 MVT::f16, Custom);
606
609
610 // F16 - VOP2 Actions.
611 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
612 Expand);
616
617 // F16 - VOP3 Actions.
619 if (STI.hasMadF16())
621
622 for (MVT VT :
623 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
624 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
625 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
626 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
627 switch (Op) {
628 case ISD::LOAD:
629 case ISD::STORE:
631 case ISD::BITCAST:
632 case ISD::UNDEF:
638 case ISD::IS_FPCLASS:
639 break;
642 break;
643 default:
645 break;
646 }
647 }
648 }
649
650 // v_perm_b32 can handle either of these.
651 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
653
654 // XXX - Do these do anything? Vector constants turn into build_vector.
655 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
656
657 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
658 Legal);
659
661 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
664
666 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
668 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
669
670 setOperationAction(ISD::AND, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
672 setOperationAction(ISD::OR, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
675 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
676
678 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
680 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
681 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
682 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
683
685 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
687 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
689 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
690
692 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
694 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
695 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
701 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
702
704 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
706 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
708 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
709
710 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
712 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
714 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
720 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
721 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
723
724 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
726 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
728 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
730
732 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
734 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
735 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
737
739 MVT::v2i32, Expand);
741
743 MVT::v4i32, Expand);
744
746 MVT::v8i32, Expand);
747
748 if (!Subtarget->hasVOP3PInsts())
750 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
751
752 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
753 // This isn't really legal, but this avoids the legalizer unrolling it (and
754 // allows matching fneg (fabs x) patterns)
755 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
756
759
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Custom);
763
765 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
766 Expand);
767
768 for (MVT Vec16 :
769 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
770 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
773 Vec16, Custom);
775 }
776 }
777
778 if (Subtarget->hasVOP3PInsts()) {
782 MVT::v2i16, Legal);
783
786 MVT::v2f16, Legal);
787
788 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
789 Custom);
790
792 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
793 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
794 Custom);
795
796 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
797 // Split vector operations.
802 VT, Custom);
803
804 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
805 // Split vector operations.
807 VT, Custom);
808
809 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
810 Custom);
811
812 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
813 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
814 Custom);
815
816 if (Subtarget->hasPackedFP32Ops()) {
818 MVT::v2f32, Legal);
820 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
821 Custom);
822 }
823 }
824
826
827 if (Subtarget->has16BitInsts()) {
829 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
831 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
832 } else {
833 // Legalization hack.
834 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
835
837 }
838
840 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
841 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
842 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
843 MVT::v32f16, MVT::v32bf16},
844 Custom);
845
847
848 if (Subtarget->hasScalarSMulU64())
850
851 if (Subtarget->hasMad64_32())
853
854 if (Subtarget->hasPrefetch())
856
857 if (Subtarget->hasIEEEMinMax())
859 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
868 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
869 MVT::i16, MVT::i8, MVT::i128},
870 Custom);
871
873 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
874 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
875 MVT::i8, MVT::i128},
876 Custom);
877
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
886
888
891 ISD::SUB,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
924
925 // All memory operations. Some folding on the pointer operand is done to help
926 // match the constant offsets in the addressing modes.
949
950 // FIXME: In other contexts we pretend this is a per-function property.
952
954}
955
957 return Subtarget;
958}
959
960//===----------------------------------------------------------------------===//
961// TargetLowering queries
962//===----------------------------------------------------------------------===//
963
964// v_mad_mix* support a conversion from f16 to f32.
965//
966// There is only one special case, when denormals are enabled, where this is
967// OK to use, and we don't currently handle it.
968bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
969 EVT DestVT, EVT SrcVT) const {
970 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
971 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
972 DestVT.getScalarType() == MVT::f32 &&
973 SrcVT.getScalarType() == MVT::f16 &&
974 // TODO: This probably only requires no input flushing?
976}
977
979 LLT DestTy, LLT SrcTy) const {
980 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
982 DestTy.getScalarSizeInBits() == 32 &&
983 SrcTy.getScalarSizeInBits() == 16 &&
984 // TODO: This probably only requires no input flushing?
986}
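// Illustrative sketch (assumption): when the hooks above return true, a node
// such as
//   fma (fpext f16 %a to f32), (fpext f16 %b to f32), f32 %c
// can keep its f16 sources in place and be selected to a mixed-precision
// v_fma_mix_f32 / v_mad_mix_f32, rather than emitting explicit conversions
// first.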
987
989 // SI has some legal vector types, but no legal vector operations. Say no
990 // shuffles are legal in order to prefer scalarizing some vector operations.
991 return false;
992}
993
996 EVT VT) const {
999
1000 if (VT.isVector()) {
1001 EVT ScalarVT = VT.getScalarType();
1002 unsigned Size = ScalarVT.getSizeInBits();
1003 if (Size == 16) {
1004 if (Subtarget->has16BitInsts()) {
1005 if (VT.isInteger())
1006 return MVT::v2i16;
1007 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1008 }
1009 return VT.isInteger() ? MVT::i32 : MVT::f32;
1010 }
1011
1012 if (Size < 16)
1013 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1014 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1015 }
1016
1017 if (VT.getSizeInBits() > 32)
1018 return MVT::i32;
1019
1021}
1022
1025 EVT VT) const {
1028
1029 if (VT.isVector()) {
1030 unsigned NumElts = VT.getVectorNumElements();
1031 EVT ScalarVT = VT.getScalarType();
1032 unsigned Size = ScalarVT.getSizeInBits();
1033
1034 // FIXME: Should probably promote 8-bit vectors to i16.
1035 if (Size == 16 && Subtarget->has16BitInsts())
1036 return (NumElts + 1) / 2;
1037
1038 if (Size <= 32)
1039 return NumElts;
1040
1041 if (Size > 32)
1042 return NumElts * ((Size + 31) / 32);
1043 } else if (VT.getSizeInBits() > 32)
1044 return (VT.getSizeInBits() + 31) / 32;
1045
1047}
1048
1050 LLVMContext &Context, CallingConv::ID CC,
1051 EVT VT, EVT &IntermediateVT,
1052 unsigned &NumIntermediates, MVT &RegisterVT) const {
1053 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1054 unsigned NumElts = VT.getVectorNumElements();
1055 EVT ScalarVT = VT.getScalarType();
1056 unsigned Size = ScalarVT.getSizeInBits();
1057 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1058 // support, but unless we can properly handle 3-vectors, it will still be
1059 // inconsistent.
1060 if (Size == 16 && Subtarget->has16BitInsts()) {
1061 if (ScalarVT == MVT::bf16) {
1062 RegisterVT = MVT::i32;
1063 IntermediateVT = MVT::v2bf16;
1064 } else {
1065 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1066 IntermediateVT = RegisterVT;
1067 }
1068 NumIntermediates = (NumElts + 1) / 2;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size == 32) {
1073 RegisterVT = ScalarVT.getSimpleVT();
1074 IntermediateVT = RegisterVT;
1075 NumIntermediates = NumElts;
1076 return NumIntermediates;
1077 }
1078
1079 if (Size < 16 && Subtarget->has16BitInsts()) {
1080 // FIXME: Should probably form v2i16 pieces
1081 RegisterVT = MVT::i16;
1082 IntermediateVT = ScalarVT;
1083 NumIntermediates = NumElts;
1084 return NumIntermediates;
1085 }
1086
1087
1088 if (Size != 16 && Size <= 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = ScalarVT;
1091 NumIntermediates = NumElts;
1092 return NumIntermediates;
1093 }
1094
1095 if (Size > 32) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = RegisterVT;
1098 NumIntermediates = NumElts * ((Size + 31) / 32);
1099 return NumIntermediates;
1100 }
1101 }
1102
1104 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1105}
1106
1107static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1108 assert(MaxNumLanes != 0);
1109
1110 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1111 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1112 return EVT::getVectorVT(Ty->getContext(),
1113 EVT::getEVT(VT->getElementType()),
1114 NumElts);
1115 }
1116
1117 return EVT::getEVT(Ty);
1118}
1119
1120// Peek through TFE struct returns to only use the data size.
1121static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1122 auto *ST = dyn_cast<StructType>(Ty);
1123 if (!ST)
1124 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1125
1126 // TFE intrinsics return an aggregate type.
1127 assert(ST->getNumContainedTypes() == 2 &&
1128 ST->getContainedType(1)->isIntegerTy(32));
1129 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1130}
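// Illustrative example (assumption): a TFE image load returns an aggregate
// such as { <4 x float>, i32 }, where the trailing i32 is the texture-fail
// status word. Only the data member determines the memory VT, so with
// MaxNumLanes == 2 the helpers above would report v2f32.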
1131
1132/// Map address space 7 to MVT::v5i32 because that's its in-memory
1133/// representation. This return value is vector-typed because there is no
1134/// MVT::i160 and it is not clear if one can be added. While this could
1135/// cause issues during codegen, these address space 7 pointers will be
1136/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1137/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1138/// modeling, to work.
1140 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1141 return MVT::v5i32;
1143 DL.getPointerSizeInBits(AS) == 192)
1144 return MVT::v6i32;
1146}
1147/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1148/// v8i32 when padding is added.
1149/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1150/// also v8i32 with padding.
1152 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1153 DL.getPointerSizeInBits(AS) == 160) ||
1155 DL.getPointerSizeInBits(AS) == 192))
1156 return MVT::v8i32;
1158}
1159
1161 const CallInst &CI,
1162 MachineFunction &MF,
1163 unsigned IntrID) const {
1165 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1167
1168 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1171 (Intrinsic::ID)IntrID);
1172 MemoryEffects ME = Attr.getMemoryEffects();
1173 if (ME.doesNotAccessMemory())
1174 return false;
1175
1176 // TODO: Should images get their own address space?
1177 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1178
1179 if (RsrcIntr->IsImage)
1180 Info.align.reset();
1181
1182 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1183 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1184 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1185 // We conservatively set the memory operand of a buffer intrinsic to the
1186 // base resource pointer, so that we can access alias information about
1187 // those pointers. Cases like "this points at the same value
1188 // but with a different offset" are handled in
1189 // areMemAccessesTriviallyDisjoint.
1190 Info.ptrVal = RsrcArg;
1191 }
1192
1193 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1194 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1197 if (ME.onlyReadsMemory()) {
1198 unsigned MaxNumLanes = 4;
1199
1200 if (RsrcIntr->IsImage) {
1203 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1205
1206 if (!BaseOpcode->Gather4) {
1207 // If this isn't a gather, we may have excess loaded elements in the
1208 // IR type. Check the dmask for the real number of elements loaded.
1209 unsigned DMask
1210 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1211 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1212 }
1213 }
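// Illustrative example (assumption): a non-gather image load declared to
// return <4 x float> but called with dmask = 0b0101 only really reads two
// components, so MaxNumLanes becomes 2 and the memVT below is reported as
// v2f32.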
1214
1215 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1216
1217 // FIXME: What does alignment mean for an image?
1220 } else if (ME.onlyWritesMemory()) {
1222
1223 Type *DataTy = CI.getArgOperand(0)->getType();
1224 if (RsrcIntr->IsImage) {
1225 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1226 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1228 } else
1229 Info.memVT = EVT::getEVT(DataTy);
1230
1232 } else {
1233 // Atomic
1234 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1236 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1240
1241 switch (IntrID) {
1242 default:
1243 // XXX - Should this be volatile without known ordering?
1245 break;
1246 case Intrinsic::amdgcn_raw_buffer_load_lds:
1247 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1248 case Intrinsic::amdgcn_struct_buffer_load_lds:
1249 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1250 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1251 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1252 Info.ptrVal = CI.getArgOperand(1);
1253 return true;
1254 }
1255 }
1256 }
1257 return true;
1258 }
1259
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_ds_ordered_add:
1262 case Intrinsic::amdgcn_ds_ordered_swap:
1263 case Intrinsic::amdgcn_ds_fadd:
1264 case Intrinsic::amdgcn_ds_fmin:
1265 case Intrinsic::amdgcn_ds_fmax: {
1267 Info.memVT = MVT::getVT(CI.getType());
1268 Info.ptrVal = CI.getOperand(0);
1269 Info.align.reset();
1271
1272 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1273 if (!Vol->isZero())
1275
1276 return true;
1277 }
1278 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1280 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1281 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1282 Info.align.reset();
1284
1285 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1286 if (!Vol || !Vol->isZero())
1288
1289 return true;
1290 }
1291 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1292 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1294 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1295 Info.ptrVal = nullptr;
1296 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1298 return true;
1299 }
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1303 Info.memVT = MVT::getVT(CI.getType());
1304 Info.ptrVal = CI.getOperand(0);
1305 Info.align.reset();
1307
1308 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1309 if (!Vol->isZero())
1311
1312 return true;
1313 }
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1316 Info.memVT = MVT::getVT(CI.getType());
1317 Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1322 return true;
1323 }
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1326 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1327
1328 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1329 Info.align.reset();
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1349 Info.memVT = MVT::getVT(CI.getType());
1350 Info.ptrVal = CI.getOperand(0);
1351 Info.align.reset();
1356 return true;
1357 }
1358 case Intrinsic::amdgcn_global_load_tr_b64:
1359 case Intrinsic::amdgcn_global_load_tr_b128: {
1361 Info.memVT = MVT::getVT(CI.getType());
1362 Info.ptrVal = CI.getOperand(0);
1363 Info.align.reset();
1365 return true;
1366 }
1367 case Intrinsic::amdgcn_ds_gws_init:
1368 case Intrinsic::amdgcn_ds_gws_barrier:
1369 case Intrinsic::amdgcn_ds_gws_sema_v:
1370 case Intrinsic::amdgcn_ds_gws_sema_br:
1371 case Intrinsic::amdgcn_ds_gws_sema_p:
1372 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1374
1375 const GCNTargetMachine &TM =
1376 static_cast<const GCNTargetMachine &>(getTargetMachine());
1377
1379 Info.ptrVal = MFI->getGWSPSV(TM);
1380
1381 // This is an abstract access, but we need to specify a type and size.
1382 Info.memVT = MVT::i32;
1383 Info.size = 4;
1384 Info.align = Align(4);
1385
1386 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1388 else
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_global_load_lds: {
1394 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1395 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1396 Info.ptrVal = CI.getArgOperand(1);
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1402
1403 const GCNTargetMachine &TM =
1404 static_cast<const GCNTargetMachine &>(getTargetMachine());
1405
1407 Info.ptrVal = MFI->getGWSPSV(TM);
1408
1409 // This is an abstract access, but we need to specify a type and size.
1410 Info.memVT = MVT::i32;
1411 Info.size = 4;
1412 Info.align = Align(4);
1413
1415 return true;
1416 }
1417 default:
1418 return false;
1419 }
1420}
1421
1423 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1424 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1425 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1426 // The DAG's ValueType loses the addrspaces.
1427 // Add them as 2 extra Constant operands "from" and "to".
1428 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1429 unsigned DstAS = I.getType()->getPointerAddressSpace();
1430 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1431 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1432 break;
1433 }
1434 default:
1435 break;
1436 }
1437}
1438
1441 Type *&AccessTy) const {
1442 Value *Ptr = nullptr;
1443 switch (II->getIntrinsicID()) {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 Ptr = II->getArgOperand(0);
1469 break;
1470 case Intrinsic::amdgcn_global_load_lds:
1471 Ptr = II->getArgOperand(1);
1472 break;
1473 default:
1474 return false;
1475 }
1476 AccessTy = II->getType();
1477 Ops.push_back(Ptr);
1478 return true;
1479}
1480
1481bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1482 unsigned AddrSpace,
1483 uint64_t FlatVariant) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 return AM.Scale == 0 &&
1491 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1492 AM.BaseOffs, AddrSpace, FlatVariant));
1493}
1494
1496 if (Subtarget->hasFlatGlobalInsts())
1497 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1499
1500 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1501 // Assume that we will use FLAT for all global memory accesses
1502 // on VI.
1503 // FIXME: This assumption is currently wrong. On VI we still use
1504 // MUBUF instructions for the r + i addressing mode. As currently
1505 // implemented, the MUBUF instructions only work on buffer < 4GB.
1506 // It may be possible to support > 4GB buffers with MUBUF instructions,
1507 // by setting the stride value in the resource descriptor which would
1508 // increase the size limit to (stride * 4GB). However, this is risky,
1509 // because it has never been validated.
1510 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1512 }
1513
1514 return isLegalMUBUFAddressingMode(AM);
1515}
1516
1517bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1518 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1519 // additionally can do r + r + i with addr64. 32-bit has more addressing
1520 // mode options. Depending on the resource constant, it can also do
1521 // (i64 r0) + (i32 r1) * (i14 i).
1522 //
1523 // Private arrays end up using a scratch buffer most of the time, so also
1524 // assume those use MUBUF instructions. Scratch loads / stores are currently
1525 // implemented as mubuf instructions with offen bit set, so slightly
1526 // different than the normal addr64.
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1529 return false;
1530
1531 // FIXME: Since we can split immediate into soffset and immediate offset,
1532 // would it make sense to allow any immediate?
1533
1534 switch (AM.Scale) {
1535 case 0: // r + i or just i, depending on HasBaseReg.
1536 return true;
1537 case 1:
1538 return true; // We have r + r or r + i.
1539 case 2:
1540 if (AM.HasBaseReg) {
1541 // Reject 2 * r + r.
1542 return false;
1543 }
1544
1545 // Allow 2 * r as r + r,
1546 // and 2 * r + i as r + r + i.
1547 return true;
1548 default: // Don't allow n * r
1549 return false;
1550 }
1551}
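// Illustrative examples of the rules above (sketch):
//   base + 16        (Scale == 0, legal immediate)  -> accepted
//   base + index     (Scale == 1)                   -> accepted
//   2*index          (Scale == 2, no base register) -> accepted as index+index
//   2*index + base   (Scale == 2 with a base)       -> rejected
//   3*index or more  (any larger scale)             -> rejected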
1552
1554 const AddrMode &AM, Type *Ty,
1555 unsigned AS, Instruction *I) const {
1556 // No global is ever allowed as a base.
1557 if (AM.BaseGV)
1558 return false;
1559
1560 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1561 return isLegalGlobalAddressingMode(AM);
1562
1563 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1567 // If the offset isn't a multiple of 4, it probably isn't going to be
1568 // correctly aligned.
1569 // FIXME: Can we get the real alignment here?
1570 if (AM.BaseOffs % 4 != 0)
1571 return isLegalMUBUFAddressingMode(AM);
1572
1573 if (!Subtarget->hasScalarSubwordLoads()) {
1574 // There are no SMRD extloads, so if we have to do a small type access we
1575 // will use a MUBUF load.
1576 // FIXME?: We also need to do this if unaligned, but we don't know the
1577 // alignment here.
1578 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1579 return isLegalGlobalAddressingMode(AM);
1580 }
1581
1583 // SMRD instructions have an 8-bit, dword offset on SI.
1584 if (!isUInt<8>(AM.BaseOffs / 4))
1585 return false;
1586 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1587 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1588 // in 8-bits, it can use a smaller encoding.
1589 if (!isUInt<32>(AM.BaseOffs / 4))
1590 return false;
1591 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1592 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1593 if (!isUInt<20>(AM.BaseOffs))
1594 return false;
1595 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1596 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1597 // for S_BUFFER_* instructions).
1598 if (!isInt<21>(AM.BaseOffs))
1599 return false;
1600 } else {
1601 // On GFX12, all offsets are signed 24-bit in bytes.
1602 if (!isInt<24>(AM.BaseOffs))
1603 return false;
1604 }
1605
1606 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1607 return true;
1608
1609 if (AM.Scale == 1 && AM.HasBaseReg)
1610 return true;
1611
1612 return false;
1613 }
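// A compact restatement of the SMRD offset ranges checked above:
//   SI               : 8-bit unsigned dword offset (BaseOffs / 4)
//   CI (Sea Islands) : 32-bit literal dword offset
//   VI               : 20-bit unsigned byte offset
//   GFX9..GFX11      : signed 21-bit byte offset
//   GFX12+           : signed 24-bit byte offset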
1614
1615 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1616 return Subtarget->enableFlatScratch()
1617 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1619 : isLegalMUBUFAddressingMode(AM);
1620
1621 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1622 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1623 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1624 // field.
1625 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1626 // an 8-bit dword offset but we don't know the alignment here.
1627 if (!isUInt<16>(AM.BaseOffs))
1628 return false;
1629
1630 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1631 return true;
1632
1633 if (AM.Scale == 1 && AM.HasBaseReg)
1634 return true;
1635
1636 return false;
1637 }
1638
1640 // For an unknown address space, this usually means that this is for some
1641 // reason being used for pure arithmetic, and not based on some addressing
1642 // computation. We don't have instructions that compute pointers with any
1643 // addressing modes, so treat them as having no offset like flat
1644 // instructions.
1645 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1647 }
1648
1649 // Assume a user alias of global for unknown address spaces.
1650 return isLegalGlobalAddressingMode(AM);
1651}
1652
1654 const MachineFunction &MF) const {
1656 return (MemVT.getSizeInBits() <= 4 * 32);
1657 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1658 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1659 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1660 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1661 return (MemVT.getSizeInBits() <= 2 * 32);
1662 }
1663 return true;
1664}
1665
1667 unsigned Size, unsigned AddrSpace, Align Alignment,
1668 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1669 if (IsFast)
1670 *IsFast = 0;
1671
1672 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1673 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1674 // Check if alignment requirements for ds_read/write instructions are
1675 // disabled.
1676 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1677 return false;
1678
1679 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1680 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1681 Alignment < RequiredAlignment)
1682 return false;
1683
1684 // Either the alignment requirements are "enabled", or there is an
1685 // unaligned LDS access related hardware bug even though alignment requirements
1686 // are "disabled". In either case, we need to check for proper alignment
1687 // requirements.
1688 //
1689 switch (Size) {
1690 case 64:
1691 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1692 // address is negative, then the instruction is incorrectly treated as
1693 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1694 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1695 // load later in the SILoadStoreOptimizer.
1696 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1697 return false;
1698
1699 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1700 // can do a 4-byte aligned, 8-byte access in a single operation using
1701 // ds_read2/write2_b32 with adjacent offsets.
1702 RequiredAlignment = Align(4);
1703
1704 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1705 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1706 // ds_write2_b32 depending on the alignment. In either case with either
1707 // alignment there is no faster way of doing this.
1708
1709 // The numbers returned here and below are not additive; they form a 'speed
1710 // rank'. They are only meant to be compared to decide whether one way of
1711 // lowering an operation is faster than another. For that purpose a
1712 // naturally aligned operation gets its bitsize to indicate that "it
1713 // operates at a speed comparable to an N-bit wide load". With full
1714 // alignment ds128 is slower than ds96, for example. If underaligned, it
1715 // is comparable to the speed of a single dword access, which would then
1716 // mean 32 < 128 and it is faster to issue a wide load regardless.
1717 // 1 simply means "slow, don't do it": when comparing an aligned load to a
1718 // wider load that will no longer be aligned, the latter is slower.
1719 if (IsFast)
1720 *IsFast = (Alignment >= RequiredAlignment) ? 64
1721 : (Alignment < Align(4)) ? 32
1722 : 1;
1723 return true;
1724 }
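// Illustrative reading of the ranks above (sketch): for this 64-bit LDS
// access with relaxed alignment checks, *IsFast is 64 when at least 4-byte
// aligned (full-speed ds_read_b64 / ds_read2_b32) and 32 otherwise, i.e.
// comparable to a single dword access; callers compare these ranks rather
// than treating them as additive costs.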
1725
1726 break;
1727 case 96:
1728 if (!Subtarget->hasDS96AndDS128())
1729 return false;
1730
1731 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1732 // gfx8 and older.
1733
1734 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1735 // Naturally aligned access is fastest. However, also report it is Fast
1736 // if memory is aligned less than DWORD. A narrow load or store will
1737 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1738 // be more of them, so overall we will pay less penalty issuing a single
1739 // instruction.
1740
1741 // See comment on the values above.
1742 if (IsFast)
1743 *IsFast = (Alignment >= RequiredAlignment) ? 96
1744 : (Alignment < Align(4)) ? 32
1745 : 1;
1746 return true;
1747 }
1748
1749 break;
1750 case 128:
1751 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1752 return false;
1753
1754 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1755 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1756 // single operation using ds_read2/write2_b64.
1757 RequiredAlignment = Align(8);
1758
1759 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1760 // Naturally aligned access is fastest. However, also report it is Fast
1761 // if memory is aligned less than DWORD. A narrow load or store will
1762 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1763 // will be more of them, so overall we will pay less penalty issuing a
1764 // single instruction.
1765
1766 // See comment on the values above.
1767 if (IsFast)
1768 *IsFast = (Alignment >= RequiredAlignment) ? 128
1769 : (Alignment < Align(4)) ? 32
1770 : 1;
1771 return true;
1772 }
1773
1774 break;
1775 default:
1776 if (Size > 32)
1777 return false;
1778
1779 break;
1780 }
1781
1782 // See comment on the values above.
1783 // Note that we have a single-dword or sub-dword here, so if underaligned
1784 // it is the slowest possible access, hence the returned value is 0.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1787
1788 return Alignment >= RequiredAlignment ||
1789 Subtarget->hasUnalignedDSAccessEnabled();
1790 }
1791
1792 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1793 bool AlignedBy4 = Alignment >= Align(4);
1794 if (IsFast)
1795 *IsFast = AlignedBy4;
1796
1797 return AlignedBy4 ||
1798 Subtarget->enableFlatScratch() ||
1799 Subtarget->hasUnalignedScratchAccess();
1800 }
1801
1802 // FIXME: We have to be conservative here and assume that flat operations
1803 // will access scratch. If we had access to the IR function, then we
1804 // could determine if any private memory was used in the function.
1805 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1806 !Subtarget->hasUnalignedScratchAccess()) {
1807 bool AlignedBy4 = Alignment >= Align(4);
1808 if (IsFast)
1809 *IsFast = AlignedBy4;
1810
1811 return AlignedBy4;
1812 }
1813
1814 // So long as they are correct, wide global memory operations perform better
1815 // than multiple smaller memory ops -- even when misaligned
1816 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1817 if (IsFast)
1818 *IsFast = Size;
1819
1820 return Alignment >= Align(4) ||
1822 }
1823
1824 // Smaller than dword value must be aligned.
1825 if (Size < 32)
1826 return false;
1827
1828 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1829 // byte-address are ignored, thus forcing Dword alignment.
1830 // This applies to private, global, and constant memory.
1831 if (IsFast)
1832 *IsFast = 1;
1833
1834 return Size >= 32 && Alignment >= Align(4);
1835}
1836
1838 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1839 unsigned *IsFast) const {
1841 Alignment, Flags, IsFast);
1842}
1843
1845 const MemOp &Op, const AttributeList &FuncAttributes) const {
1846 // FIXME: Should account for address space here.
1847
1848 // The default fallback uses the private pointer size as a guess for a type to
1849 // use. Make sure we switch these to 64-bit accesses.
1850
1851 if (Op.size() >= 16 &&
1852 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1853 return MVT::v4i32;
1854
1855 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1856 return MVT::v2i32;
1857
1858 // Use the default.
1859 return MVT::Other;
1860}
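// Illustrative example (assumption): for a 64-byte memcpy whose destination is
// known to be at least 4-byte aligned, the hook above returns MVT::v4i32, so
// the lowering emits 16-byte wide accesses (e.g. dwordx4) instead of the
// 32-bit private-pointer-sized accesses the generic fallback would pick.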
1861
1863 const MemSDNode *MemNode = cast<MemSDNode>(N);
1864 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1865}
1866
1868 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1870}
1871
1873 unsigned DestAS) const {
1874 // Flat -> private/local is a simple truncate.
1875 // Flat -> global is no-op
1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1877 return true;
1878
1879 const GCNTargetMachine &TM =
1880 static_cast<const GCNTargetMachine &>(getTargetMachine());
1881 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1882}
1883
1885 const MemSDNode *MemNode = cast<MemSDNode>(N);
1886
1888}
1889
1892 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1893 VT.getScalarType().bitsLE(MVT::i16))
1896}
1897
1899 Type *Ty) const {
1900 // FIXME: Could be smarter if called for vector constants.
1901 return true;
1902}
1903
1905 unsigned Index) const {
1907 return false;
1908
1909 // TODO: Add more cases that are cheap.
1910 return Index == 0;
1911}
1912
1914 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1915 switch (Op) {
1916 case ISD::LOAD:
1917 case ISD::STORE:
1918
1919 // These operations are done with 32-bit instructions anyway.
1920 case ISD::AND:
1921 case ISD::OR:
1922 case ISD::XOR:
1923 case ISD::SELECT:
1924 // TODO: Extensions?
1925 return true;
1926 default:
1927 return false;
1928 }
1929 }
1930
1931 // SimplifySetCC uses this function to determine whether or not it should
1932 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1933 if (VT == MVT::i1 && Op == ISD::SETCC)
1934 return false;
1935
1937}
1938
1939SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1940 const SDLoc &SL,
1941 SDValue Chain,
1942 uint64_t Offset) const {
1943 const DataLayout &DL = DAG.getDataLayout();
1946
1947 const ArgDescriptor *InputPtrReg;
1948 const TargetRegisterClass *RC;
1949 LLT ArgTy;
1951
1952 std::tie(InputPtrReg, RC, ArgTy) =
1954
1955 // We may not have the kernarg segment argument if we have no kernel
1956 // arguments.
1957 if (!InputPtrReg)
1958 return DAG.getConstant(Offset, SL, PtrVT);
1959
1961 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1962 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1963
1964 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1965}
1966
1967SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1968 const SDLoc &SL) const {
1971 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1972}
1973
1974SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1975 const SDLoc &SL) const {
1976
1978 std::optional<uint32_t> KnownSize =
1980 if (KnownSize.has_value())
1981 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1982 return SDValue();
1983}
1984
1985SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1986 const SDLoc &SL, SDValue Val,
1987 bool Signed,
1988 const ISD::InputArg *Arg) const {
1989 // First, if it is a widened vector, narrow it.
1990 if (VT.isVector() &&
1992 EVT NarrowedVT =
1995 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1996 DAG.getConstant(0, SL, MVT::i32));
1997 }
1998
1999 // Then convert the vector elements or scalar value.
2000 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2001 VT.bitsLT(MemVT)) {
2002 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2003 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2004 }
2005
2006 if (MemVT.isFloatingPoint())
2007 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2008 else if (Signed)
2009 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2010 else
2011 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2012
2013 return Val;
2014}
2015
2016SDValue SITargetLowering::lowerKernargMemParameter(
2017 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2018 uint64_t Offset, Align Alignment, bool Signed,
2019 const ISD::InputArg *Arg) const {
2021
2022 // Try to avoid using an extload by loading earlier than the argument address,
2023 // and extracting the relevant bits. The load should hopefully be merged with
2024 // the previous argument.
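  // For example, an i16 argument at byte offset 6 is handled by loading the i32
  // at offset 4 (AlignDownOffset); OffsetDiff is then 2, so the value is
  // recovered by shifting the loaded dword right by 16 bits and truncating.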
2025 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2026 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2027 int64_t AlignDownOffset = alignDown(Offset, 4);
2028 int64_t OffsetDiff = Offset - AlignDownOffset;
2029
2030 EVT IntVT = MemVT.changeTypeToInteger();
2031
2032 // TODO: If we passed in the base kernel offset we could have a better
2033 // alignment than 4, but we don't really need it.
2034 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2035 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2038
2039 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2040 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2041
2042 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2043 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2044 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2045
2046
2047 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2048 }
2049
2050 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2051 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2054
2055 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2056 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2057}
2058
2059SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2060 const SDLoc &SL, SDValue Chain,
2061 const ISD::InputArg &Arg) const {
2063 MachineFrameInfo &MFI = MF.getFrameInfo();
2064
2065 if (Arg.Flags.isByVal()) {
2066 unsigned Size = Arg.Flags.getByValSize();
2067 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2068 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2069 }
2070
2071 unsigned ArgOffset = VA.getLocMemOffset();
2072 unsigned ArgSize = VA.getValVT().getStoreSize();
2073
2074 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2075
2076 // Create load nodes to retrieve arguments from the stack.
2077 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2078 SDValue ArgValue;
2079
 2080 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
 2081 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2082 MVT MemVT = VA.getValVT();
2083
2084 switch (VA.getLocInfo()) {
2085 default:
2086 break;
2087 case CCValAssign::BCvt:
2088 MemVT = VA.getLocVT();
2089 break;
2090 case CCValAssign::SExt:
2091 ExtType = ISD::SEXTLOAD;
2092 break;
2093 case CCValAssign::ZExt:
2094 ExtType = ISD::ZEXTLOAD;
2095 break;
2096 case CCValAssign::AExt:
2097 ExtType = ISD::EXTLOAD;
2098 break;
2099 }
2100
2101 ArgValue = DAG.getExtLoad(
2102 ExtType, SL, VA.getLocVT(), Chain, FIN,
2104 MemVT);
2105 return ArgValue;
2106}
2107
2108SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2109 const SIMachineFunctionInfo &MFI,
2110 EVT VT,
2112 const ArgDescriptor *Reg = nullptr;
2113 const TargetRegisterClass *RC;
2114 LLT Ty;
2115
2117 const ArgDescriptor WorkGroupIDX =
2118 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2119 // If GridZ is not programmed in an entry function then the hardware will set
2120 // it to all zeros, so there is no need to mask the GridY value in the low
2121 // order bits.
2122 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2123 AMDGPU::TTMP7,
2124 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2125 const ArgDescriptor WorkGroupIDZ =
2126 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
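  // With architected SGPRs the workgroup IDs live in trap-temporary registers:
  // X in TTMP9, Y in TTMP7[15:0], and Z in TTMP7[31:16], matching the masks in
  // the descriptors above.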
2127 if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
2128 switch (PVID) {
2130 Reg = &WorkGroupIDX;
2131 RC = &AMDGPU::SReg_32RegClass;
2132 Ty = LLT::scalar(32);
2133 break;
2135 Reg = &WorkGroupIDY;
2136 RC = &AMDGPU::SReg_32RegClass;
2137 Ty = LLT::scalar(32);
2138 break;
2140 Reg = &WorkGroupIDZ;
2141 RC = &AMDGPU::SReg_32RegClass;
2142 Ty = LLT::scalar(32);
2143 break;
2144 default:
2145 break;
2146 }
2147 }
2148
2149 if (!Reg)
2150 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2151 if (!Reg) {
2153 // It's possible for a kernarg intrinsic call to appear in a kernel with
2154 // no allocated segment, in which case we do not add the user sgpr
2155 // argument, so just return null.
2156 return DAG.getConstant(0, SDLoc(), VT);
2157 }
2158
2159 // It's undefined behavior if a function marked with the amdgpu-no-*
2160 // attributes uses the corresponding intrinsic.
2161 return DAG.getUNDEF(VT);
2162 }
2163
2164 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2165}
2166
2168 CallingConv::ID CallConv,
2169 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2170 FunctionType *FType,
2171 SIMachineFunctionInfo *Info) {
2172 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2173 const ISD::InputArg *Arg = &Ins[I];
2174
2175 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2176 "vector type argument should have been split");
2177
2178 // First check if it's a PS input addr.
2179 if (CallConv == CallingConv::AMDGPU_PS &&
2180 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2181 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2182
 2183 // Inconveniently, only the first part of the split is marked as isSplit,
2184 // so skip to the end. We only want to increment PSInputNum once for the
2185 // entire split argument.
2186 if (Arg->Flags.isSplit()) {
2187 while (!Arg->Flags.isSplitEnd()) {
2188 assert((!Arg->VT.isVector() ||
2189 Arg->VT.getScalarSizeInBits() == 16) &&
2190 "unexpected vector split in ps argument type");
2191 if (!SkipArg)
2192 Splits.push_back(*Arg);
2193 Arg = &Ins[++I];
2194 }
2195 }
2196
2197 if (SkipArg) {
2198 // We can safely skip PS inputs.
2199 Skipped.set(Arg->getOrigArgIndex());
2200 ++PSInputNum;
2201 continue;
2202 }
2203
2204 Info->markPSInputAllocated(PSInputNum);
2205 if (Arg->Used)
2206 Info->markPSInputEnabled(PSInputNum);
2207
2208 ++PSInputNum;
2209 }
2210
2211 Splits.push_back(*Arg);
2212 }
2213}
2214
2215// Allocate special inputs passed in VGPRs.
2217 MachineFunction &MF,
2218 const SIRegisterInfo &TRI,
2219 SIMachineFunctionInfo &Info) const {
2220 const LLT S32 = LLT::scalar(32);
2222
2223 if (Info.hasWorkItemIDX()) {
2224 Register Reg = AMDGPU::VGPR0;
2225 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2226
2227 CCInfo.AllocateReg(Reg);
2228 unsigned Mask = (Subtarget->hasPackedTID() &&
2229 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2230 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2231 }
2232
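  // When the subtarget packs the TID, all three workitem IDs share VGPR0:
  // X in bits [9:0], Y in bits [19:10], and Z in bits [29:20], which is why the
  // masks below are 0x3ff shifted by 0, 10, and 20.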
2233 if (Info.hasWorkItemIDY()) {
2234 assert(Info.hasWorkItemIDX());
2235 if (Subtarget->hasPackedTID()) {
2236 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2237 0x3ff << 10));
2238 } else {
2239 unsigned Reg = AMDGPU::VGPR1;
2240 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2241
2242 CCInfo.AllocateReg(Reg);
2243 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2244 }
2245 }
2246
2247 if (Info.hasWorkItemIDZ()) {
2248 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2249 if (Subtarget->hasPackedTID()) {
2250 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2251 0x3ff << 20));
2252 } else {
2253 unsigned Reg = AMDGPU::VGPR2;
2254 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2255
2256 CCInfo.AllocateReg(Reg);
2257 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2258 }
2259 }
2260}
2261
 2262// Try to allocate a VGPR at the end of the argument list, or, if no argument
 2263// VGPRs are left, allocate a stack slot instead.
 2264// If \p Mask is given, it indicates the bitfield position in the register.
 2265// If \p Arg is given, reuse it with the new \p Mask instead of allocating a new register.
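// For example, a call such as allocateVGPR32Input(CCInfo, 0x3ff << 10, Arg)
// adds the packed workitem ID Y field to an already-allocated register rather
// than taking a new VGPR (see the implicit workitem ID allocation below).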
2266static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2267 ArgDescriptor Arg = ArgDescriptor()) {
2268 if (Arg.isSet())
2269 return ArgDescriptor::createArg(Arg, Mask);
2270
2271 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2272 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2273 if (RegIdx == ArgVGPRs.size()) {
2274 // Spill to stack required.
2275 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2276
2277 return ArgDescriptor::createStack(Offset, Mask);
2278 }
2279
2280 unsigned Reg = ArgVGPRs[RegIdx];
2281 Reg = CCInfo.AllocateReg(Reg);
2282 assert(Reg != AMDGPU::NoRegister);
2283
2284 MachineFunction &MF = CCInfo.getMachineFunction();
2285 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2286 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2287 return ArgDescriptor::createRegister(Reg, Mask);
2288}
2289
2291 const TargetRegisterClass *RC,
2292 unsigned NumArgRegs) {
2293 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2294 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2295 if (RegIdx == ArgSGPRs.size())
2296 report_fatal_error("ran out of SGPRs for arguments");
2297
2298 unsigned Reg = ArgSGPRs[RegIdx];
2299 Reg = CCInfo.AllocateReg(Reg);
2300 assert(Reg != AMDGPU::NoRegister);
2301
2302 MachineFunction &MF = CCInfo.getMachineFunction();
2303 MF.addLiveIn(Reg, RC);
2305}
2306
2307// If this has a fixed position, we still should allocate the register in the
2308// CCInfo state. Technically we could get away with this for values passed
2309// outside of the normal argument range.
2311 const TargetRegisterClass *RC,
2312 MCRegister Reg) {
2313 Reg = CCInfo.AllocateReg(Reg);
2314 assert(Reg != AMDGPU::NoRegister);
2315 MachineFunction &MF = CCInfo.getMachineFunction();
2316 MF.addLiveIn(Reg, RC);
2317}
2318
2319static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2320 if (Arg) {
2321 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2322 Arg.getRegister());
2323 } else
2324 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2325}
2326
2327static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2328 if (Arg) {
2329 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2330 Arg.getRegister());
2331 } else
2332 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2333}
2334
2335/// Allocate implicit function VGPR arguments at the end of allocated user
2336/// arguments.
2338 CCState &CCInfo, MachineFunction &MF,
2339 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2340 const unsigned Mask = 0x3ff;
2341 ArgDescriptor Arg;
2342
2343 if (Info.hasWorkItemIDX()) {
2344 Arg = allocateVGPR32Input(CCInfo, Mask);
2345 Info.setWorkItemIDX(Arg);
2346 }
2347
2348 if (Info.hasWorkItemIDY()) {
2349 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2350 Info.setWorkItemIDY(Arg);
2351 }
2352
2353 if (Info.hasWorkItemIDZ())
2354 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2355}
2356
2357/// Allocate implicit function VGPR arguments in fixed registers.
2359 CCState &CCInfo, MachineFunction &MF,
2360 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2361 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2362 if (!Reg)
2363 report_fatal_error("failed to allocated VGPR for implicit arguments");
2364
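  // All three workitem IDs are packed into the single VGPR allocated above,
  // using the same 10-bit fields as the packed-TID layout.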
2365 const unsigned Mask = 0x3ff;
2366 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2367 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2368 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2369}
2370
2372 CCState &CCInfo,
2373 MachineFunction &MF,
2374 const SIRegisterInfo &TRI,
2375 SIMachineFunctionInfo &Info) const {
2376 auto &ArgInfo = Info.getArgInfo();
2377 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2378
2379 // TODO: Unify handling with private memory pointers.
2380 if (UserSGPRInfo.hasDispatchPtr())
2381 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2382
2383 const Module *M = MF.getFunction().getParent();
2384 if (UserSGPRInfo.hasQueuePtr() &&
2386 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2387
2388 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2389 // constant offset from the kernarg segment.
2390 if (Info.hasImplicitArgPtr())
2391 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2392
2393 if (UserSGPRInfo.hasDispatchID())
2394 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2395
2396 // flat_scratch_init is not applicable for non-kernel functions.
2397
2398 if (Info.hasWorkGroupIDX())
2399 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2400
2401 if (Info.hasWorkGroupIDY())
2402 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2403
2404 if (Info.hasWorkGroupIDZ())
2405 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2406
2407 if (Info.hasLDSKernelId())
2408 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2409}
2410
2411// Allocate special inputs passed in user SGPRs.
2413 MachineFunction &MF,
2414 const SIRegisterInfo &TRI,
2415 SIMachineFunctionInfo &Info) const {
2416 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2417 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2418 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2419 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2420 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2421 }
2422
2423 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2424 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2425 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2426 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2427 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2428 }
2429
2430 if (UserSGPRInfo.hasDispatchPtr()) {
2431 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2432 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2433 CCInfo.AllocateReg(DispatchPtrReg);
2434 }
2435
2436 const Module *M = MF.getFunction().getParent();
2437 if (UserSGPRInfo.hasQueuePtr() &&
2439 Register QueuePtrReg = Info.addQueuePtr(TRI);
2440 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2441 CCInfo.AllocateReg(QueuePtrReg);
2442 }
2443
2444 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2446 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2447 CCInfo.AllocateReg(InputPtrReg);
2448
2449 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2450 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2451 }
2452
2453 if (UserSGPRInfo.hasDispatchID()) {
2454 Register DispatchIDReg = Info.addDispatchID(TRI);
2455 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2456 CCInfo.AllocateReg(DispatchIDReg);
2457 }
2458
2459 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2460 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2461 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2462 CCInfo.AllocateReg(FlatScratchInitReg);
2463 }
2464
2465 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2466 // these from the dispatch pointer.
2467}
2468
 2469// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
 2470// sequential, starting from the first argument.
2472 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2474 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2475 Function &F = MF.getFunction();
2476 unsigned LastExplicitArgOffset =
2477 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2478 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2479 bool InPreloadSequence = true;
2480 unsigned InIdx = 0;
2481 for (auto &Arg : F.args()) {
2482 if (!InPreloadSequence || !Arg.hasInRegAttr())
2483 break;
2484
2485 int ArgIdx = Arg.getArgNo();
2486 // Don't preload non-original args or parts not in the current preload
2487 // sequence.
2488 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2489 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2490 break;
2491
2492 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2493 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2494 InIdx++) {
2495 assert(ArgLocs[ArgIdx].isMemLoc());
2496 auto &ArgLoc = ArgLocs[InIdx];
2497 const Align KernelArgBaseAlign = Align(16);
2498 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2499 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2500 unsigned NumAllocSGPRs =
2501 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2502
2503 // Arg is preloaded into the previous SGPR.
2504 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2505 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2506 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2507 continue;
2508 }
2509
2510 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2511 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
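      // Padding is measured in bytes from the end of the previous argument; each
      // 4 bytes of padding costs one otherwise unused user SGPR.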
2512 // Check for free user SGPRs for preloading.
2513 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2514 SGPRInfo.getNumFreeUserSGPRs()) {
2515 InPreloadSequence = false;
2516 break;
2517 }
2518
2519 // Preload this argument.
2520 const TargetRegisterClass *RC =
2521 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2522 SmallVectorImpl<MCRegister> *PreloadRegs =
2523 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2524
2525 if (PreloadRegs->size() > 1)
2526 RC = &AMDGPU::SGPR_32RegClass;
2527 for (auto &Reg : *PreloadRegs) {
2528 assert(Reg);
2529 MF.addLiveIn(Reg, RC);
2530 CCInfo.AllocateReg(Reg);
2531 }
2532
2533 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2534 }
2535 }
2536}
2537
2539 const SIRegisterInfo &TRI,
2540 SIMachineFunctionInfo &Info) const {
2541 // Always allocate this last since it is a synthetic preload.
2542 if (Info.hasLDSKernelId()) {
2543 Register Reg = Info.addLDSKernelId();
2544 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2545 CCInfo.AllocateReg(Reg);
2546 }
2547}
2548
2549// Allocate special input registers that are initialized per-wave.
2551 MachineFunction &MF,
2553 CallingConv::ID CallConv,
2554 bool IsShader) const {
2555 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2556 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2557 // Note: user SGPRs are handled by the front-end for graphics shaders
2558 // Pad up the used user SGPRs with dead inputs.
2559
2560 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2561 // before enabling architected SGPRs for workgroup IDs.
2562 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2563
2564 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2565 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2566 // rely on it to reach 16 since if we end up having no stack usage, it will
2567 // not really be added.
2568 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2569 Info.hasWorkGroupIDY() +
2570 Info.hasWorkGroupIDZ() +
2571 Info.hasWorkGroupInfo();
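    // For example, with 6 user SGPRs already in use and 3 system SGPRs required,
    // 7 additional dead user SGPRs are reserved so that 16 SGPRs are always
    // initialized on subtargets with this bug.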
2572 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2573 Register Reg = Info.addReservedUserSGPR();
2574 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2575 CCInfo.AllocateReg(Reg);
2576 }
2577 }
2578
2579 if (!HasArchitectedSGPRs) {
2580 if (Info.hasWorkGroupIDX()) {
2581 Register Reg = Info.addWorkGroupIDX();
2582 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2583 CCInfo.AllocateReg(Reg);
2584 }
2585
2586 if (Info.hasWorkGroupIDY()) {
2587 Register Reg = Info.addWorkGroupIDY();
2588 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2589 CCInfo.AllocateReg(Reg);
2590 }
2591
2592 if (Info.hasWorkGroupIDZ()) {
2593 Register Reg = Info.addWorkGroupIDZ();
2594 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2595 CCInfo.AllocateReg(Reg);
2596 }
2597 }
2598
2599 if (Info.hasWorkGroupInfo()) {
2600 Register Reg = Info.addWorkGroupInfo();
2601 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2602 CCInfo.AllocateReg(Reg);
2603 }
2604
2605 if (Info.hasPrivateSegmentWaveByteOffset()) {
2606 // Scratch wave offset passed in system SGPR.
2607 unsigned PrivateSegmentWaveByteOffsetReg;
2608
2609 if (IsShader) {
2610 PrivateSegmentWaveByteOffsetReg =
2611 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2612
2613 // This is true if the scratch wave byte offset doesn't have a fixed
2614 // location.
2615 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2616 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2617 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2618 }
2619 } else
2620 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2621
2622 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2624 }
2625
2626 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2627 Info.getNumPreloadedSGPRs() >= 16);
2628}
2629
2631 MachineFunction &MF,
2632 const SIRegisterInfo &TRI,
2633 SIMachineFunctionInfo &Info) {
 2634 // Now that we've figured out where the scratch register inputs are, see if
 2635 // we should reserve the arguments and use them directly.
2636 MachineFrameInfo &MFI = MF.getFrameInfo();
2637 bool HasStackObjects = MFI.hasStackObjects();
2638 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2639
2640 // Record that we know we have non-spill stack objects so we don't need to
2641 // check all stack objects later.
2642 if (HasStackObjects)
2643 Info.setHasNonSpillStackObjects(true);
2644
2645 // Everything live out of a block is spilled with fast regalloc, so it's
2646 // almost certain that spilling will be required.
2647 if (TM.getOptLevel() == CodeGenOptLevel::None)
2648 HasStackObjects = true;
2649
2650 // For now assume stack access is needed in any callee functions, so we need
2651 // the scratch registers to pass in.
2652 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2653
2654 if (!ST.enableFlatScratch()) {
2655 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2656 // If we have stack objects, we unquestionably need the private buffer
2657 // resource. For the Code Object V2 ABI, this will be the first 4 user
2658 // SGPR inputs. We can reserve those and use them directly.
2659
2660 Register PrivateSegmentBufferReg =
2662 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2663 } else {
2664 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 2665 // We tentatively reserve the last registers (skipping those that may
 2666 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2667 // we'll replace these with the ones immediately after those which were
2668 // really allocated. In the prologue copies will be inserted from the
2669 // argument to these reserved registers.
2670
2671 // Without HSA, relocations are used for the scratch pointer and the
2672 // buffer resource setup is always inserted in the prologue. Scratch wave
2673 // offset is still in an input SGPR.
2674 Info.setScratchRSrcReg(ReservedBufferReg);
2675 }
2676 }
2677
2679
2680 // For entry functions we have to set up the stack pointer if we use it,
2681 // whereas non-entry functions get this "for free". This means there is no
2682 // intrinsic advantage to using S32 over S34 in cases where we do not have
2683 // calls but do need a frame pointer (i.e. if we are requested to have one
2684 // because frame pointer elimination is disabled). To keep things simple we
2685 // only ever use S32 as the call ABI stack pointer, and so using it does not
2686 // imply we need a separate frame pointer.
2687 //
2688 // Try to use s32 as the SP, but move it if it would interfere with input
2689 // arguments. This won't work with calls though.
2690 //
2691 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2692 // registers.
2693 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2694 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2695 } else {
2697
2698 if (MFI.hasCalls())
2699 report_fatal_error("call in graphics shader with too many input SGPRs");
2700
2701 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2702 if (!MRI.isLiveIn(Reg)) {
2703 Info.setStackPtrOffsetReg(Reg);
2704 break;
2705 }
2706 }
2707
2708 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2709 report_fatal_error("failed to find register for SP");
2710 }
2711
2712 // hasFP should be accurate for entry functions even before the frame is
2713 // finalized, because it does not rely on the known stack size, only
2714 // properties like whether variable sized objects are present.
2715 if (ST.getFrameLowering()->hasFP(MF)) {
2716 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2717 }
2718}
2719
2722 return !Info->isEntryFunction();
2723}
2724
2726
2727}
2728
2730 MachineBasicBlock *Entry,
2731 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2733
2734 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2735 if (!IStart)
2736 return;
2737
2738 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2739 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2740 MachineBasicBlock::iterator MBBI = Entry->begin();
2741 for (const MCPhysReg *I = IStart; *I; ++I) {
2742 const TargetRegisterClass *RC = nullptr;
2743 if (AMDGPU::SReg_64RegClass.contains(*I))
2744 RC = &AMDGPU::SGPR_64RegClass;
2745 else if (AMDGPU::SReg_32RegClass.contains(*I))
2746 RC = &AMDGPU::SGPR_32RegClass;
2747 else
2748 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2749
2750 Register NewVR = MRI->createVirtualRegister(RC);
2751 // Create copy from CSR to a virtual register.
2752 Entry->addLiveIn(*I);
2753 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2754 .addReg(*I);
2755
2756 // Insert the copy-back instructions right before the terminator.
2757 for (auto *Exit : Exits)
2758 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2759 TII->get(TargetOpcode::COPY), *I)
2760 .addReg(NewVR);
2761 }
2762}
2763
2765 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2766 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2767 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2769
2771 const Function &Fn = MF.getFunction();
2774
2775 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2776 DiagnosticInfoUnsupported NoGraphicsHSA(
2777 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2778 DAG.getContext()->diagnose(NoGraphicsHSA);
2779 return DAG.getEntryNode();
2780 }
2781
2784 BitVector Skipped(Ins.size());
2785 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2786 *DAG.getContext());
2787
2788 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2789 bool IsKernel = AMDGPU::isKernel(CallConv);
2790 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2791
2792 if (IsGraphics) {
2793 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2794 assert(!UserSGPRInfo.hasDispatchPtr() &&
2795 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2796 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2797 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2798 (void)UserSGPRInfo;
2799 if (!Subtarget->enableFlatScratch())
2800 assert(!UserSGPRInfo.hasFlatScratchInit());
2801 if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
2802 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2803 !Info->hasWorkGroupIDZ());
2804 }
2805
2806 if (CallConv == CallingConv::AMDGPU_PS) {
2807 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2808
2809 // At least one interpolation mode must be enabled or else the GPU will
2810 // hang.
2811 //
2812 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2813 // set PSInputAddr, the user wants to enable some bits after the compilation
 2814 // based on run-time states. Since we can't know what the final PSInputEna
 2815 // will look like, we shouldn't do anything here and the user should take
 2816 // responsibility for the correct programming.
2817 //
2818 // Otherwise, the following restrictions apply:
2819 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2820 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2821 // enabled too.
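    // For example, if PSInputAddr requests no PERSP_* or LINEAR_* inputs at all,
    // the code below force-enables input 0 and reserves VGPR0/VGPR1 so the
    // hardware always has at least one interpolation mode enabled.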
2822 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2823 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2824 CCInfo.AllocateReg(AMDGPU::VGPR0);
2825 CCInfo.AllocateReg(AMDGPU::VGPR1);
2826 Info->markPSInputAllocated(0);
2827 Info->markPSInputEnabled(0);
2828 }
2829 if (Subtarget->isAmdPalOS()) {
2830 // For isAmdPalOS, the user does not enable some bits after compilation
2831 // based on run-time states; the register values being generated here are
2832 // the final ones set in hardware. Therefore we need to apply the
2833 // workaround to PSInputAddr and PSInputEnable together. (The case where
2834 // a bit is set in PSInputAddr but not PSInputEnable is where the
2835 // frontend set up an input arg for a particular interpolation mode, but
2836 // nothing uses that input arg. Really we should have an earlier pass
2837 // that removes such an arg.)
2838 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2839 if ((PsInputBits & 0x7F) == 0 ||
2840 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2841 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2842 }
2843 } else if (IsKernel) {
2844 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2845 } else {
2846 Splits.append(Ins.begin(), Ins.end());
2847 }
2848
2849 if (IsKernel)
2850 analyzeFormalArgumentsCompute(CCInfo, Ins);
2851
2852 if (IsEntryFunc) {
2853 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2854 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2855 if (IsKernel && Subtarget->hasKernargPreload())
2856 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2857
2858 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2859 } else if (!IsGraphics) {
2860 // For the fixed ABI, pass workitem IDs in the last argument register.
2861 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2862
2863 // FIXME: Sink this into allocateSpecialInputSGPRs
2864 if (!Subtarget->enableFlatScratch())
2865 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2866
2867 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2868 }
2869
2870 if (!IsKernel) {
2871 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2872 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2873 }
2874
2876
2877 // FIXME: This is the minimum kernel argument alignment. We should improve
2878 // this to the maximum alignment of the arguments.
2879 //
2880 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2881 // kern arg offset.
2882 const Align KernelArgBaseAlign = Align(16);
2883
2884 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2885 const ISD::InputArg &Arg = Ins[i];
2886 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2887 InVals.push_back(DAG.getUNDEF(Arg.VT));
2888 continue;
2889 }
2890
2891 CCValAssign &VA = ArgLocs[ArgIdx++];
2892 MVT VT = VA.getLocVT();
2893
2894 if (IsEntryFunc && VA.isMemLoc()) {
2895 VT = Ins[i].VT;
2896 EVT MemVT = VA.getLocVT();
2897
2898 const uint64_t Offset = VA.getLocMemOffset();
2899 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2900
2901 if (Arg.Flags.isByRef()) {
2902 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2903
2904 const GCNTargetMachine &TM =
2905 static_cast<const GCNTargetMachine &>(getTargetMachine());
2906 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2907 Arg.Flags.getPointerAddrSpace())) {
2910 }
2911
2912 InVals.push_back(Ptr);
2913 continue;
2914 }
2915
2916 SDValue NewArg;
2917 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2918 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2919 // In this case the argument is packed into the previous preload SGPR.
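          // The recovery mirrors the misaligned in-memory path in
          // lowerKernargMemParameter: the preloaded dword is shifted right by
          // OffsetDiff * 8 bits and truncated to MemVT.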
2920 int64_t AlignDownOffset = alignDown(Offset, 4);
2921 int64_t OffsetDiff = Offset - AlignDownOffset;
2922 EVT IntVT = MemVT.changeTypeToInteger();
2923
2927 Register Reg =
2928 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2929
2930 assert(Reg);
2931 Register VReg = MRI.getLiveInVirtReg(Reg);
2932 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2933
2934 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2935 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2936
2937 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2938 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2939 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2940 Ins[i].Flags.isSExt(), &Ins[i]);
2941
2942 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2943 } else {
2947 const SmallVectorImpl<MCRegister> &PreloadRegs =
2948 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2949
2950 SDValue Copy;
2951 if (PreloadRegs.size() == 1) {
2952 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2953 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2954 NewArg = DAG.getCopyFromReg(
2955 Chain, DL, VReg,
2957 TRI->getRegSizeInBits(*RC)));
2958
2959 } else {
2960 // If the kernarg alignment does not match the alignment of the SGPR
2961 // tuple RC that can accommodate this argument, it will be built up
 2962 // via copies from the individual SGPRs that the argument was
2963 // preloaded to.
2965 for (auto Reg : PreloadRegs) {
2966 Register VReg = MRI.getLiveInVirtReg(Reg);
2967 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2968 Elts.push_back(Copy);
2969 }
2970 NewArg =
2971 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2972 PreloadRegs.size()),
2973 DL, Elts);
2974 }
2975
2976 SDValue CMemVT;
2977 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2978 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2979 else
2980 CMemVT = DAG.getBitcast(MemVT, NewArg);
2981 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2982 Ins[i].Flags.isSExt(), &Ins[i]);
2983 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2984 }
2985 } else {
2986 NewArg =
2987 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2988 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2989 }
2990 Chains.push_back(NewArg.getValue(1));
2991
2992 auto *ParamTy =
2993 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2995 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2996 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2997 // On SI local pointers are just offsets into LDS, so they are always
 2998 // less than 16 bits. On CI and newer they could potentially be
2999 // real pointers, so we can't guarantee their size.
3000 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3001 DAG.getValueType(MVT::i16));
3002 }
3003
3004 InVals.push_back(NewArg);
3005 continue;
3006 } else if (!IsEntryFunc && VA.isMemLoc()) {
3007 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3008 InVals.push_back(Val);
3009 if (!Arg.Flags.isByVal())
3010 Chains.push_back(Val.getValue(1));
3011 continue;
3012 }
3013
3014 assert(VA.isRegLoc() && "Parameter must be in a register!");
3015
3016 Register Reg = VA.getLocReg();
3017 const TargetRegisterClass *RC = nullptr;
3018 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3019 RC = &AMDGPU::VGPR_32RegClass;
3020 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3021 RC = &AMDGPU::SGPR_32RegClass;
3022 else
3023 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3024 EVT ValVT = VA.getValVT();
3025
3026 Reg = MF.addLiveIn(Reg, RC);
3027 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3028
3029 if (Arg.Flags.isSRet()) {
3030 // The return object should be reasonably addressable.
3031
 3032 // FIXME: This helps when the return is a real sret. If it is an
 3033 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3034 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3035 unsigned NumBits
3037 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3038 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3039 }
3040
3041 // If this is an 8 or 16-bit value, it is really passed promoted
3042 // to 32 bits. Insert an assert[sz]ext to capture this, then
3043 // truncate to the right size.
3044 switch (VA.getLocInfo()) {
3045 case CCValAssign::Full:
3046 break;
3047 case CCValAssign::BCvt:
3048 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3049 break;
3050 case CCValAssign::SExt:
3051 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3052 DAG.getValueType(ValVT));
3053 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3054 break;
3055 case CCValAssign::ZExt:
3056 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3057 DAG.getValueType(ValVT));
3058 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3059 break;
3060 case CCValAssign::AExt:
3061 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3062 break;
3063 default:
3064 llvm_unreachable("Unknown loc info!");
3065 }
3066
3067 InVals.push_back(Val);
3068 }
3069
3070 // Start adding system SGPRs.
3071 if (IsEntryFunc)
3072 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3073
3074 auto &ArgUsageInfo =
3076 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3077
3078 unsigned StackArgSize = CCInfo.getStackSize();
3079 Info->setBytesInStackArgArea(StackArgSize);
3080
3081 return Chains.empty() ? Chain :
3082 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3083}
3084
3085// TODO: If return values can't fit in registers, we should return as many as
3086// possible in registers before passing on stack.
3088 CallingConv::ID CallConv,
3089 MachineFunction &MF, bool IsVarArg,
3091 LLVMContext &Context) const {
3092 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3093 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3094 // for shaders. Vector types should be explicitly handled by CC.
3095 if (AMDGPU::isEntryFunctionCC(CallConv))
3096 return true;
3097
3099 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3100 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3101 return false;
3102
3103 // We must use the stack if return would require unavailable registers.
3104 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3105 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3106 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3107 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3108 return false;
3109
3110 return true;
3111}
3112
3113SDValue
3115 bool isVarArg,
3117 const SmallVectorImpl<SDValue> &OutVals,
3118 const SDLoc &DL, SelectionDAG &DAG) const {
3121
3122 if (AMDGPU::isKernel(CallConv)) {
3123 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3124 OutVals, DL, DAG);
3125 }
3126
3127 bool IsShader = AMDGPU::isShader(CallConv);
3128
3129 Info->setIfReturnsVoid(Outs.empty());
3130 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3131
3132 // CCValAssign - represent the assignment of the return value to a location.
3135
3136 // CCState - Info about the registers and stack slots.
3137 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3138 *DAG.getContext());
3139
3140 // Analyze outgoing return values.
3141 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3142
3143 SDValue Glue;
3145 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3146
3147 // Copy the result values into the output registers.
3148 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3149 ++I, ++RealRVLocIdx) {
3150 CCValAssign &VA = RVLocs[I];
3151 assert(VA.isRegLoc() && "Can only return in registers!");
3152 // TODO: Partially return in registers if return values don't fit.
3153 SDValue Arg = OutVals[RealRVLocIdx];
3154
3155 // Copied from other backends.
3156 switch (VA.getLocInfo()) {
3157 case CCValAssign::Full:
3158 break;
3159 case CCValAssign::BCvt:
3160 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3161 break;
3162 case CCValAssign::SExt:
3163 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3164 break;
3165 case CCValAssign::ZExt:
3166 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3167 break;
3168 case CCValAssign::AExt:
3169 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3170 break;
3171 default:
3172 llvm_unreachable("Unknown loc info!");
3173 }
3174
3175 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3176 Glue = Chain.getValue(1);
3177 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3178 }
3179
3180 // FIXME: Does sret work properly?
3181 if (!Info->isEntryFunction()) {
3182 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3183 const MCPhysReg *I =
3184 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3185 if (I) {
3186 for (; *I; ++I) {
3187 if (AMDGPU::SReg_64RegClass.contains(*I))
3188 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3189 else if (AMDGPU::SReg_32RegClass.contains(*I))
3190 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3191 else
3192 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3193 }
3194 }
3195 }
3196
3197 // Update chain and glue.
3198 RetOps[0] = Chain;
3199 if (Glue.getNode())
3200 RetOps.push_back(Glue);
3201
3202 unsigned Opc = AMDGPUISD::ENDPGM;
3203 if (!IsWaveEnd)
3205 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3206}
3207
3209 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3210 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3211 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3212 SDValue ThisVal) const {
3213 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3214
3215 // Assign locations to each value returned by this call.
3217 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3218 *DAG.getContext());
3219 CCInfo.AnalyzeCallResult(Ins, RetCC);
3220
3221 // Copy all of the result registers out of their specified physreg.
3222 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3223 CCValAssign VA = RVLocs[i];
3224 SDValue Val;
3225
3226 if (VA.isRegLoc()) {
3227 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3228 Chain = Val.getValue(1);
3229 InGlue = Val.getValue(2);
3230 } else if (VA.isMemLoc()) {
3231 report_fatal_error("TODO: return values in memory");
3232 } else
3233 llvm_unreachable("unknown argument location type");
3234
3235 switch (VA.getLocInfo()) {
3236 case CCValAssign::Full:
3237 break;
3238 case CCValAssign::BCvt:
3239 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3240 break;
3241 case CCValAssign::ZExt:
3242 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3243 DAG.getValueType(VA.getValVT()));
3244 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3245 break;
3246 case CCValAssign::SExt:
3247 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3248 DAG.getValueType(VA.getValVT()));
3249 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3250 break;
3251 case CCValAssign::AExt:
3252 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3253 break;
3254 default:
3255 llvm_unreachable("Unknown loc info!");
3256 }
3257
3258 InVals.push_back(Val);
3259 }
3260
3261 return Chain;
3262}
3263
 3264// Add code to pass the special inputs required by the features in use, separate
 3265// from the explicit user arguments present in the IR.
3267 CallLoweringInfo &CLI,
3268 CCState &CCInfo,
3269 const SIMachineFunctionInfo &Info,
3270 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3271 SmallVectorImpl<SDValue> &MemOpChains,
3272 SDValue Chain) const {
3273 // If we don't have a call site, this was a call inserted by
3274 // legalization. These can never use special inputs.
3275 if (!CLI.CB)
3276 return;
3277
3278 SelectionDAG &DAG = CLI.DAG;
3279 const SDLoc &DL = CLI.DL;
3280 const Function &F = DAG.getMachineFunction().getFunction();
3281
3282 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3283 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3284
3285 const AMDGPUFunctionArgInfo *CalleeArgInfo
3287 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3288 auto &ArgUsageInfo =
3290 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3291 }
3292
3293 // TODO: Unify with private memory register handling. This is complicated by
 3294 // the fact that, at least in kernels, the input argument is not necessarily
3295 // in the same location as the input.
3296 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3298 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3299 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3300 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3301 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3302 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3303 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3304 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3305 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3306 };
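  // Each entry pairs a preloaded value with the call-site attribute that proves
  // the callee does not need it; when the attribute is present, copying that
  // input is skipped entirely.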
3307
3308 for (auto Attr : ImplicitAttrs) {
3309 const ArgDescriptor *OutgoingArg;
3310 const TargetRegisterClass *ArgRC;
3311 LLT ArgTy;
3312
3313 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3314
3315 // If the callee does not use the attribute value, skip copying the value.
3316 if (CLI.CB->hasFnAttr(Attr.second))
3317 continue;
3318
3319 std::tie(OutgoingArg, ArgRC, ArgTy) =
3320 CalleeArgInfo->getPreloadedValue(InputID);
3321 if (!OutgoingArg)
3322 continue;
3323
3324 const ArgDescriptor *IncomingArg;
3325 const TargetRegisterClass *IncomingArgRC;
3326 LLT Ty;
3327 std::tie(IncomingArg, IncomingArgRC, Ty) =
3328 CallerArgInfo.getPreloadedValue(InputID);
3329 assert(IncomingArgRC == ArgRC);
3330
3331 // All special arguments are ints for now.
3332 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3333 SDValue InputReg;
3334
3335 if (IncomingArg) {
3336 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3337 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3338 // The implicit arg ptr is special because it doesn't have a corresponding
3339 // input for kernels, and is computed from the kernarg segment pointer.
3340 InputReg = getImplicitArgPtr(DAG, DL);
3341 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3342 std::optional<uint32_t> Id =
3344 if (Id.has_value()) {
3345 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3346 } else {
3347 InputReg = DAG.getUNDEF(ArgVT);
3348 }
3349 } else {
 3350 // We may have proven the input wasn't needed, although the ABI still
 3351 // requires it. We just need to allocate the register appropriately.
3352 InputReg = DAG.getUNDEF(ArgVT);
3353 }
3354
3355 if (OutgoingArg->isRegister()) {
3356 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3357 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3358 report_fatal_error("failed to allocate implicit input argument");
3359 } else {
3360 unsigned SpecialArgOffset =
3361 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3362 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3363 SpecialArgOffset);
3364 MemOpChains.push_back(ArgStore);
3365 }
3366 }
3367
 3368 // Pack the workitem IDs into a single register, or pass them as-is if they
 3369 // are already packed.
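  // The packed layout matches the incoming side: X in bits [9:0], Y in bits
  // [19:10], and Z in bits [29:20] of a single i32, hence the shifts by 10 and
  // 20 below.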
3370 const ArgDescriptor *OutgoingArg;
3371 const TargetRegisterClass *ArgRC;
3372 LLT Ty;
3373
3374 std::tie(OutgoingArg, ArgRC, Ty) =
3376 if (!OutgoingArg)
3377 std::tie(OutgoingArg, ArgRC, Ty) =
3379 if (!OutgoingArg)
3380 std::tie(OutgoingArg, ArgRC, Ty) =
3382 if (!OutgoingArg)
3383 return;
3384
3385 const ArgDescriptor *IncomingArgX = std::get<0>(
3387 const ArgDescriptor *IncomingArgY = std::get<0>(
3389 const ArgDescriptor *IncomingArgZ = std::get<0>(
3391
3392 SDValue InputReg;
3393 SDLoc SL;
3394
3395 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3396 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3397 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3398
 3399 // If the incoming IDs are not packed, we need to pack them.
3400 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3401 NeedWorkItemIDX) {
3402 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3403 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3404 } else {
3405 InputReg = DAG.getConstant(0, DL, MVT::i32);
3406 }
3407 }
3408
3409 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3410 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3411 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3412 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3413 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3414 InputReg = InputReg.getNode() ?
3415 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3416 }
3417
3418 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3419 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3420 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3421 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3422 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3423 InputReg = InputReg.getNode() ?
3424 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3425 }
3426
3427 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3428 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3429 // We're in a situation where the outgoing function requires the workitem
3430 // ID, but the calling function does not have it (e.g a graphics function
3431 // calling a C calling convention function). This is illegal, but we need
3432 // to produce something.
3433 InputReg = DAG.getUNDEF(MVT::i32);
3434 } else {
 3435 // Workitem IDs are already packed; any of the present incoming arguments
 3436 // will carry all required fields.
3438 IncomingArgX ? *IncomingArgX :
3439 IncomingArgY ? *IncomingArgY :
3440 *IncomingArgZ, ~0u);
3441 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3442 }
3443 }
3444
3445 if (OutgoingArg->isRegister()) {
3446 if (InputReg)
3447 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3448
3449 CCInfo.AllocateReg(OutgoingArg->getRegister());
3450 } else {
3451 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3452 if (InputReg) {
3453 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3454 SpecialArgOffset);
3455 MemOpChains.push_back(ArgStore);
3456 }
3457 }
3458}
3459
3461 return CC == CallingConv::Fast;
3462}
3463
3464/// Return true if we might ever do TCO for calls with this calling convention.
3466 switch (CC) {
3467 case CallingConv::C:
3469 return true;
3470 default:
3471 return canGuaranteeTCO(CC);
3472 }
3473}
3474
3476 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3478 const SmallVectorImpl<SDValue> &OutVals,
3479 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3480 if (AMDGPU::isChainCC(CalleeCC))
3481 return true;
3482
3483 if (!mayTailCallThisCC(CalleeCC))
3484 return false;
3485
3486 // For a divergent call target, we need to do a waterfall loop over the
3487 // possible callees which precludes us from using a simple jump.
3488 if (Callee->isDivergent())
3489 return false;
3490
3492 const Function &CallerF = MF.getFunction();
3493 CallingConv::ID CallerCC = CallerF.getCallingConv();
3495 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3496
3497 // Kernels aren't callable, and don't have a live in return address so it
3498 // doesn't make sense to do a tail call with entry functions.
3499 if (!CallerPreserved)
3500 return false;
3501
3502 bool CCMatch = CallerCC == CalleeCC;
3503
3505 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3506 return true;
3507 return false;
3508 }
3509
3510 // TODO: Can we handle var args?
3511 if (IsVarArg)
3512 return false;
3513
3514 for (const Argument &Arg : CallerF.args()) {
3515 if (Arg.hasByValAttr())
3516 return false;
3517 }
3518
3519 LLVMContext &Ctx = *DAG.getContext();
3520
3521 // Check that the call results are passed in the same way.
3522 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3523 CCAssignFnForCall(CalleeCC, IsVarArg),
3524 CCAssignFnForCall(CallerCC, IsVarArg)))
3525 return false;
3526
3527 // The callee has to preserve all registers the caller needs to preserve.
3528 if (!CCMatch) {
3529 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3530 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3531 return false;
3532 }
3533
3534 // Nothing more to check if the callee is taking no arguments.
3535 if (Outs.empty())
3536 return true;
3537
3539 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3540
3541 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3542
3543 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3544 // If the stack arguments for this call do not fit into our own save area then
3545 // the call cannot be made tail.
3546 // TODO: Is this really necessary?
3547 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3548 return false;
3549
3550 const MachineRegisterInfo &MRI = MF.getRegInfo();
3551 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3552}
3553
3555 if (!CI->isTailCall())
3556 return false;
3557
3558 const Function *ParentFn = CI->getParent()->getParent();
3560 return false;
3561 return true;
3562}
3563
3564// The wave scratch offset register is used as the global base pointer.
3566 SmallVectorImpl<SDValue> &InVals) const {
3567 CallingConv::ID CallConv = CLI.CallConv;
3568 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3569
3570 SelectionDAG &DAG = CLI.DAG;
3571
3572 TargetLowering::ArgListEntry RequestedExec;
3573 if (IsChainCallConv) {
3574 // The last argument should be the value that we need to put in EXEC.
3575 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3576 // don't treat it like the rest of the arguments.
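    // When the wavefront size is 64 the EXEC value is an i64 that was split into
    // two i32 pieces, so a second entry is popped below.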
3577 RequestedExec = CLI.Args.back();
3578 assert(RequestedExec.Node && "No node for EXEC");
3579
3580 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3581 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3582
3583 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3584 CLI.Outs.pop_back();
3585 CLI.OutVals.pop_back();
3586
3587 if (RequestedExec.Ty->isIntegerTy(64)) {
3588 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3589 CLI.Outs.pop_back();
3590 CLI.OutVals.pop_back();
3591 }
3592
3593 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3594 "Haven't popped all the pieces of the EXEC mask");
3595 }
3596
3597 const SDLoc &DL = CLI.DL;
3599 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3601 SDValue Chain = CLI.Chain;
3602 SDValue Callee = CLI.Callee;
3603 bool &IsTailCall = CLI.IsTailCall;
3604 bool IsVarArg = CLI.IsVarArg;
3605 bool IsSibCall = false;
3607
3608 if (Callee.isUndef() || isNullConstant(Callee)) {
3609 if (!CLI.IsTailCall) {
3610 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3611 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3612 }
3613
3614 return Chain;
3615 }
3616
3617 if (IsVarArg) {
3618 return lowerUnhandledCall(CLI, InVals,
3619 "unsupported call to variadic function ");
3620 }
3621
3622 if (!CLI.CB)
3623 report_fatal_error("unsupported libcall legalization");
3624
3625 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3626 return lowerUnhandledCall(CLI, InVals,
3627 "unsupported required tail call to function ");
3628 }
3629
3630 if (IsTailCall) {
3632 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3633 if (!IsTailCall &&
3634 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3635 report_fatal_error("failed to perform tail call elimination on a call "
3636 "site marked musttail or on llvm.amdgcn.cs.chain");
3637 }
3638
3639 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3640
 3641 // A sibling call is one where we're under the usual C ABI, are not planning
 3642 // to change that, but can still do a tail call.
3643 if (!TailCallOpt && IsTailCall)
3644 IsSibCall = true;
3645
3646 if (IsTailCall)
3647 ++NumTailCalls;
3648 }
3649
3652 SmallVector<SDValue, 8> MemOpChains;
3653
3654 // Analyze operands of the call, assigning locations to each operand.
3656 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3657 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3658
3659 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3660 // With a fixed ABI, allocate fixed registers before user arguments.
3661 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3662 }
3663
3664 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3665
3666 // Get a count of how many bytes are to be pushed on the stack.
3667 unsigned NumBytes = CCInfo.getStackSize();
3668
3669 if (IsSibCall) {
3670 // Since we're not changing the ABI to make this a tail call, the memory
3671 // operands are already available in the caller's incoming argument space.
3672 NumBytes = 0;
3673 }
3674
3675 // FPDiff is the byte offset of the call's argument area from the callee's.
3676 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3677 // by this amount for a tail call. In a sibling call it must be 0 because the
3678 // caller will deallocate the entire stack and the callee still expects its
3679 // arguments to begin at SP+0. Completely unused for non-tail calls.
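 // For example, in the sibling-call path below NumBytes is forced to 0 and
 // FPDiff stays 0, so the callee's stack arguments reuse the caller's incoming
 // argument slots directly.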
3680 int32_t FPDiff = 0;
3681 MachineFrameInfo &MFI = MF.getFrameInfo();
3682
3683 // Adjust the stack pointer for the new arguments...
3684 // These operations are automatically eliminated by the prolog/epilog pass
3685 if (!IsSibCall)
3686 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3687
3688 if (!IsSibCall || IsChainCallConv) {
3689 if (!Subtarget->enableFlatScratch()) {
3690 SmallVector<SDValue, 4> CopyFromChains;
3691
3692 // In the HSA case, this should be an identity copy.
3693 SDValue ScratchRSrcReg
3694 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3695 RegsToPass.emplace_back(IsChainCallConv
3696 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3697 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3698 ScratchRSrcReg);
3699 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3700 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3701 }
3702 }
3703
3704 MVT PtrVT = MVT::i32;
3705
3706 // Walk the register/memloc assignments, inserting copies/loads.
3707 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3708 CCValAssign &VA = ArgLocs[i];
3709 SDValue Arg = OutVals[i];
3710
3711 // Promote the value if needed.
3712 switch (VA.getLocInfo()) {
3713 case CCValAssign::Full:
3714 break;
3715 case CCValAssign::BCvt:
3716 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3717 break;
3718 case CCValAssign::ZExt:
3719 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3720 break;
3721 case CCValAssign::SExt:
3722 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3723 break;
3724 case CCValAssign::AExt:
3725 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3726 break;
3727 case CCValAssign::FPExt:
3728 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3729 break;
3730 default:
3731 llvm_unreachable("Unknown loc info!");
3732 }
3733
3734 if (VA.isRegLoc()) {
3735 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3736 } else {
3737 assert(VA.isMemLoc());
3738
3739 SDValue DstAddr;
3740 MachinePointerInfo DstInfo;
3741
3742 unsigned LocMemOffset = VA.getLocMemOffset();
3743 int32_t Offset = LocMemOffset;
3744
3745 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3746 MaybeAlign Alignment;
3747
3748 if (IsTailCall) {
3749 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3750 unsigned OpSize = Flags.isByVal() ?
3751 Flags.getByValSize() : VA.getValVT().getStoreSize();
3752
3753 // FIXME: We can have better than the minimum byval required alignment.
3754 Alignment =
3755 Flags.isByVal()
3756 ? Flags.getNonZeroByValAlign()
3757 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3758
3759 Offset = Offset + FPDiff;
3760 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3761
3762 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3763 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3764
3765 // Make sure any stack arguments overlapping with where we're storing
3766 // are loaded before this eventual operation. Otherwise they'll be
3767 // clobbered.
3768
3769 // FIXME: Why is this really necessary? This seems to just result in a
3770 // lot of code to copy the stack arguments and write them back to the
3771 // same locations, which are supposed to be immutable?
3772 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3773 } else {
3774 // Stores to the argument stack area are relative to the stack pointer.
3775 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3776 MVT::i32);
3777 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3778 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3779 Alignment =
3780 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3781 }
3782
3783 if (Outs[i].Flags.isByVal()) {
3784 SDValue SizeNode =
3785 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3786 SDValue Cpy =
3787 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3788 Outs[i].Flags.getNonZeroByValAlign(),
3789 /*isVol = */ false, /*AlwaysInline = */ true,
3790 /*isTailCall = */ false, DstInfo,
3791 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3792
3793 MemOpChains.push_back(Cpy);
3794 } else {
3795 SDValue Store =
3796 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3797 MemOpChains.push_back(Store);
3798 }
3799 }
3800 }
3801
3802 if (!MemOpChains.empty())
3803 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3804
3805 // Build a sequence of copy-to-reg nodes chained together with token chain
3806 // and flag operands which copy the outgoing args into the appropriate regs.
3807 SDValue InGlue;
3808 for (auto &RegToPass : RegsToPass) {
3809 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3810 RegToPass.second, InGlue);
3811 InGlue = Chain.getValue(1);
3812 }
3813
3814
3815 // We don't usually want to end the call-sequence here because we would tidy
3816 // the frame up *after* the call, however in the ABI-changing tail-call case
3817 // we've carefully laid out the parameters so that when sp is reset they'll be
3818 // in the correct location.
3819 if (IsTailCall && !IsSibCall) {
3820 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3821 InGlue = Chain.getValue(1);
3822 }
3823
3824 std::vector<SDValue> Ops;
3825 Ops.push_back(Chain);
3826 Ops.push_back(Callee);
3827 // Add a redundant copy of the callee global which will not be legalized, as
3828 // we need direct access to the callee later.
3829 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3830 const GlobalValue *GV = GSD->getGlobal();
3831 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3832 } else {
3833 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3834 }
3835
3836 if (IsTailCall) {
3837 // Each tail call may have to adjust the stack by a different amount, so
3838 // this information must travel along with the operation for eventual
3839 // consumption by emitEpilogue.
3840 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3841 }
3842
3843 if (IsChainCallConv)
3844 Ops.push_back(RequestedExec.Node);
3845
3846 // Add argument registers to the end of the list so that they are known live
3847 // into the call.
3848 for (auto &RegToPass : RegsToPass) {
3849 Ops.push_back(DAG.getRegister(RegToPass.first,
3850 RegToPass.second.getValueType()));
3851 }
3852
3853 // Add a register mask operand representing the call-preserved registers.
3854 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3855 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3856 assert(Mask && "Missing call preserved mask for calling convention");
3857 Ops.push_back(DAG.getRegisterMask(Mask));
3858
3859 if (InGlue.getNode())
3860 Ops.push_back(InGlue);
3861
3862 // NOTE: This potentially results in *two* glue operands, and the wrong one
3863 // might possibly show up where the other was intended. In particular,
3864 // Emitter::EmitMachineNode() expects only the glued convergence token if it
3865 // exists. Similarly, the selection of the call expects to match only the
3866 // InGlue operand if it exists.
3867 if (SDValue Token = CLI.ConvergenceControlToken) {
3868 Ops.push_back(SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE,
3869 DL, MVT::Glue, Token),
3870 0));
3871 }
3872
3873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3874
3875 // If we're doing a tail call, use a TC_RETURN here rather than an
3876 // actual call instruction.
3877 if (IsTailCall) {
3878 MFI.setHasTailCall();
3879 unsigned OPC = AMDGPUISD::TC_RETURN;
3880 switch (CallConv) {
3881 case CallingConv::AMDGPU_Gfx:
3882 OPC = AMDGPUISD::TC_RETURN_GFX;
3883 break;
3884 case CallingConv::AMDGPU_CS_Chain:
3885 case CallingConv::AMDGPU_CS_ChainPreserve:
3886 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3887 break;
3888 }
3889
3890 return DAG.getNode(OPC, DL, NodeTys, Ops);
3891 }
3892
3893 // Returns a chain and a flag for retval copy to use.
3894 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3895 Chain = Call.getValue(0);
3896 InGlue = Call.getValue(1);
3897
3898 uint64_t CalleePopBytes = NumBytes;
3899 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3900 if (!Ins.empty())
3901 InGlue = Chain.getValue(1);
3902
3903 // Handle result values, copying them out of physregs into vregs that we
3904 // return.
3905 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3906 InVals, /*IsThisReturn=*/false, SDValue());
3907}
3908
3909// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3910// except for applying the wave size scale to the increment amount.
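// For example, on a wave64 subtarget a 16 byte per-lane allocation scales to
// 16 << 6 = 1024 bytes, which is the amount the wave-uniform stack pointer is
// incremented by below.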
3911SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3912 SDValue Op, SelectionDAG &DAG) const {
3913 const MachineFunction &MF = DAG.getMachineFunction();
3914 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3915
3916 SDLoc dl(Op);
3917 EVT VT = Op.getValueType();
3918 SDValue Tmp1 = Op;
3919 SDValue Tmp2 = Op.getValue(1);
3920 SDValue Tmp3 = Op.getOperand(2);
3921 SDValue Chain = Tmp1.getOperand(0);
3922
3923 Register SPReg = Info->getStackPtrOffsetReg();
3924
3925 // Chain the dynamic stack allocation so that it doesn't modify the stack
3926 // pointer when other instructions are using the stack.
3927 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3928
3929 SDValue Size = Tmp2.getOperand(1);
3930 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3931 Chain = SP.getValue(1);
3932 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3933 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3934 unsigned Opc =
3935 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3936 ISD::ADD : ISD::SUB;
3937
3938 SDValue ScaledSize = DAG.getNode(
3939 ISD::SHL, dl, VT, Size,
3940 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3941
3942 Align StackAlign = TFL->getStackAlign();
3943 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3944 if (Alignment && *Alignment > StackAlign) {
3945 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3946 DAG.getConstant(-(uint64_t)Alignment->value()
3947 << Subtarget->getWavefrontSizeLog2(),
3948 dl, VT));
3949 }
3950
3951 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3952 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3953
3954 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3955}
3956
3957SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3958 SelectionDAG &DAG) const {
3959 // We only handle constant sizes here to allow non-entry block, static sized
3960 // allocas. A truly dynamic value is more difficult to support because we
3961 // don't know if the size value is uniform or not. If the size isn't uniform,
3962 // we would need to do a wave reduction to get the maximum size to know how
3963 // much to increment the uniform stack pointer.
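 // For example, a fixed-size alloca sunk out of the entry block has a
 // ConstantSDNode size and takes the expansion below, while a truly dynamic
 // size is not expanded here.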
3964 SDValue Size = Op.getOperand(1);
3965 if (isa<ConstantSDNode>(Size))
3966 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3967
3968 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3969}
3970
3971SDValue SITargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
3972 if (Op.getValueType() != MVT::i32)
3973 return Op; // Defer to cannot select error.
3974
3975 Register SP = getStackPointerRegisterToSaveRestore();
3976 SDLoc SL(Op);
3977
3978 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3979
3980 // Convert from wave uniform to swizzled vector address. This should protect
3981 // from any edge cases where the stacksave result isn't directly used with
3982 // stackrestore.
3983 SDValue VectorAddress =
3984 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3985 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3986}
3987
3988SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
3989 SelectionDAG &DAG) const {
3990 SDLoc SL(Op);
3991 assert(Op.getValueType() == MVT::i32);
3992
3993 uint32_t BothRoundHwReg =
3994 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
3995 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3996
3997 SDValue IntrinID =
3998 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
3999 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4000 Op.getOperand(0), IntrinID, GetRoundBothImm);
4001
4002 // There are two rounding modes, one for f32 and one for f64/f16. We only
4003 // report in the standard value range if both are the same.
4004 //
4005 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4006 // ties away from zero is not supported, and the other values are rotated by
4007 // 1.
4008 //
4009 // If the two rounding modes are not the same, report a target defined value.
4010
4011 // Mode register rounding mode fields:
4012 //
4013 // [1:0] Single-precision round mode.
4014 // [3:2] Double/Half-precision round mode.
4015 //
4016 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4017 //
4018 // Hardware Spec
4019 // Toward-0 3 0
4020 // Nearest Even 0 1
4021 // +Inf 1 2
4022 // -Inf 2 3
4023 // NearestAway0 N/A 4
4024 //
4025 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4026 // table we can index by the raw hardware mode.
4027 //
4028 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
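 //
 // For example, a raw mode value of 0b0101 (both fields set to +infinity)
 // selects the nibble at bit 5 * 4 = 20, which holds the FLT_ROUNDS value 2
 // (+Inf) from the Spec column above. Entries of 4 or more are the extended,
 // target-specific values and are offset by 4 before being reported.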
4029
4030 SDValue BitTable =
4032
4033 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4034 SDValue RoundModeTimesNumBits =
4035 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4036
4037 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4038 // knew only one mode was demanded.
4039 SDValue TableValue =
4040 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4041 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4042
4043 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4044 SDValue TableEntry =
4045 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4046
4047 // There's a gap between the 4-bit encoded table values and the actual enum
4048 // values, so offset the result if it's an extended value.
4049 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4050 SDValue IsStandardValue =
4051 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4052 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4053 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4054 TableEntry, EnumOffset);
4055
4056 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4057}
4058
4059SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4060 if (Op->isDivergent())
4061 return SDValue();
4062
4063 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4064 case AMDGPUAS::FLAT_ADDRESS:
4065 case AMDGPUAS::GLOBAL_ADDRESS:
4066 case AMDGPUAS::CONSTANT_ADDRESS:
4067 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4068 break;
4069 default:
4070 return SDValue();
4071 }
4072
4073 return Op;
4074}
4075
4076// Work around DAG legality rules only based on the result type.
4077SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4078 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4079 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4080 EVT SrcVT = Src.getValueType();
4081
4082 if (SrcVT.getScalarType() != MVT::bf16)
4083 return Op;
4084
4085 SDLoc SL(Op);
4086 SDValue BitCast =
4087 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4088
4089 EVT DstVT = Op.getValueType();
4090 if (IsStrict)
4091 llvm_unreachable("Need STRICT_BF16_TO_FP");
4092
4093 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4094}
4095
4096SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4097 SDLoc SL(Op);
4098 if (Op.getValueType() != MVT::i64)
4099 return Op;
4100
4101 uint32_t ModeHwReg =
4103 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4104 uint32_t TrapHwReg =
4106 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4107
4108 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4109 SDValue IntrinID =
4110 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4111 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4112 Op.getOperand(0), IntrinID, ModeHwRegImm);
4113 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4114 Op.getOperand(0), IntrinID, TrapHwRegImm);
4115 SDValue TokenReg =
4116 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4117 GetTrapReg.getValue(1));
4118
4119 SDValue CvtPtr =
4120 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4121 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4122
4123 return DAG.getMergeValues({Result, TokenReg}, SL);
4124}
4125
4126SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4127 SDLoc SL(Op);
4128 if (Op.getOperand(1).getValueType() != MVT::i64)
4129 return Op;
4130
4131 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4132 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4133 DAG.getConstant(0, SL, MVT::i32));
4134 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4135 DAG.getConstant(1, SL, MVT::i32));
4136
4137 SDValue ReadFirstLaneID =
4138 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4139 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4140 ReadFirstLaneID, NewModeReg);
4141 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4142 ReadFirstLaneID, NewTrapReg);
4143
4144 unsigned ModeHwReg =
4146 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4147 unsigned TrapHwReg =
4149 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4150
4151 SDValue IntrinID =
4152 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4153 SDValue SetModeReg =
4154 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4155 IntrinID, ModeHwRegImm, NewModeReg);
4156 SDValue SetTrapReg =
4157 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4158 IntrinID, TrapHwRegImm, NewTrapReg);
4159 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4160}
4161
4162Register SITargetLowering::getRegisterByName(StringRef RegName, LLT VT,
4163 const MachineFunction &MF) const {
4164 Register Reg = StringSwitch<Register>(RegName)
4165 .Case("m0", AMDGPU::M0)
4166 .Case("exec", AMDGPU::EXEC)
4167 .Case("exec_lo", AMDGPU::EXEC_LO)
4168 .Case("exec_hi", AMDGPU::EXEC_HI)
4169 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4170 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4171 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4172 .Default(Register());
4173
4174 if (Reg == AMDGPU::NoRegister) {
4175 report_fatal_error(Twine("invalid register name \""
4176 + StringRef(RegName) + "\"."));
4177
4178 }
4179
4180 if (!Subtarget->hasFlatScrRegister() &&
4181 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4182 report_fatal_error(Twine("invalid register \""
4183 + StringRef(RegName) + "\" for subtarget."));
4184 }
4185
4186 switch (Reg) {
4187 case AMDGPU::M0:
4188 case AMDGPU::EXEC_LO:
4189 case AMDGPU::EXEC_HI:
4190 case AMDGPU::FLAT_SCR_LO:
4191 case AMDGPU::FLAT_SCR_HI:
4192 if (VT.getSizeInBits() == 32)
4193 return Reg;
4194 break;
4195 case AMDGPU::EXEC:
4196 case AMDGPU::FLAT_SCR:
4197 if (VT.getSizeInBits() == 64)
4198 return Reg;
4199 break;
4200 default:
4201 llvm_unreachable("missing register type checking");
4202 }
4203
4204 report_fatal_error(Twine("invalid type for register \""
4205 + StringRef(RegName) + "\"."));
4206}
4207
4208// If kill is not the last instruction, split the block so kill is always a
4209// proper terminator.
4210MachineBasicBlock *
4211SITargetLowering::splitKillBlock(MachineInstr &MI,
4212 MachineBasicBlock *BB) const {
4213 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4214 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4215 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4216 return SplitBB;
4217}
4218
4219// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4220// \p MI will be the only instruction in the loop body block. Otherwise, it will
4221// be the first instruction in the remainder block.
4222//
4223/// \returns { LoopBody, Remainder }
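// For example, with \p InstInLoop set the resulting CFG is
//   MBB -> LoopBB -> RemainderBB
// with a LoopBB -> LoopBB back edge, and \p MI spliced into LoopBB as its only
// instruction.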
4224static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4225splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4226 MachineFunction *MF = MBB.getParent();
4227 MachineBasicBlock::iterator I(&MI);
4228
4229 // To insert the loop we need to split the block. Move everything after this
4230 // point to a new block, and insert a new empty block between the two.
4231 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4232 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4233 MachineFunction::iterator MBBI(MBB);
4234 ++MBBI;
4235
4236 MF->insert(MBBI, LoopBB);
4237 MF->insert(MBBI, RemainderBB);
4238
4239 LoopBB->addSuccessor(LoopBB);
4240 LoopBB->addSuccessor(RemainderBB);
4241
4242 // Move the rest of the block into a new block.
4243 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4244
4245 if (InstInLoop) {
4246 auto Next = std::next(I);
4247
4248 // Move instruction to loop body.
4249 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4250
4251 // Move the rest of the block.
4252 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4253 } else {
4254 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4255 }
4256
4257 MBB.addSuccessor(LoopBB);
4258
4259 return std::pair(LoopBB, RemainderBB);
4260}
4261
4262/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4263void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4264 MachineBasicBlock *MBB = MI.getParent();
4265 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4266 auto I = MI.getIterator();
4267 auto E = std::next(I);
4268
4269 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4270 .addImm(0);
4271
4272 MIBundleBuilder Bundler(*MBB, I, E);
4273 finalizeBundle(*MBB, Bundler.begin());
4274}
4275
4276MachineBasicBlock *
4277SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4278 MachineBasicBlock *BB) const {
4279 const DebugLoc &DL = MI.getDebugLoc();
4280
4281 MachineFunction *MF = BB->getParent();
4282
4283 MachineBasicBlock *LoopBB;
4284 MachineBasicBlock *RemainderBB;
4285 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4286
4287 // Apparently kill flags are only valid if the def is in the same block?
4288 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4289 Src->setIsKill(false);
4290
4291 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4292
4293 MachineBasicBlock::iterator I = LoopBB->end();
4294
4295 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4297
4298 // Clear TRAP_STS.MEM_VIOL
4299 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4300 .addImm(0)
4301 .addImm(EncodedReg);
4302
4303 MachineRegisterInfo &MRI = MF->getRegInfo();
4304
4305 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4306
4307 // Load and check TRAP_STS.MEM_VIOL
4308 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4309 .addImm(EncodedReg);
4310
4311 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4312 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))