SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
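// Editorial note (not part of the original source): findFirstFreeSGPR walks
// SGPR0, SGPR1, ... and returns the first register that the calling-convention
// state has not allocated yet. A minimal usage sketch:
//
//   CCInfo.AllocateReg(AMDGPU::SGPR0);
//   CCInfo.AllocateReg(AMDGPU::SGPR1);
//   unsigned Reg = findFirstFreeSGPR(CCInfo); // yields AMDGPU::SGPR2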
84 SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89
90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92
93 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94
95 const SIRegisterInfo *TRI = STI.getRegisterInfo();
96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97
98 addRegisterClass(MVT::f64, V64RegClass);
99 addRegisterClass(MVT::v2f32, V64RegClass);
100 addRegisterClass(MVT::Untyped, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155 } else {
156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159 }
160
161 // Unless there are also VOP3P operations, no operations on these types are really legal.
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177 }
178
179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181
183
184 // The boolean content concept here is too inflexible. Compares only ever
185 // really produce a 1-bit result. Any copy/extend from these will turn into a
186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187 // it's what most targets use.
190
191 // We need to custom lower vector stores from local memory
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
197 Custom);
198
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
204 Custom);
205
206 if (isTypeLegal(MVT::bf16)) {
207 for (unsigned Opc :
216 ISD::SETCC}) {
217 // FIXME: The promoted to type shouldn't need to be explicit
218 setOperationAction(Opc, MVT::bf16, Promote);
219 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220 }
221
223
225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226
230
231 // We only need to custom lower because we can't specify an action for bf16
232 // sources.
235 }
236
237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253
254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261
262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263
267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268
269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270
272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273
275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277
279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282 Expand);
284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287 Expand);
288
290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291 MVT::v3i16, MVT::v4i16, MVT::Other},
292 Custom);
293
296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297
299
301
303 Expand);
304
305#if 0
307#endif
308
309 // We only support LOAD/STORE and vector manipulation ops for vectors
310 // with > 4 elements.
311 for (MVT VT :
312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321 switch (Op) {
322 case ISD::LOAD:
323 case ISD::STORE:
325 case ISD::BITCAST:
326 case ISD::UNDEF:
330 case ISD::IS_FPCLASS:
331 break;
336 break;
337 default:
339 break;
340 }
341 }
342 }
343
345
346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347 // is expanded to avoid having two separate loops in case the index is a VGPR.
348
349 // Most operations are naturally 32-bit vector operations. We only support
350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354
356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357
359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360
362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363 }
364
365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368
370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371
373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374
376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377 }
378
379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382
384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385
387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388
390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391 }
392
393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396
398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399
401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402
404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405 }
406
407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410
412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413
415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416
418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419 }
420
422 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
423 Expand);
424
425 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
426 Custom);
427
428 // Avoid stack access for these.
429 // TODO: Generalize to more vector types.
431 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
432 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
433 Custom);
434
435 // Deal with vec3 vector operations when widened to vec4.
437 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
438
439 // Deal with vec5/6/7 vector operations when widened to vec8.
441 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
447 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
448 // and output demarshalling
449 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
450
451 // We can't return success/failure, only the old value;
452 // let LLVM add the comparison
454 Expand);
455
456 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
457
458 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
459
460 // FIXME: This should be narrowed to i32, but that only happens if i64 is
461 // illegal.
462 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
463 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
464
465 // This is s_memtime on SI and s_memrealtime on VI.
467
468 if (Subtarget->hasSMemRealTime() ||
472
473 if (Subtarget->has16BitInsts()) {
476 } else {
478 }
479
480 if (Subtarget->hasMadMacF32Insts())
482
483 if (!Subtarget->hasBFI())
484 // fcopysign can be done in a single instruction with BFI.
485 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
486
487 if (!Subtarget->hasBCNT(32))
489
490 if (!Subtarget->hasBCNT(64))
492
493 if (Subtarget->hasFFBH())
495
496 if (Subtarget->hasFFBL())
498
499 // We only really have 32-bit BFE instructions (and 16-bit on VI).
500 //
501 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
502 // effort to match them now. We want this to be false for i64 cases when the
503 // extraction isn't restricted to the upper or lower half. Ideally we would
504 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
505 // span the midpoint are probably relatively rare, so don't worry about them
506 // for now.
507 if (Subtarget->hasBFE())
509
510 // Clamp modifier on add/sub
511 if (Subtarget->hasIntClamp())
513
514 if (Subtarget->hasAddNoCarry())
515 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
516 Legal);
517
518 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
519 Custom);
520
521 // These are really only legal for ieee_mode functions. We should be avoiding
522 // them for functions that don't have ieee_mode enabled, so just say they are
523 // legal.
525 {MVT::f32, MVT::f64}, Legal);
526
527 if (Subtarget->haveRoundOpsF64())
529 Legal);
530 else
532 MVT::f64, Custom);
533
535 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
536 Legal);
537 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
538
541
542 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
543 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544
545 // Custom lower these because we can't specify a rule based on an illegal
546 // source bf16.
549
550 if (Subtarget->has16BitInsts()) {
553 MVT::i16, Legal);
554
555 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
556
558 MVT::i16, Expand);
559
563 ISD::CTPOP},
564 MVT::i16, Promote);
565
567
568 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
569
571 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
574
578
580
581 // F16 - Constant Actions.
584
585 // F16 - Load/Store Actions.
587 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
590
591 // BF16 - Load/Store Actions.
593 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
596
597 // F16 - VOP1 Actions.
600 MVT::f16, Custom);
601
604
605 // F16 - VOP2 Actions.
606 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
607 Expand);
611
612 // F16 - VOP3 Actions.
614 if (STI.hasMadF16())
616
617 for (MVT VT :
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
621 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
622 switch (Op) {
623 case ISD::LOAD:
624 case ISD::STORE:
626 case ISD::BITCAST:
627 case ISD::UNDEF:
632 case ISD::IS_FPCLASS:
633 break;
637 break;
638 default:
640 break;
641 }
642 }
643 }
644
645 // v_perm_b32 can handle either of these.
646 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648
649 // XXX - Do these do anything? Vector constants turn into build_vector.
650 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
651
652 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
653 Legal);
654
656 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
659
661 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
664
665 setOperationAction(ISD::AND, MVT::v2i16, Promote);
666 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
667 setOperationAction(ISD::OR, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
671
673 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
678
680 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
685
687 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
692
694 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
704
705 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
707 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
711
713 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
718
719 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
721 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
725
727 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
732
734 MVT::v2i32, Expand);
736
738 MVT::v4i32, Expand);
739
741 MVT::v8i32, Expand);
742
743 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
744 Subtarget->hasVOP3PInsts() ? Legal : Custom);
745
746 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
747 // This isn't really legal, but this avoids the legalizer unrolling it (and
748 // allows matching fneg (fabs x) patterns)
749 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
750
753
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Custom);
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
768 Vec16, Custom);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
777 MVT::v2i16, Legal);
778
781 MVT::v2f16, Legal);
782
784 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
785
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
797 VT, Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
802 VT, Custom);
803
804 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805 Custom);
806
807 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
813 MVT::v2f32, Legal);
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Custom);
817 }
818 }
819
821
822 if (Subtarget->has16BitInsts()) {
824 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
826 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830
832 }
833
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Custom);
840
842
843 if (Subtarget->hasScalarSMulU64())
845
846 if (Subtarget->hasMad64_32())
848
849 if (Subtarget->hasPrefetch())
851
852 if (Subtarget->hasIEEEMinMax()) {
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Custom);
858 } else {
859 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
860 if (Subtarget->hasMinimum3Maximum3F32())
862
863 if (Subtarget->hasMinimum3Maximum3PKF16())
865 }
866
868 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
869 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
870 MVT::i8},
871 Custom);
872
874 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
875 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
876 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
877 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Custom);
879
881 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
882 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
883 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
884 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
885 Custom);
886
892
893 // TODO: Could move this to custom lowering, could benefit from combines on
894 // extract of relevant bits.
896
898
899 if (Subtarget->hasBF16ConversionInsts()) {
903 }
904
905 if (Subtarget->hasCvtPkF16F32Inst()) {
907 }
908
911 ISD::SUB,
913 ISD::MUL,
914 ISD::FADD,
915 ISD::FSUB,
916 ISD::FDIV,
917 ISD::FMUL,
924 ISD::FMA,
925 ISD::SMIN,
926 ISD::SMAX,
927 ISD::UMIN,
928 ISD::UMAX,
931 ISD::SMIN,
932 ISD::SMAX,
933 ISD::UMIN,
934 ISD::UMAX,
935 ISD::AND,
936 ISD::OR,
937 ISD::XOR,
938 ISD::SHL,
939 ISD::SRL,
940 ISD::SRA,
941 ISD::FSHR,
951
952 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
954
955 // All memory operations. Some folding on the pointer operand is done to help
956 // matching the constant offsets in the addressing modes.
981
982 // FIXME: In other contexts we pretend this is a per-function property.
984
986}
987
988const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
989
990 ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
991 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
992 return RCRegs;
993}
994
995//===----------------------------------------------------------------------===//
996// TargetLowering queries
997//===----------------------------------------------------------------------===//
998
999// v_mad_mix* support a conversion from f16 to f32.
1000//
1001 // There is only one special case, when denormals are enabled, where this is
1002 // OK to use, and we don't currently handle it.
1003bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1004 EVT DestVT, EVT SrcVT) const {
1005 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1006 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1007 DestVT.getScalarType() == MVT::f32 &&
1008 SrcVT.getScalarType() == MVT::f16 &&
1009 // TODO: This probably only requires no input flushing?
1011}
1012
1013 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1014 LLT DestTy, LLT SrcTy) const {
1015 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1016 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1017 DestTy.getScalarSizeInBits() == 32 &&
1018 SrcTy.getScalarSizeInBits() == 16 &&
1019 // TODO: This probably only requires no input flushing?
1020 denormalModeIsFlushAllF32(*MI.getMF());
1021}
1022
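// Editorial sketch (not part of the original source): these two hooks let the
// combiners keep an f16->f32 extension folded into a mixed-precision FMA, so
// IR such as
//
//   %xe = fpext half %x to float
//   %ye = fpext half %y to float
//   %r  = call float @llvm.fma.f32(float %xe, float %ye, float %z)
//
// can select a single v_fma_mix_f32 (or v_mad_mix_f32 for FMAD), assuming the
// subtarget reports hasFmaMixInsts()/hasMadMixInsts() and f32 denormals are
// flushed, as checked above.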
1023 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1024 // SI has some legal vector types, but no legal vector operations. Say no
1025 // shuffles are legal in order to prefer scalarizing some vector operations.
1026 return false;
1027}
1028
1031 EVT VT) const {
1034
1035 if (VT.isVector()) {
1036 EVT ScalarVT = VT.getScalarType();
1037 unsigned Size = ScalarVT.getSizeInBits();
1038 if (Size == 16) {
1039 if (Subtarget->has16BitInsts()) {
1040 if (VT.isInteger())
1041 return MVT::v2i16;
1042 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1043 }
1044 return VT.isInteger() ? MVT::i32 : MVT::f32;
1045 }
1046
1047 if (Size < 16)
1048 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1049 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1050 }
1051
1052 if (VT.getSizeInBits() > 32)
1053 return MVT::i32;
1054
1056}
1057
1060 EVT VT) const {
1063
1064 if (VT.isVector()) {
1065 unsigned NumElts = VT.getVectorNumElements();
1066 EVT ScalarVT = VT.getScalarType();
1067 unsigned Size = ScalarVT.getSizeInBits();
1068
1069 // FIXME: Should probably promote 8-bit vectors to i16.
1070 if (Size == 16 && Subtarget->has16BitInsts())
1071 return (NumElts + 1) / 2;
1072
1073 if (Size <= 32)
1074 return NumElts;
1075
1076 if (Size > 32)
1077 return NumElts * ((Size + 31) / 32);
1078 } else if (VT.getSizeInBits() > 32)
1079 return (VT.getSizeInBits() + 31) / 32;
1080
1082}
1083
1085 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1086 unsigned &NumIntermediates, MVT &RegisterVT) const {
1087 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1088 unsigned NumElts = VT.getVectorNumElements();
1089 EVT ScalarVT = VT.getScalarType();
1090 unsigned Size = ScalarVT.getSizeInBits();
1091 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1092 // support, but unless we can properly handle 3-vectors, it will still be
1093 // inconsistent.
1094 if (Size == 16 && Subtarget->has16BitInsts()) {
1095 if (ScalarVT == MVT::bf16) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = MVT::v2bf16;
1098 } else {
1099 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1100 IntermediateVT = RegisterVT;
1101 }
1102 NumIntermediates = (NumElts + 1) / 2;
1103 return NumIntermediates;
1104 }
1105
1106 if (Size == 32) {
1107 RegisterVT = ScalarVT.getSimpleVT();
1108 IntermediateVT = RegisterVT;
1109 NumIntermediates = NumElts;
1110 return NumIntermediates;
1111 }
1112
1113 if (Size < 16 && Subtarget->has16BitInsts()) {
1114 // FIXME: Should probably form v2i16 pieces
1115 RegisterVT = MVT::i16;
1116 IntermediateVT = ScalarVT;
1117 NumIntermediates = NumElts;
1118 return NumIntermediates;
1119 }
1120
1121 if (Size != 16 && Size <= 32) {
1122 RegisterVT = MVT::i32;
1123 IntermediateVT = ScalarVT;
1124 NumIntermediates = NumElts;
1125 return NumIntermediates;
1126 }
1127
1128 if (Size > 32) {
1129 RegisterVT = MVT::i32;
1130 IntermediateVT = RegisterVT;
1131 NumIntermediates = NumElts * ((Size + 31) / 32);
1132 return NumIntermediates;
1133 }
1134 }
1135
1137 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1138}
1139
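// Editorial example (not part of the original source): with 16-bit
// instructions available, a v4f16 argument is broken into
// NumIntermediates = 2 pieces of v2f16, each occupying one 32-bit register,
// while a v3i32 argument stays as three i32 registers.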
1141 const DataLayout &DL, Type *Ty,
1142 unsigned MaxNumLanes) {
1143 assert(MaxNumLanes != 0);
1144
1145 LLVMContext &Ctx = Ty->getContext();
1146 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1147 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1148 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1149 NumElts);
1150 }
1151
1152 return TLI.getValueType(DL, Ty);
1153}
1154
1155// Peek through TFE struct returns to only use the data size.
1157 const DataLayout &DL, Type *Ty,
1158 unsigned MaxNumLanes) {
1159 auto *ST = dyn_cast<StructType>(Ty);
1160 if (!ST)
1161 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1162
1163 // TFE intrinsics return an aggregate type.
1164 assert(ST->getNumContainedTypes() == 2 &&
1165 ST->getContainedType(1)->isIntegerTy(32));
1166 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1167}
1168
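// Editorial example (not part of the original source): for a TFE image load
// returning { <4 x float>, i32 }, only the data member determines the memory
// VT, and the dmask passed by the caller further limits the lane count, e.g.
// dmask = 0b0111 gives MaxNumLanes = 3 and a memory VT of v3f32.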
1169/// Map address space 7 to MVT::v5i32 because that's its in-memory
1170/// representation. This return value is vector-typed because there is no
1171/// MVT::i160 and it is not clear if one can be added. While this could
1172/// cause issues during codegen, these address space 7 pointers will be
1173/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1174/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1175/// modeling, to work.
1177 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1178 return MVT::v5i32;
1180 DL.getPointerSizeInBits(AS) == 192)
1181 return MVT::v6i32;
1183}
1184/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1185/// v8i32 when padding is added.
1186/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1187/// also v8i32 with padding.
1189 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1190 DL.getPointerSizeInBits(AS) == 160) ||
1192 DL.getPointerSizeInBits(AS) == 192))
1193 return MVT::v8i32;
1195}
1196
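// Editorial example (not part of the original source): with the AMDGPU data
// layout, a 160-bit ptr addrspace(7) buffer fat pointer is therefore treated
// as v5i32 for pointer-type queries but as v8i32 ({p8, i32} plus padding) for
// its in-memory form, matching the two helpers above.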
1198 const CallInst &CI,
1199 MachineFunction &MF,
1200 unsigned IntrID) const {
1202 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1204
1205 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1207 AttributeList Attr =
1209 MemoryEffects ME = Attr.getMemoryEffects();
1210 if (ME.doesNotAccessMemory())
1211 return false;
1212
1213 // TODO: Should images get their own address space?
1214 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1215
1216 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1217 if (RsrcIntr->IsImage) {
1220 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1221 Info.align.reset();
1222 }
1223
1224 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1225 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1226 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1227 // We conservatively set the memory operand of a buffer intrinsic to the
1228 // base resource pointer, so that we can access alias information about
1229 // those pointers. Cases like "this points at the same value
1230 // but with a different offset" are handled in
1231 // areMemAccessesTriviallyDisjoint.
1232 Info.ptrVal = RsrcArg;
1233 }
1234
1235 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1236 if (!IsSPrefetch) {
1237 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1238 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1240 }
1241
1243 if (ME.onlyReadsMemory()) {
1244 if (RsrcIntr->IsImage) {
1245 unsigned MaxNumLanes = 4;
1246
1247 if (!BaseOpcode->Gather4) {
1248 // If this isn't a gather, we may have excess loaded elements in the
1249 // IR type. Check the dmask for the real number of elements loaded.
1250 unsigned DMask =
1251 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1252 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1253 }
1254
1255 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1256 CI.getType(), MaxNumLanes);
1257 } else {
1258 Info.memVT =
1260 std::numeric_limits<unsigned>::max());
1261 }
1262
1263 // FIXME: What does alignment mean for an image?
1266 } else if (ME.onlyWritesMemory()) {
1268
1269 Type *DataTy = CI.getArgOperand(0)->getType();
1270 if (RsrcIntr->IsImage) {
1271 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1272 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1273 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1274 DMaskLanes);
1275 } else
1276 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1277
1279 } else {
1280 // Atomic, NoReturn Sampler or prefetch
1283 Info.flags |=
1285
1286 if (!IsSPrefetch)
1288
1289 switch (IntrID) {
1290 default:
1291 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1292 // Fake memory access type for no return sampler intrinsics
1293 Info.memVT = MVT::i32;
1294 } else {
1295 // XXX - Should this be volatile without known ordering?
1297 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1298 }
1299 break;
1300 case Intrinsic::amdgcn_raw_buffer_load_lds:
1301 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1302 case Intrinsic::amdgcn_struct_buffer_load_lds:
1303 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1304 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1305 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1306 Info.ptrVal = CI.getArgOperand(1);
1307 return true;
1308 }
1309 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1310 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1311 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1312 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1313 Info.memVT =
1315 std::numeric_limits<unsigned>::max());
1316 Info.flags &= ~MachineMemOperand::MOStore;
1317 return true;
1318 }
1319 }
1320 }
1321 return true;
1322 }
1323
1324 switch (IntrID) {
1325 case Intrinsic::amdgcn_ds_ordered_add:
1326 case Intrinsic::amdgcn_ds_ordered_swap: {
1328 Info.memVT = MVT::getVT(CI.getType());
1329 Info.ptrVal = CI.getOperand(0);
1330 Info.align.reset();
1332
1333 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1334 if (!Vol->isZero())
1336
1337 return true;
1338 }
1339 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1340 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1342 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1343 Info.ptrVal = nullptr;
1344 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1346 return true;
1347 }
1348 case Intrinsic::amdgcn_ds_append:
1349 case Intrinsic::amdgcn_ds_consume: {
1351 Info.memVT = MVT::getVT(CI.getType());
1352 Info.ptrVal = CI.getOperand(0);
1353 Info.align.reset();
1355
1356 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1357 if (!Vol->isZero())
1359
1360 return true;
1361 }
1362 case Intrinsic::amdgcn_global_atomic_csub: {
1364 Info.memVT = MVT::getVT(CI.getType());
1365 Info.ptrVal = CI.getOperand(0);
1366 Info.align.reset();
1369 return true;
1370 }
1371 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1373 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1374
1375 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1376 Info.align.reset();
1377 Info.flags |=
1379 return true;
1380 }
1381 case Intrinsic::amdgcn_global_atomic_fmin_num:
1382 case Intrinsic::amdgcn_global_atomic_fmax_num:
1383 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1384 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1385 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1386 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1388 Info.memVT = MVT::getVT(CI.getType());
1389 Info.ptrVal = CI.getOperand(0);
1390 Info.align.reset();
1394 return true;
1395 }
1396 case Intrinsic::amdgcn_global_load_tr_b64:
1397 case Intrinsic::amdgcn_global_load_tr_b128:
1398 case Intrinsic::amdgcn_ds_read_tr4_b64:
1399 case Intrinsic::amdgcn_ds_read_tr6_b96:
1400 case Intrinsic::amdgcn_ds_read_tr8_b64:
1401 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1403 Info.memVT = MVT::getVT(CI.getType());
1404 Info.ptrVal = CI.getOperand(0);
1405 Info.align.reset();
1407 return true;
1408 }
1409 case Intrinsic::amdgcn_ds_gws_init:
1410 case Intrinsic::amdgcn_ds_gws_barrier:
1411 case Intrinsic::amdgcn_ds_gws_sema_v:
1412 case Intrinsic::amdgcn_ds_gws_sema_br:
1413 case Intrinsic::amdgcn_ds_gws_sema_p:
1414 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1416
1417 const GCNTargetMachine &TM =
1418 static_cast<const GCNTargetMachine &>(getTargetMachine());
1419
1421 Info.ptrVal = MFI->getGWSPSV(TM);
1422
1423 // This is an abstract access, but we need to specify a type and size.
1424 Info.memVT = MVT::i32;
1425 Info.size = 4;
1426 Info.align = Align(4);
1427
1428 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1430 else
1432 return true;
1433 }
1434 case Intrinsic::amdgcn_global_load_lds: {
1436 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1437 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1438 Info.ptrVal = CI.getArgOperand(1);
1440 return true;
1441 }
1442 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1444
1445 const GCNTargetMachine &TM =
1446 static_cast<const GCNTargetMachine &>(getTargetMachine());
1447
1449 Info.ptrVal = MFI->getGWSPSV(TM);
1450
1451 // This is an abstract access, but we need to specify a type and size.
1452 Info.memVT = MVT::i32;
1453 Info.size = 4;
1454 Info.align = Align(4);
1455
1457 return true;
1458 }
1459 case Intrinsic::amdgcn_s_prefetch_data: {
1461 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1462 Info.ptrVal = CI.getArgOperand(0);
1464 return true;
1465 }
1466 default:
1467 return false;
1468 }
1469}
1470
1471 void SITargetLowering::CollectTargetIntrinsicOperands(
1472 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1473 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1474 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1475 // The DAG's ValueType loses the addrspaces.
1476 // Add them as 2 extra Constant operands "from" and "to".
1477 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1478 unsigned DstAS = I.getType()->getPointerAddressSpace();
1479 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1480 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1481 break;
1482 }
1483 default:
1484 break;
1485 }
1486}
1487
1490 Type *&AccessTy) const {
1491 Value *Ptr = nullptr;
1492 switch (II->getIntrinsicID()) {
1493 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1494 case Intrinsic::amdgcn_ds_append:
1495 case Intrinsic::amdgcn_ds_consume:
1496 case Intrinsic::amdgcn_ds_read_tr4_b64:
1497 case Intrinsic::amdgcn_ds_read_tr6_b96:
1498 case Intrinsic::amdgcn_ds_read_tr8_b64:
1499 case Intrinsic::amdgcn_ds_read_tr16_b64:
1500 case Intrinsic::amdgcn_ds_ordered_add:
1501 case Intrinsic::amdgcn_ds_ordered_swap:
1502 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1503 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1504 case Intrinsic::amdgcn_global_atomic_csub:
1505 case Intrinsic::amdgcn_global_atomic_fmax_num:
1506 case Intrinsic::amdgcn_global_atomic_fmin_num:
1507 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1508 case Intrinsic::amdgcn_global_load_tr_b64:
1509 case Intrinsic::amdgcn_global_load_tr_b128:
1510 Ptr = II->getArgOperand(0);
1511 break;
1512 case Intrinsic::amdgcn_global_load_lds:
1513 Ptr = II->getArgOperand(1);
1514 break;
1515 default:
1516 return false;
1517 }
1518 AccessTy = II->getType();
1519 Ops.push_back(Ptr);
1520 return true;
1521}
1522
1523 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1524 unsigned AddrSpace) const {
1525 if (!Subtarget->hasFlatInstOffsets()) {
1526 // Flat instructions do not have offsets, and only have the register
1527 // address.
1528 return AM.BaseOffs == 0 && AM.Scale == 0;
1529 }
1530
1531 decltype(SIInstrFlags::FLAT) FlatVariant =
1535
1536 return AM.Scale == 0 &&
1537 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1538 AM.BaseOffs, AddrSpace, FlatVariant));
1539}
1540
1541 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1542 if (Subtarget->hasFlatGlobalInsts())
1544
1545 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1546 // Assume that we will use FLAT for all global memory accesses
1547 // on VI.
1548 // FIXME: This assumption is currently wrong. On VI we still use
1549 // MUBUF instructions for the r + i addressing mode. As currently
1550 // implemented, the MUBUF instructions only work on buffer < 4GB.
1551 // It may be possible to support > 4GB buffers with MUBUF instructions,
1552 // by setting the stride value in the resource descriptor which would
1553 // increase the size limit to (stride * 4GB). However, this is risky,
1554 // because it has never been validated.
1556 }
1557
1558 return isLegalMUBUFAddressingMode(AM);
1559}
1560
1561bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1562 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1563 // additionally can do r + r + i with addr64. 32-bit has more addressing
1564 // mode options. Depending on the resource constant, it can also do
1565 // (i64 r0) + (i32 r1) * (i14 i).
1566 //
1567 // Private arrays end up using a scratch buffer most of the time, so also
1568 // assume those use MUBUF instructions. Scratch loads / stores are currently
1569 // implemented as mubuf instructions with offen bit set, so slightly
1570 // different than the normal addr64.
1571 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1572 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1573 return false;
1574
1575 // FIXME: Since we can split immediate into soffset and immediate offset,
1576 // would it make sense to allow any immediate?
1577
1578 switch (AM.Scale) {
1579 case 0: // r + i or just i, depending on HasBaseReg.
1580 return true;
1581 case 1:
1582 return true; // We have r + r or r + i.
1583 case 2:
1584 if (AM.HasBaseReg) {
1585 // Reject 2 * r + r.
1586 return false;
1587 }
1588
1589 // Allow 2 * r as r + r,
1590 // or 2 * r + i as r + r + i.
1591 return true;
1592 default: // Don't allow n * r
1593 return false;
1594 }
1595}
1596
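// Editorial examples (not part of the original source), assuming the immediate
// offset passes isLegalMUBUFImmOffset:
//   Scale == 0                 -> legal: r + i, or just i
//   Scale == 1                 -> legal: r + r, or r + i
//   Scale == 2, no base reg    -> legal: 2*r folded as r + r (+ i)
//   Scale == 2 with a base reg -> rejected: would need 2*r + r
//   Scale >= 3                 -> rejected: no n*r addressing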
1597 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1598 const AddrMode &AM, Type *Ty,
1599 unsigned AS,
1600 Instruction *I) const {
1601 // No global is ever allowed as a base.
1602 if (AM.BaseGV)
1603 return false;
1604
1605 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1606 return isLegalGlobalAddressingMode(AM);
1607
1608 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1612 // If the offset isn't a multiple of 4, it probably isn't going to be
1613 // correctly aligned.
1614 // FIXME: Can we get the real alignment here?
1615 if (AM.BaseOffs % 4 != 0)
1616 return isLegalMUBUFAddressingMode(AM);
1617
1618 if (!Subtarget->hasScalarSubwordLoads()) {
1619 // There are no SMRD extloads, so if we have to do a small type access we
1620 // will use a MUBUF load.
1621 // FIXME?: We also need to do this if unaligned, but we don't know the
1622 // alignment here.
1623 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1624 return isLegalGlobalAddressingMode(AM);
1625 }
1626
1628 // SMRD instructions have an 8-bit, dword offset on SI.
1629 if (!isUInt<8>(AM.BaseOffs / 4))
1630 return false;
1631 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1632 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1633 // in 8-bits, it can use a smaller encoding.
1634 if (!isUInt<32>(AM.BaseOffs / 4))
1635 return false;
1636 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1637 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1638 if (!isUInt<20>(AM.BaseOffs))
1639 return false;
1640 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1641 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1642 // for S_BUFFER_* instructions).
1643 if (!isInt<21>(AM.BaseOffs))
1644 return false;
1645 } else {
1646 // On GFX12, all offsets are signed 24-bit in bytes.
1647 if (!isInt<24>(AM.BaseOffs))
1648 return false;
1649 }
1650
1651 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1653 AM.BaseOffs < 0) {
1654 // Scalar (non-buffer) loads can only use a negative offset if
1655 // soffset+offset is non-negative. Since the compiler can only prove that
1656 // in a few special cases, it is safer to claim that negative offsets are
1657 // not supported.
1658 return false;
1659 }
1660
1661 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1662 return true;
1663
1664 if (AM.Scale == 1 && AM.HasBaseReg)
1665 return true;
1666
1667 return false;
1668 }
1669
1670 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1671 return Subtarget->enableFlatScratch()
1673 : isLegalMUBUFAddressingMode(AM);
1674
1675 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1676 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1677 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1678 // field.
1679 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1680 // an 8-bit dword offset but we don't know the alignment here.
1681 if (!isUInt<16>(AM.BaseOffs))
1682 return false;
1683
1684 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1685 return true;
1686
1687 if (AM.Scale == 1 && AM.HasBaseReg)
1688 return true;
1689
1690 return false;
1691 }
1692
1694 // For an unknown address space, this usually means that this is for some
1695 // reason being used for pure arithmetic, and not based on some addressing
1696 // computation. We don't have instructions that compute pointers with any
1697 // addressing modes, so treat them as having no offset like flat
1698 // instructions.
1700 }
1701
1702 // Assume a user alias of global for unknown address spaces.
1703 return isLegalGlobalAddressingMode(AM);
1704}
1705
1706 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1707 const MachineFunction &MF) const {
1708 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1709 return (MemVT.getSizeInBits() <= 4 * 32);
1710 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1711 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1712 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1713 }
1714 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1715 return (MemVT.getSizeInBits() <= 2 * 32);
1716 return true;
1717}
1718
1719 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1720 unsigned Size, unsigned AddrSpace, Align Alignment,
1721 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1722 if (IsFast)
1723 *IsFast = 0;
1724
1725 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1726 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1727 // Check if alignment requirements for ds_read/write instructions are
1728 // disabled.
1729 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1730 return false;
1731
1732 Align RequiredAlignment(
1733 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1734 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1735 Alignment < RequiredAlignment)
1736 return false;
1737
1738 // Either the alignment requirements are "enabled", or there is an
1739 // unaligned LDS access related hardware bug even though alignment requirements
1740 // are "disabled". In either case, we need to check for proper alignment
1741 // requirements.
1742 //
1743 switch (Size) {
1744 case 64:
1745 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1746 // address is negative, then the instruction is incorrectly treated as
1747 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1748 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1749 // load later in the SILoadStoreOptimizer.
1750 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1751 return false;
1752
1753 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1754 // can do a 4 byte aligned, 8 byte access in a single operation using
1755 // ds_read2/write2_b32 with adjacent offsets.
1756 RequiredAlignment = Align(4);
1757
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1760 // ds_write2_b32 depending on the alignment. In either case with either
1761 // alignment there is no faster way of doing this.
1762
1763 // The numbers returned here and below are not additive, it is a 'speed
1764 // rank'. They are just meant to be compared to decide if a certain way
1765 // of lowering an operation is faster than another. For that purpose
1766 // naturally aligned operation gets it bitsize to indicate that "it
1767 // operates with a speed comparable to N-bit wide load". With the full
1768 // alignment ds128 is slower than ds96 for example. If underaligned it
1769 // is comparable to a speed of a single dword access, which would then
1770 // mean 32 < 128 and it is faster to issue a wide load regardless.
1771 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1772 // wider load which will no longer be aligned, the latter is slower.
1773 if (IsFast)
1774 *IsFast = (Alignment >= RequiredAlignment) ? 64
1775 : (Alignment < Align(4)) ? 32
1776 : 1;
1777 return true;
1778 }
1779
1780 break;
1781 case 96:
1782 if (!Subtarget->hasDS96AndDS128())
1783 return false;
1784
1785 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1786 // gfx8 and older.
1787
1788 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1789 // Naturally aligned access is fastest. However, also report it is Fast
1790 // if memory is aligned less than DWORD. A narrow load or store will be
1791 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1792 // be more of them, so overall we will pay less penalty issuing a single
1793 // instruction.
1794
1795 // See comment on the values above.
1796 if (IsFast)
1797 *IsFast = (Alignment >= RequiredAlignment) ? 96
1798 : (Alignment < Align(4)) ? 32
1799 : 1;
1800 return true;
1801 }
1802
1803 break;
1804 case 128:
1805 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1806 return false;
1807
1808 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1809 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1810 // single operation using ds_read2/write2_b64.
1811 RequiredAlignment = Align(8);
1812
1813 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1814 // Naturally aligned access is fastest. However, also report it is Fast
1815 // if memory is aligned less than DWORD. A narrow load or store will be
1816 // equally slow as a single ds_read_b128/ds_write_b128, but there
1817 // will be more of them, so overall we will pay less penalty issuing a
1818 // single instruction.
1819
1820 // See comment on the values above.
1821 if (IsFast)
1822 *IsFast = (Alignment >= RequiredAlignment) ? 128
1823 : (Alignment < Align(4)) ? 32
1824 : 1;
1825 return true;
1826 }
1827
1828 break;
1829 default:
1830 if (Size > 32)
1831 return false;
1832
1833 break;
1834 }
1835
1836 // See comment on the values above.
1837 // Note that we have a single-dword or sub-dword here, so if underaligned
1838 // it is the slowest possible access, hence the returned value is 0.
1839 if (IsFast)
1840 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1841
1842 return Alignment >= RequiredAlignment ||
1843 Subtarget->hasUnalignedDSAccessEnabled();
1844 }
1845
1846 // FIXME: We have to be conservative here and assume that flat operations
1847 // will access scratch. If we had access to the IR function, then we
1848 // could determine if any private memory was used in the function.
1849 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1850 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1851 bool AlignedBy4 = Alignment >= Align(4);
1852 if (IsFast)
1853 *IsFast = AlignedBy4;
1854
1855 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1856 }
1857
1858 // So long as they are correct, wide global memory operations perform better
1859 // than multiple smaller memory ops -- even when misaligned.
1860 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1861 if (IsFast)
1862 *IsFast = Size;
1863
1864 return Alignment >= Align(4) ||
1866 }
1867
1868 // Smaller-than-dword values must be aligned.
1869 if (Size < 32)
1870 return false;
1871
1872 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1873 // byte-address are ignored, thus forcing Dword alignment.
1874 // This applies to private, global, and constant memory.
1875 if (IsFast)
1876 *IsFast = 1;
1877
1878 return Size >= 32 && Alignment >= Align(4);
1879}
1880
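// Editorial note (not part of the original source): the value written through
// *IsFast above is a relative "speed rank", not a byte count. A naturally
// aligned LDS access reports its bit width (e.g. 128 for an 8-byte aligned
// b128 access when unaligned DS access is enabled), while underaligned cases
// report 32 or 1, so callers simply prefer the lowering with the higher rank.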
1881 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1882 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1883 unsigned *IsFast) const {
1885 Alignment, Flags, IsFast);
1886}
1887
1888 EVT SITargetLowering::getOptimalMemOpType(
1889 const MemOp &Op, const AttributeList &FuncAttributes) const {
1890 // FIXME: Should account for address space here.
1891
1892 // The default fallback uses the private pointer size as a guess for a type to
1893 // use. Make sure we switch these to 64-bit accesses.
1894
1895 if (Op.size() >= 16 &&
1896 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1897 return MVT::v4i32;
1898
1899 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1900 return MVT::v2i32;
1901
1902 // Use the default.
1903 return MVT::Other;
1904}
1905
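// Editorial example (not part of the original source): a 32-byte memcpy whose
// destination is known to be 4-byte aligned gets MVT::v4i32 (16-byte wide
// accesses); a copy of at least 8 bytes with the same alignment gets
// MVT::v2i32; anything smaller or less aligned falls back to MVT::Other and
// the generic choice.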
1906 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1907 const MemSDNode *MemNode = cast<MemSDNode>(N);
1908 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1909}
1910
1911 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1912 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1913 AS == AMDGPUAS::PRIVATE_ADDRESS;
1914 }
1915
1916 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1917 unsigned DestAS) const {
1918 // Flat -> private/local is a simple truncate.
1919 // Flat -> global is no-op
1920 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1921 return true;
1922
1923 const GCNTargetMachine &TM =
1924 static_cast<const GCNTargetMachine &>(getTargetMachine());
1925 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1926}
1927
1930 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1931 VT.getScalarType().bitsLE(MVT::i16))
1934}
1935
1936 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1937 Type *Ty) const {
1938 // FIXME: Could be smarter if called for vector constants.
1939 return true;
1940}
1941
1942 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1943 unsigned Index) const {
1945 return false;
1946
1947 // TODO: Add more cases that are cheap.
1948 return Index == 0;
1949}
1950
1951 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1952 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1953 switch (Op) {
1954 case ISD::LOAD:
1955 case ISD::STORE:
1956 return true;
1957 default:
1958 return false;
1959 }
1960 }
1961
1962 // SimplifySetCC uses this function to determine whether or not it should
1963 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1964 if (VT == MVT::i1 && Op == ISD::SETCC)
1965 return false;
1966
1968}
1969
1970SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1971 const SDLoc &SL,
1972 SDValue Chain,
1973 uint64_t Offset) const {
1974 const DataLayout &DL = DAG.getDataLayout();
1978
1979 auto [InputPtrReg, RC, ArgTy] =
1981
1982 // We may not have the kernarg segment argument if we have no kernel
1983 // arguments.
1984 if (!InputPtrReg)
1985 return DAG.getConstant(Offset, SL, PtrVT);
1986
1988 SDValue BasePtr = DAG.getCopyFromReg(
1989 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1990
1991 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1992}
1993
1994SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1995 const SDLoc &SL) const {
1998 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1999}
2000
2001SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2002 const SDLoc &SL) const {
2003
2005 std::optional<uint32_t> KnownSize =
2007 if (KnownSize.has_value())
2008 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2009 return SDValue();
2010}
2011
2012SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2013 const SDLoc &SL, SDValue Val,
2014 bool Signed,
2015 const ISD::InputArg *Arg) const {
2016 // First, if it is a widened vector, narrow it.
2017 if (VT.isVector() &&
2019 EVT NarrowedVT =
2022 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2023 DAG.getConstant(0, SL, MVT::i32));
2024 }
2025
2026 // Then convert the vector elements or scalar value.
2027 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2028 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2029 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2030 }
2031
2032 if (MemVT.isFloatingPoint())
2033 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2034 else if (Signed)
2035 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2036 else
2037 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2038
2039 return Val;
2040}
2041
2042SDValue SITargetLowering::lowerKernargMemParameter(
2043 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2044 uint64_t Offset, Align Alignment, bool Signed,
2045 const ISD::InputArg *Arg) const {
2047
2048 // Try to avoid using an extload by loading earlier than the argument address,
2049 // and extracting the relevant bits. The load should hopefully be merged with
2050 // the previous argument.
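  // For example, an i16 argument at byte offset 2 is handled by loading the
  // aligned i32 at offset 0 and shifting the loaded value right by
  // OffsetDiff * 8 = 16 bits before truncating it back down.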
2051 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2052 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2053 int64_t AlignDownOffset = alignDown(Offset, 4);
2054 int64_t OffsetDiff = Offset - AlignDownOffset;
2055
2056 EVT IntVT = MemVT.changeTypeToInteger();
2057
2058 // TODO: If we passed in the base kernel offset we could have a better
2059 // alignment than 4, but we don't really need it.
2060 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2061 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2064
2065 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2066 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2067
2068 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2069 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2070 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2071
2072 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2073 }
2074
2075 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2076 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2079
2080 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2081 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2082}
2083
2084SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2085 CCValAssign &VA, const SDLoc &SL,
2086 SDValue Chain,
2087 const ISD::InputArg &Arg) const {
2089 MachineFrameInfo &MFI = MF.getFrameInfo();
2090
2091 if (Arg.Flags.isByVal()) {
2092 unsigned Size = Arg.Flags.getByValSize();
2093 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2094 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2095 }
2096
2097 unsigned ArgOffset = VA.getLocMemOffset();
2098 unsigned ArgSize = VA.getValVT().getStoreSize();
2099
2100 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2101
2102 // Create load nodes to retrieve arguments from the stack.
2103 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2104 SDValue ArgValue;
2105
2106  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2108 MVT MemVT = VA.getValVT();
2109
2110 switch (VA.getLocInfo()) {
2111 default:
2112 break;
2113 case CCValAssign::BCvt:
2114 MemVT = VA.getLocVT();
2115 break;
2116 case CCValAssign::SExt:
2117 ExtType = ISD::SEXTLOAD;
2118 break;
2119 case CCValAssign::ZExt:
2120 ExtType = ISD::ZEXTLOAD;
2121 break;
2122 case CCValAssign::AExt:
2123 ExtType = ISD::EXTLOAD;
2124 break;
2125 }
2126
2127 ArgValue = DAG.getExtLoad(
2128 ExtType, SL, VA.getLocVT(), Chain, FIN,
2130 return ArgValue;
2131}
2132
2133SDValue SITargetLowering::getPreloadedValue(
2134 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2136 const ArgDescriptor *Reg = nullptr;
2137 const TargetRegisterClass *RC;
2138 LLT Ty;
2139
2141 const ArgDescriptor WorkGroupIDX =
2142 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2143 // If GridZ is not programmed in an entry function then the hardware will set
2144 // it to all zeros, so there is no need to mask the GridY value in the low
2145 // order bits.
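  // With architected SGPRs the workgroup IDs therefore share TTMP registers:
  // X lives in TTMP9, while Y and Z occupy the low and high 16 bits of TTMP7,
  // as the masks below describe.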
2146 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2147 AMDGPU::TTMP7,
2148 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2149 const ArgDescriptor WorkGroupIDZ =
2150 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2151 if (Subtarget->hasArchitectedSGPRs() &&
2153 switch (PVID) {
2155 Reg = &WorkGroupIDX;
2156 RC = &AMDGPU::SReg_32RegClass;
2157 Ty = LLT::scalar(32);
2158 break;
2160 Reg = &WorkGroupIDY;
2161 RC = &AMDGPU::SReg_32RegClass;
2162 Ty = LLT::scalar(32);
2163 break;
2165 Reg = &WorkGroupIDZ;
2166 RC = &AMDGPU::SReg_32RegClass;
2167 Ty = LLT::scalar(32);
2168 break;
2169 default:
2170 break;
2171 }
2172 }
2173
2174 if (!Reg)
2175 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2176 if (!Reg) {
2178 // It's possible for a kernarg intrinsic call to appear in a kernel with
2179 // no allocated segment, in which case we do not add the user sgpr
2180 // argument, so just return null.
2181 return DAG.getConstant(0, SDLoc(), VT);
2182 }
2183
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2185 // attributes uses the corresponding intrinsic.
2186 return DAG.getUNDEF(VT);
2187 }
2188
2189 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2190}
2191
2193 CallingConv::ID CallConv,
2194 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2195 FunctionType *FType,
2196 SIMachineFunctionInfo *Info) {
2197 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2198 const ISD::InputArg *Arg = &Ins[I];
2199
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2201 "vector type argument should have been split");
2202
2203 // First check if it's a PS input addr.
2204 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2205 PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2207
2208 // Inconveniently only the first part of the split is marked as isSplit,
2209 // so skip to the end. We only want to increment PSInputNum once for the
2210 // entire split argument.
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2214 "unexpected vector split in ps argument type");
2215 if (!SkipArg)
2216 Splits.push_back(*Arg);
2217 Arg = &Ins[++I];
2218 }
2219 }
2220
2221 if (SkipArg) {
2222 // We can safely skip PS inputs.
2223 Skipped.set(Arg->getOrigArgIndex());
2224 ++PSInputNum;
2225 continue;
2226 }
2227
2228 Info->markPSInputAllocated(PSInputNum);
2229 if (Arg->Used)
2230 Info->markPSInputEnabled(PSInputNum);
2231
2232 ++PSInputNum;
2233 }
2234
2235 Splits.push_back(*Arg);
2236 }
2237}
2238
2239// Allocate special inputs passed in VGPRs.
2241 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2242 SIMachineFunctionInfo &Info) const {
2243 const LLT S32 = LLT::scalar(32);
2245
2246 if (Info.hasWorkItemIDX()) {
2247 Register Reg = AMDGPU::VGPR0;
2248 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2249
2250 CCInfo.AllocateReg(Reg);
2251 unsigned Mask =
2252 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2253 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2254 }
2255
2256 if (Info.hasWorkItemIDY()) {
2257 assert(Info.hasWorkItemIDX());
2258 if (Subtarget->hasPackedTID()) {
2259 Info.setWorkItemIDY(
2260 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2261 } else {
2262 unsigned Reg = AMDGPU::VGPR1;
2263 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2264
2265 CCInfo.AllocateReg(Reg);
2266 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2267 }
2268 }
2269
2270 if (Info.hasWorkItemIDZ()) {
2271 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2272 if (Subtarget->hasPackedTID()) {
2273 Info.setWorkItemIDZ(
2274 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2275 } else {
2276 unsigned Reg = AMDGPU::VGPR2;
2277 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2278
2279 CCInfo.AllocateReg(Reg);
2280 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2281 }
2282 }
2283}
2284
2285// Try to allocate a VGPR at the end of the argument list, or if no argument
2286// VGPRs are left allocating a stack slot.
2287// If \p Mask is given it indicates bitfield position in the register.
2288// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2289static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2290 ArgDescriptor Arg = ArgDescriptor()) {
2291 if (Arg.isSet())
2292 return ArgDescriptor::createArg(Arg, Mask);
2293
2294 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2295 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2296 if (RegIdx == ArgVGPRs.size()) {
2297 // Spill to stack required.
2298 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2299
2300 return ArgDescriptor::createStack(Offset, Mask);
2301 }
2302
2303 unsigned Reg = ArgVGPRs[RegIdx];
2304 Reg = CCInfo.AllocateReg(Reg);
2305 assert(Reg != AMDGPU::NoRegister);
2306
2307 MachineFunction &MF = CCInfo.getMachineFunction();
2308 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2309 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2310 return ArgDescriptor::createRegister(Reg, Mask);
2311}
2312
2314 const TargetRegisterClass *RC,
2315 unsigned NumArgRegs) {
2316 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2317 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2318 if (RegIdx == ArgSGPRs.size())
2319 report_fatal_error("ran out of SGPRs for arguments");
2320
2321 unsigned Reg = ArgSGPRs[RegIdx];
2322 Reg = CCInfo.AllocateReg(Reg);
2323 assert(Reg != AMDGPU::NoRegister);
2324
2325 MachineFunction &MF = CCInfo.getMachineFunction();
2326 MF.addLiveIn(Reg, RC);
2328}
2329
2330// If this has a fixed position, we still should allocate the register in the
2331// CCInfo state. Technically we could get away with this for values passed
2332// outside of the normal argument range.
2334 const TargetRegisterClass *RC,
2335 MCRegister Reg) {
2336 Reg = CCInfo.AllocateReg(Reg);
2337 assert(Reg != AMDGPU::NoRegister);
2338 MachineFunction &MF = CCInfo.getMachineFunction();
2339 MF.addLiveIn(Reg, RC);
2340}
2341
2342static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2343 if (Arg) {
2344 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2345 Arg.getRegister());
2346 } else
2347 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2348}
2349
2350static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2351 if (Arg) {
2352 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2353 Arg.getRegister());
2354 } else
2355 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2356}
2357
2358/// Allocate implicit function VGPR arguments at the end of allocated user
2359/// arguments.
2361 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2362 SIMachineFunctionInfo &Info) const {
2363 const unsigned Mask = 0x3ff;
2364 ArgDescriptor Arg;
2365
2366 if (Info.hasWorkItemIDX()) {
2367 Arg = allocateVGPR32Input(CCInfo, Mask);
2368 Info.setWorkItemIDX(Arg);
2369 }
2370
2371 if (Info.hasWorkItemIDY()) {
2372 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2373 Info.setWorkItemIDY(Arg);
2374 }
2375
2376 if (Info.hasWorkItemIDZ())
2377 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2378}
2379
2380/// Allocate implicit function VGPR arguments in fixed registers.
2382 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2383 SIMachineFunctionInfo &Info) const {
2384 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2385 if (!Reg)
2386    report_fatal_error("failed to allocate VGPR for implicit arguments");
2387
2388 const unsigned Mask = 0x3ff;
2389 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2390 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2391 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2392}
2393
2395 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2396 SIMachineFunctionInfo &Info) const {
2397 auto &ArgInfo = Info.getArgInfo();
2398 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2399
2400 // TODO: Unify handling with private memory pointers.
2401 if (UserSGPRInfo.hasDispatchPtr())
2402 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2403
2404 if (UserSGPRInfo.hasQueuePtr())
2405 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2406
2407 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2408 // constant offset from the kernarg segment.
2409 if (Info.hasImplicitArgPtr())
2410 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2411
2412 if (UserSGPRInfo.hasDispatchID())
2413 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2414
2415 // flat_scratch_init is not applicable for non-kernel functions.
2416
2417 if (Info.hasWorkGroupIDX())
2418 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2419
2420 if (Info.hasWorkGroupIDY())
2421 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2422
2423 if (Info.hasWorkGroupIDZ())
2424 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2425
2426 if (Info.hasLDSKernelId())
2427 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2428}
2429
2430// Allocate special inputs passed in user SGPRs.
2432 MachineFunction &MF,
2433 const SIRegisterInfo &TRI,
2434 SIMachineFunctionInfo &Info) const {
2435 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2436 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2437 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2438 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2439 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2440 }
2441
2442 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2443 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2444 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2445 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2446 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2447 }
2448
2449 if (UserSGPRInfo.hasDispatchPtr()) {
2450 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2451 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2452 CCInfo.AllocateReg(DispatchPtrReg);
2453 }
2454
2455 if (UserSGPRInfo.hasQueuePtr()) {
2456 Register QueuePtrReg = Info.addQueuePtr(TRI);
2457 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(QueuePtrReg);
2459 }
2460
2461 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2463 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2464 CCInfo.AllocateReg(InputPtrReg);
2465
2466 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2467 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2468 }
2469
2470 if (UserSGPRInfo.hasDispatchID()) {
2471 Register DispatchIDReg = Info.addDispatchID(TRI);
2472 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2473 CCInfo.AllocateReg(DispatchIDReg);
2474 }
2475
2476 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2477 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2478 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2479 CCInfo.AllocateReg(FlatScratchInitReg);
2480 }
2481
2482 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2483 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2484 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2485 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2486 }
2487
2488 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2489 // these from the dispatch pointer.
2490}
2491
2492// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2493// sequential starting from the first argument.
2495 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2497 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2498 Function &F = MF.getFunction();
2499 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2500 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2501 bool InPreloadSequence = true;
2502 unsigned InIdx = 0;
2503 bool AlignedForImplictArgs = false;
2504 unsigned ImplicitArgOffset = 0;
2505 for (auto &Arg : F.args()) {
2506 if (!InPreloadSequence || !Arg.hasInRegAttr())
2507 break;
2508
2509 unsigned ArgIdx = Arg.getArgNo();
2510 // Don't preload non-original args or parts not in the current preload
2511 // sequence.
2512 if (InIdx < Ins.size() &&
2513 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2514 break;
2515
2516 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2517 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2518 InIdx++) {
2519 assert(ArgLocs[ArgIdx].isMemLoc());
2520 auto &ArgLoc = ArgLocs[InIdx];
2521 const Align KernelArgBaseAlign = Align(16);
2522 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2523 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2524 unsigned NumAllocSGPRs =
2525 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2526
2527 // Fix alignment for hidden arguments.
2528 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2529 if (!AlignedForImplictArgs) {
2530 ImplicitArgOffset =
2531 alignTo(LastExplicitArgOffset,
2532 Subtarget->getAlignmentForImplicitArgPtr()) -
2533 LastExplicitArgOffset;
2534 AlignedForImplictArgs = true;
2535 }
2536 ArgOffset += ImplicitArgOffset;
2537 }
2538
2539 // Arg is preloaded into the previous SGPR.
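      // For example, when two consecutive i16 arguments land in the same
      // dword, the second one reuses the SGPR recorded for the first rather
      // than consuming another user SGPR.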
2540 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2541 assert(InIdx >= 1 && "No previous SGPR");
2542 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2543 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2544 continue;
2545 }
2546
2547 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2548 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2549 // Check for free user SGPRs for preloading.
2550 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2551 InPreloadSequence = false;
2552 break;
2553 }
2554
2555 // Preload this argument.
2556 const TargetRegisterClass *RC =
2557 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2558 SmallVectorImpl<MCRegister> *PreloadRegs =
2559 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2560
2561 if (PreloadRegs->size() > 1)
2562 RC = &AMDGPU::SGPR_32RegClass;
2563 for (auto &Reg : *PreloadRegs) {
2564 assert(Reg);
2565 MF.addLiveIn(Reg, RC);
2566 CCInfo.AllocateReg(Reg);
2567 }
2568
2569 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2570 }
2571 }
2572}
2573
2575 const SIRegisterInfo &TRI,
2576 SIMachineFunctionInfo &Info) const {
2577 // Always allocate this last since it is a synthetic preload.
2578 if (Info.hasLDSKernelId()) {
2579 Register Reg = Info.addLDSKernelId();
2580 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2581 CCInfo.AllocateReg(Reg);
2582 }
2583}
2584
2585// Allocate special input registers that are initialized per-wave.
2588 CallingConv::ID CallConv,
2589 bool IsShader) const {
2590 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2591 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2592 // Note: user SGPRs are handled by the front-end for graphics shaders
2593 // Pad up the used user SGPRs with dead inputs.
2594
2595 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2596 // before enabling architected SGPRs for workgroup IDs.
2597 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2598
2599 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2600 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2601 // rely on it to reach 16 since if we end up having no stack usage, it will
2602 // not really be added.
2603 unsigned NumRequiredSystemSGPRs =
2604 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2605 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2606 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2607 Register Reg = Info.addReservedUserSGPR();
2608 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2609 CCInfo.AllocateReg(Reg);
2610 }
2611 }
2612
2613 if (!HasArchitectedSGPRs) {
2614 if (Info.hasWorkGroupIDX()) {
2615 Register Reg = Info.addWorkGroupIDX();
2616 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2617 CCInfo.AllocateReg(Reg);
2618 }
2619
2620 if (Info.hasWorkGroupIDY()) {
2621 Register Reg = Info.addWorkGroupIDY();
2622 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(Reg);
2624 }
2625
2626 if (Info.hasWorkGroupIDZ()) {
2627 Register Reg = Info.addWorkGroupIDZ();
2628 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2629 CCInfo.AllocateReg(Reg);
2630 }
2631 }
2632
2633 if (Info.hasWorkGroupInfo()) {
2634 Register Reg = Info.addWorkGroupInfo();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasPrivateSegmentWaveByteOffset()) {
2640 // Scratch wave offset passed in system SGPR.
2641 unsigned PrivateSegmentWaveByteOffsetReg;
2642
2643 if (IsShader) {
2644 PrivateSegmentWaveByteOffsetReg =
2645 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2646
2647 // This is true if the scratch wave byte offset doesn't have a fixed
2648 // location.
2649 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2650 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2651 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2652 }
2653 } else
2654 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2655
2656 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2657 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2658 }
2659
2660 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2661 Info.getNumPreloadedSGPRs() >= 16);
2662}
2663
2665 MachineFunction &MF,
2666 const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) {
2668 // Now that we've figured out where the scratch register inputs are, see if
2669  // we should reserve the arguments and use them directly.
2670 MachineFrameInfo &MFI = MF.getFrameInfo();
2671 bool HasStackObjects = MFI.hasStackObjects();
2672 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2673
2674 // Record that we know we have non-spill stack objects so we don't need to
2675 // check all stack objects later.
2676 if (HasStackObjects)
2677 Info.setHasNonSpillStackObjects(true);
2678
2679 // Everything live out of a block is spilled with fast regalloc, so it's
2680 // almost certain that spilling will be required.
2681 if (TM.getOptLevel() == CodeGenOptLevel::None)
2682 HasStackObjects = true;
2683
2684 // For now assume stack access is needed in any callee functions, so we need
2685 // the scratch registers to pass in.
2686 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2687
2688 if (!ST.enableFlatScratch()) {
2689 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2690 // If we have stack objects, we unquestionably need the private buffer
2691 // resource. For the Code Object V2 ABI, this will be the first 4 user
2692 // SGPR inputs. We can reserve those and use them directly.
2693
2694 Register PrivateSegmentBufferReg =
2696 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2697 } else {
2698 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2699 // We tentatively reserve the last registers (skipping the last registers
2700 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2701 // we'll replace these with the ones immediately after those which were
2702 // really allocated. In the prologue copies will be inserted from the
2703 // argument to these reserved registers.
2704
2705 // Without HSA, relocations are used for the scratch pointer and the
2706 // buffer resource setup is always inserted in the prologue. Scratch wave
2707 // offset is still in an input SGPR.
2708 Info.setScratchRSrcReg(ReservedBufferReg);
2709 }
2710 }
2711
2713
2714 // For entry functions we have to set up the stack pointer if we use it,
2715 // whereas non-entry functions get this "for free". This means there is no
2716 // intrinsic advantage to using S32 over S34 in cases where we do not have
2717 // calls but do need a frame pointer (i.e. if we are requested to have one
2718 // because frame pointer elimination is disabled). To keep things simple we
2719 // only ever use S32 as the call ABI stack pointer, and so using it does not
2720 // imply we need a separate frame pointer.
2721 //
2722 // Try to use s32 as the SP, but move it if it would interfere with input
2723 // arguments. This won't work with calls though.
2724 //
2725 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2726 // registers.
2727 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2728 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2729 } else {
2731
2732 if (MFI.hasCalls())
2733 report_fatal_error("call in graphics shader with too many input SGPRs");
2734
2735 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2736 if (!MRI.isLiveIn(Reg)) {
2737 Info.setStackPtrOffsetReg(Reg);
2738 break;
2739 }
2740 }
2741
2742 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2743 report_fatal_error("failed to find register for SP");
2744 }
2745
2746 // hasFP should be accurate for entry functions even before the frame is
2747 // finalized, because it does not rely on the known stack size, only
2748 // properties like whether variable sized objects are present.
2749 if (ST.getFrameLowering()->hasFP(MF)) {
2750 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2751 }
2752}
2753
2756 return !Info->isEntryFunction();
2757}
2758
2760
2762 MachineBasicBlock *Entry,
2763 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2765
2766 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2767 if (!IStart)
2768 return;
2769
2770 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2771 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2772 MachineBasicBlock::iterator MBBI = Entry->begin();
2773 for (const MCPhysReg *I = IStart; *I; ++I) {
2774 const TargetRegisterClass *RC = nullptr;
2775 if (AMDGPU::SReg_64RegClass.contains(*I))
2776 RC = &AMDGPU::SGPR_64RegClass;
2777 else if (AMDGPU::SReg_32RegClass.contains(*I))
2778 RC = &AMDGPU::SGPR_32RegClass;
2779 else
2780 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2781
2782 Register NewVR = MRI->createVirtualRegister(RC);
2783 // Create copy from CSR to a virtual register.
2784 Entry->addLiveIn(*I);
2785 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2786 .addReg(*I);
2787
2788 // Insert the copy-back instructions right before the terminator.
2789 for (auto *Exit : Exits)
2790 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2791 TII->get(TargetOpcode::COPY), *I)
2792 .addReg(NewVR);
2793 }
2794}
2795
2797 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2798 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2799 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2801
2803 const Function &Fn = MF.getFunction();
2806
2807 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2808 DiagnosticInfoUnsupported NoGraphicsHSA(
2809 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2810 DAG.getContext()->diagnose(NoGraphicsHSA);
2811 return DAG.getEntryNode();
2812 }
2813
2816 BitVector Skipped(Ins.size());
2817 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2818 *DAG.getContext());
2819
2820 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2821 bool IsKernel = AMDGPU::isKernel(CallConv);
2822 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2823
2824 if (IsGraphics) {
2825 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2826 assert(!UserSGPRInfo.hasDispatchPtr() &&
2827 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2828 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2829 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2830 (void)UserSGPRInfo;
2831 if (!Subtarget->enableFlatScratch())
2832 assert(!UserSGPRInfo.hasFlatScratchInit());
2833 if ((CallConv != CallingConv::AMDGPU_CS &&
2834 CallConv != CallingConv::AMDGPU_Gfx) ||
2835 !Subtarget->hasArchitectedSGPRs())
2836 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2837 !Info->hasWorkGroupIDZ());
2838 }
2839
2840 if (CallConv == CallingConv::AMDGPU_PS) {
2841 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2842
2843 // At least one interpolation mode must be enabled or else the GPU will
2844 // hang.
2845 //
2846 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2847 // set PSInputAddr, the user wants to enable some bits after the compilation
2848 // based on run-time states. Since we can't know what the final PSInputEna
2849 // will look like, so we shouldn't do anything here and the user should take
2850    // will look like, we shouldn't do anything here, and the user should take
2851 //
2852 // Otherwise, the following restrictions apply:
2853 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2854 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2855 // enabled too.
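    // For example, a shader whose PSInputAddr only has POS_W_FLOAT (bit 11)
    // set satisfies (PSInputAddr & 0x7F) == 0, so input 0 is force-enabled
    // below and VGPR0/VGPR1 are reserved for it.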
2856 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2857 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2858 CCInfo.AllocateReg(AMDGPU::VGPR0);
2859 CCInfo.AllocateReg(AMDGPU::VGPR1);
2860 Info->markPSInputAllocated(0);
2861 Info->markPSInputEnabled(0);
2862 }
2863 if (Subtarget->isAmdPalOS()) {
2864 // For isAmdPalOS, the user does not enable some bits after compilation
2865 // based on run-time states; the register values being generated here are
2866 // the final ones set in hardware. Therefore we need to apply the
2867 // workaround to PSInputAddr and PSInputEnable together. (The case where
2868 // a bit is set in PSInputAddr but not PSInputEnable is where the
2869 // frontend set up an input arg for a particular interpolation mode, but
2870 // nothing uses that input arg. Really we should have an earlier pass
2871 // that removes such an arg.)
2872 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2873 if ((PsInputBits & 0x7F) == 0 ||
2874 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2875 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2876 }
2877 } else if (IsKernel) {
2878 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2879 } else {
2880 Splits.append(Ins.begin(), Ins.end());
2881 }
2882
2883 if (IsKernel)
2884 analyzeFormalArgumentsCompute(CCInfo, Ins);
2885
2886 if (IsEntryFunc) {
2887 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2888 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2889 if (IsKernel && Subtarget->hasKernargPreload())
2890 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2891
2892 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2893 } else if (!IsGraphics) {
2894 // For the fixed ABI, pass workitem IDs in the last argument register.
2895 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2896
2897 // FIXME: Sink this into allocateSpecialInputSGPRs
2898 if (!Subtarget->enableFlatScratch())
2899 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2900
2901 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2902 }
2903
2904 if (!IsKernel) {
2905 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2906 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2907 }
2908
2910
2911 // FIXME: This is the minimum kernel argument alignment. We should improve
2912 // this to the maximum alignment of the arguments.
2913 //
2914 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2915 // kern arg offset.
2916 const Align KernelArgBaseAlign = Align(16);
2917
2918 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2919 const ISD::InputArg &Arg = Ins[i];
2920 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2921 InVals.push_back(DAG.getUNDEF(Arg.VT));
2922 continue;
2923 }
2924
2925 CCValAssign &VA = ArgLocs[ArgIdx++];
2926 MVT VT = VA.getLocVT();
2927
2928 if (IsEntryFunc && VA.isMemLoc()) {
2929 VT = Ins[i].VT;
2930 EVT MemVT = VA.getLocVT();
2931
2932 const uint64_t Offset = VA.getLocMemOffset();
2933 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2934
2935 if (Arg.Flags.isByRef()) {
2936 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2937
2938 const GCNTargetMachine &TM =
2939 static_cast<const GCNTargetMachine &>(getTargetMachine());
2940 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2941 Arg.Flags.getPointerAddrSpace())) {
2944 }
2945
2946 InVals.push_back(Ptr);
2947 continue;
2948 }
2949
2950 SDValue NewArg;
2951 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2952 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2953 // In this case the argument is packed into the previous preload SGPR.
2954 int64_t AlignDownOffset = alignDown(Offset, 4);
2955 int64_t OffsetDiff = Offset - AlignDownOffset;
2956 EVT IntVT = MemVT.changeTypeToInteger();
2957
2961 Register Reg =
2962 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2963
2964 assert(Reg);
2965 Register VReg = MRI.getLiveInVirtReg(Reg);
2966 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2967
2968 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2969 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2970
2971 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2972 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2973 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2974 Ins[i].Flags.isSExt(), &Ins[i]);
2975
2976 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2977 } else {
2981 const SmallVectorImpl<MCRegister> &PreloadRegs =
2982 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2983
2984 SDValue Copy;
2985 if (PreloadRegs.size() == 1) {
2986 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2987 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2988 NewArg = DAG.getCopyFromReg(
2989 Chain, DL, VReg,
2991 TRI->getRegSizeInBits(*RC)));
2992
2993 } else {
2994 // If the kernarg alignment does not match the alignment of the SGPR
2995 // tuple RC that can accommodate this argument, it will be built up
2996          // via copies from the individual SGPRs that the argument was
2997 // preloaded to.
2999 for (auto Reg : PreloadRegs) {
3000 Register VReg = MRI.getLiveInVirtReg(Reg);
3001 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3002 Elts.push_back(Copy);
3003 }
3004 NewArg =
3005 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3006 PreloadRegs.size()),
3007 DL, Elts);
3008 }
3009
3010 // If the argument was preloaded to multiple consecutive 32-bit
3011 // registers because of misalignment between addressable SGPR tuples
3012          // and the argument size, we can still assume, because of kernarg
3013          // segment alignment restrictions, that NewArg's size is the same as
3014 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3015 // truncate since we cannot preload to less than a single SGPR and the
3016 // MemVT may be smaller.
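          // For example, an i16 kernarg still occupies a full SGPR, so NewArg
          // arrives here as a 32-bit copy and is truncated to i16 before the
          // bitcast and conversion below.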
3017 EVT MemVTInt =
3019 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3020 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3021
3022 NewArg = DAG.getBitcast(MemVT, NewArg);
3023 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3024 Ins[i].Flags.isSExt(), &Ins[i]);
3025 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3026 }
3027 } else {
3028 // Hidden arguments that are in the kernel signature must be preloaded
3029 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3030 // the argument list and is not preloaded.
3031 if (Arg.isOrigArg()) {
3032 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3033 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3034 DiagnosticInfoUnsupported NonPreloadHiddenArg(
3035 *OrigArg->getParent(),
3036 "hidden argument in kernel signature was not preloaded",
3037 DL.getDebugLoc());
3038 DAG.getContext()->diagnose(NonPreloadHiddenArg);
3039 }
3040 }
3041
3042 NewArg =
3043 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3044 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3045 }
3046 Chains.push_back(NewArg.getValue(1));
3047
3048 auto *ParamTy =
3049 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3051 ParamTy &&
3052 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3053 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3054 // On SI local pointers are just offsets into LDS, so they are always
3055 // less than 16-bits. On CI and newer they could potentially be
3056 // real pointers, so we can't guarantee their size.
3057 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3058 DAG.getValueType(MVT::i16));
3059 }
3060
3061 InVals.push_back(NewArg);
3062 continue;
3063 }
3064 if (!IsEntryFunc && VA.isMemLoc()) {
3065 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3066 InVals.push_back(Val);
3067 if (!Arg.Flags.isByVal())
3068 Chains.push_back(Val.getValue(1));
3069 continue;
3070 }
3071
3072 assert(VA.isRegLoc() && "Parameter must be in a register!");
3073
3074 Register Reg = VA.getLocReg();
3075 const TargetRegisterClass *RC = nullptr;
3076 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3077 RC = &AMDGPU::VGPR_32RegClass;
3078 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3079 RC = &AMDGPU::SGPR_32RegClass;
3080 else
3081 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3082 EVT ValVT = VA.getValVT();
3083
3084 Reg = MF.addLiveIn(Reg, RC);
3085 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3086
3087 if (Arg.Flags.isSRet()) {
3088 // The return object should be reasonably addressable.
3089
3090      // FIXME: This helps when the return is a real sret. If it is an
3091      // automatically inserted sret (i.e. CanLowerReturn returns false), an
3092 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3093 unsigned NumBits =
3095 Val = DAG.getNode(
3096 ISD::AssertZext, DL, VT, Val,
3097 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3098 }
3099
3100 // If this is an 8 or 16-bit value, it is really passed promoted
3101 // to 32 bits. Insert an assert[sz]ext to capture this, then
3102 // truncate to the right size.
3103 switch (VA.getLocInfo()) {
3104 case CCValAssign::Full:
3105 break;
3106 case CCValAssign::BCvt:
3107 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3108 break;
3109 case CCValAssign::SExt:
3110 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3111 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3112 break;
3113 case CCValAssign::ZExt:
3114 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3115 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3116 break;
3117 case CCValAssign::AExt:
3118 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3119 break;
3120 default:
3121 llvm_unreachable("Unknown loc info!");
3122 }
3123
3124 InVals.push_back(Val);
3125 }
3126
3127 // Start adding system SGPRs.
3128 if (IsEntryFunc)
3129 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3130
3131 // DAG.getPass() returns nullptr when using new pass manager.
3132 // TODO: Use DAG.getMFAM() to access analysis result.
3133 if (DAG.getPass()) {
3134 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3135 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3136 }
3137
3138 unsigned StackArgSize = CCInfo.getStackSize();
3139 Info->setBytesInStackArgArea(StackArgSize);
3140
3141 return Chains.empty() ? Chain
3142 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3143}
3144
3145// TODO: If return values can't fit in registers, we should return as many as
3146// possible in registers before passing on stack.
3148 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3149 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3150 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3151 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3152 // for shaders. Vector types should be explicitly handled by CC.
3153 if (AMDGPU::isEntryFunctionCC(CallConv))
3154 return true;
3155
3157 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3158 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3159 return false;
3160
3161 // We must use the stack if return would require unavailable registers.
3162 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3163 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3164 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3165 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3166 return false;
3167
3168 return true;
3169}
3170
3171SDValue
3173 bool isVarArg,
3175 const SmallVectorImpl<SDValue> &OutVals,
3176 const SDLoc &DL, SelectionDAG &DAG) const {
3179
3180 if (AMDGPU::isKernel(CallConv)) {
3181 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3182 OutVals, DL, DAG);
3183 }
3184
3185 bool IsShader = AMDGPU::isShader(CallConv);
3186
3187 Info->setIfReturnsVoid(Outs.empty());
3188 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3189
3190 // CCValAssign - represent the assignment of the return value to a location.
3193
3194 // CCState - Info about the registers and stack slots.
3195 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3196 *DAG.getContext());
3197
3198 // Analyze outgoing return values.
3199 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3200
3201 SDValue Glue;
3203 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3204
3205 // Copy the result values into the output registers.
3206 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3207 ++I, ++RealRVLocIdx) {
3208 CCValAssign &VA = RVLocs[I];
3209 assert(VA.isRegLoc() && "Can only return in registers!");
3210 // TODO: Partially return in registers if return values don't fit.
3211 SDValue Arg = OutVals[RealRVLocIdx];
3212
3213 // Copied from other backends.
3214 switch (VA.getLocInfo()) {
3215 case CCValAssign::Full:
3216 break;
3217 case CCValAssign::BCvt:
3218 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3219 break;
3220 case CCValAssign::SExt:
3221 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3222 break;
3223 case CCValAssign::ZExt:
3224 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3225 break;
3226 case CCValAssign::AExt:
3227 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3228 break;
3229 default:
3230 llvm_unreachable("Unknown loc info!");
3231 }
3232
3233 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3234 Glue = Chain.getValue(1);
3235 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3236 }
3237
3238 // FIXME: Does sret work properly?
3239 if (!Info->isEntryFunction()) {
3240 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3241 const MCPhysReg *I =
3242 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3243 if (I) {
3244 for (; *I; ++I) {
3245 if (AMDGPU::SReg_64RegClass.contains(*I))
3246 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3247 else if (AMDGPU::SReg_32RegClass.contains(*I))
3248 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3249 else
3250 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3251 }
3252 }
3253 }
3254
3255 // Update chain and glue.
3256 RetOps[0] = Chain;
3257 if (Glue.getNode())
3258 RetOps.push_back(Glue);
3259
3260 unsigned Opc = AMDGPUISD::ENDPGM;
3261 if (!IsWaveEnd)
3263 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3264}
3265
3267 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3268 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3269 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3270 SDValue ThisVal) const {
3271 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3272
3273 // Assign locations to each value returned by this call.
3275 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3276 *DAG.getContext());
3277 CCInfo.AnalyzeCallResult(Ins, RetCC);
3278
3279 // Copy all of the result registers out of their specified physreg.
3280 for (CCValAssign VA : RVLocs) {
3281 SDValue Val;
3282
3283 if (VA.isRegLoc()) {
3284 Val =
3285 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3286 Chain = Val.getValue(1);
3287 InGlue = Val.getValue(2);
3288 } else if (VA.isMemLoc()) {
3289 report_fatal_error("TODO: return values in memory");
3290 } else
3291 llvm_unreachable("unknown argument location type");
3292
3293 switch (VA.getLocInfo()) {
3294 case CCValAssign::Full:
3295 break;
3296 case CCValAssign::BCvt:
3297 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3298 break;
3299 case CCValAssign::ZExt:
3300 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3301 DAG.getValueType(VA.getValVT()));
3302 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3303 break;
3304 case CCValAssign::SExt:
3305 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3306 DAG.getValueType(VA.getValVT()));
3307 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3308 break;
3309 case CCValAssign::AExt:
3310 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3311 break;
3312 default:
3313 llvm_unreachable("Unknown loc info!");
3314 }
3315
3316 InVals.push_back(Val);
3317 }
3318
3319 return Chain;
3320}
3321
3322// Add code to pass the special inputs that are required depending on the
3323// features used, separate from the explicit user arguments present in the IR.
3325 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3326 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3327 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3328 // If we don't have a call site, this was a call inserted by
3329 // legalization. These can never use special inputs.
3330 if (!CLI.CB)
3331 return;
3332
3333 SelectionDAG &DAG = CLI.DAG;
3334 const SDLoc &DL = CLI.DL;
3335 const Function &F = DAG.getMachineFunction().getFunction();
3336
3337 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3338 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3339
3340 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3342 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3343 // DAG.getPass() returns nullptr when using new pass manager.
3344 // TODO: Use DAG.getMFAM() to access analysis result.
3345 if (DAG.getPass()) {
3346 auto &ArgUsageInfo =
3348 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3349 }
3350 }
3351
3352 // TODO: Unify with private memory register handling. This is complicated by
3353 // the fact that at least in kernels, the input argument is not necessarily
3354 // in the same location as the input.
3355 // clang-format off
3356 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3358 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3359 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3360 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3361 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3362 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3363 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3364 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3365 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3366 };
3367 // clang-format on
3368
3369 for (auto [InputID, Attr] : ImplicitAttrs) {
3370 // If the callee does not use the attribute value, skip copying the value.
3371 if (CLI.CB->hasFnAttr(Attr))
3372 continue;
3373
3374 const auto [OutgoingArg, ArgRC, ArgTy] =
3375 CalleeArgInfo->getPreloadedValue(InputID);
3376 if (!OutgoingArg)
3377 continue;
3378
3379 const auto [IncomingArg, IncomingArgRC, Ty] =
3380 CallerArgInfo.getPreloadedValue(InputID);
3381 assert(IncomingArgRC == ArgRC);
3382
3383 // All special arguments are ints for now.
3384 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3385 SDValue InputReg;
3386
3387 if (IncomingArg) {
3388 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3389 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3390 // The implicit arg ptr is special because it doesn't have a corresponding
3391 // input for kernels, and is computed from the kernarg segment pointer.
3392 InputReg = getImplicitArgPtr(DAG, DL);
3393 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3394 std::optional<uint32_t> Id =
3396 if (Id.has_value()) {
3397 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3398 } else {
3399 InputReg = DAG.getUNDEF(ArgVT);
3400 }
3401 } else {
3402      // We may have proven the input wasn't needed, although the ABI still
3403      // requires it. We just need to allocate the register appropriately.
3404 InputReg = DAG.getUNDEF(ArgVT);
3405 }
3406
3407 if (OutgoingArg->isRegister()) {
3408 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3409 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3410 report_fatal_error("failed to allocate implicit input argument");
3411 } else {
3412 unsigned SpecialArgOffset =
3413 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3414 SDValue ArgStore =
3415 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3416 MemOpChains.push_back(ArgStore);
3417 }
3418 }
3419
3420  // Pack workitem IDs into a single register, or pass them as-is if already
3421  // packed.
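  // The packed layout matches the entry-point convention: workitem X in bits
  // [9:0], Y in bits [19:10] and Z in bits [29:20] of a single 32-bit value,
  // which is what the shifts by 10 and 20 below reconstruct.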
3422
3423 auto [OutgoingArg, ArgRC, Ty] =
3425 if (!OutgoingArg)
3426 std::tie(OutgoingArg, ArgRC, Ty) =
3428 if (!OutgoingArg)
3429 std::tie(OutgoingArg, ArgRC, Ty) =
3431 if (!OutgoingArg)
3432 return;
3433
3434 const ArgDescriptor *IncomingArgX = std::get<0>(
3436 const ArgDescriptor *IncomingArgY = std::get<0>(
3438 const ArgDescriptor *IncomingArgZ = std::get<0>(
3440
3441 SDValue InputReg;
3442 SDLoc SL;
3443
3444 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3445 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3446 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3447
3448 // If incoming ids are not packed we need to pack them.
3449 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3450 NeedWorkItemIDX) {
3451 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3452 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3453 } else {
3454 InputReg = DAG.getConstant(0, DL, MVT::i32);
3455 }
3456 }
3457
3458 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3459 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3460 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3461 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3462 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3463 InputReg = InputReg.getNode()
3464 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3465 : Y;
3466 }
3467
3468 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3469 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3470 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3471 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3472 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3473 InputReg = InputReg.getNode()
3474 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3475 : Z;
3476 }
3477
3478 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3479 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3480 // We're in a situation where the outgoing function requires the workitem
3481      // ID, but the calling function does not have it (e.g. a graphics function
3482 // calling a C calling convention function). This is illegal, but we need
3483 // to produce something.
3484 InputReg = DAG.getUNDEF(MVT::i32);
3485 } else {
3486      // Workitem IDs are already packed; any of the present incoming arguments
3487 // will carry all required fields.
3488 ArgDescriptor IncomingArg =
3489 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3490 : IncomingArgY ? *IncomingArgY
3491 : *IncomingArgZ,
3492 ~0u);
3493 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3494 }
3495 }
3496
3497 if (OutgoingArg->isRegister()) {
3498 if (InputReg)
3499 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3500
3501 CCInfo.AllocateReg(OutgoingArg->getRegister());
3502 } else {
3503 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3504 if (InputReg) {
3505 SDValue ArgStore =
3506 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3507 MemOpChains.push_back(ArgStore);
3508 }
3509 }
3510}
3511
3513 return CC == CallingConv::Fast;
3514}
3515
3516/// Return true if we might ever do TCO for calls with this calling convention.
3518 switch (CC) {
3519 case CallingConv::C:
3521 return true;
3522 default:
3523 return canGuaranteeTCO(CC);
3524 }
3525}
3526
3528 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3530 const SmallVectorImpl<SDValue> &OutVals,
3531 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3532 if (AMDGPU::isChainCC(CalleeCC))
3533 return true;
3534
3535 if (!mayTailCallThisCC(CalleeCC))
3536 return false;
3537
3538 // For a divergent call target, we need to do a waterfall loop over the
3539 // possible callees which precludes us from using a simple jump.
3540 if (Callee->isDivergent())
3541 return false;
3542
3544 const Function &CallerF = MF.getFunction();
3545 CallingConv::ID CallerCC = CallerF.getCallingConv();
3547 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3548
3549 // Kernels aren't callable, and don't have a live in return address so it
3550 // doesn't make sense to do a tail call with entry functions.
3551 if (!CallerPreserved)
3552 return false;
3553
3554 bool CCMatch = CallerCC == CalleeCC;
3555
3557 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3558 return true;
3559 return false;
3560 }
3561
3562 // TODO: Can we handle var args?
3563 if (IsVarArg)
3564 return false;
3565
3566 for (const Argument &Arg : CallerF.args()) {
3567 if (Arg.hasByValAttr())
3568 return false;
3569 }
3570
3571 LLVMContext &Ctx = *DAG.getContext();
3572
3573 // Check that the call results are passed in the same way.
3574 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3575 CCAssignFnForCall(CalleeCC, IsVarArg),
3576 CCAssignFnForCall(CallerCC, IsVarArg)))
3577 return false;
3578
3579 // The callee has to preserve all registers the caller needs to preserve.
3580 if (!CCMatch) {
3581 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3582 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3583 return false;
3584 }
3585
3586 // Nothing more to check if the callee is taking no arguments.
3587 if (Outs.empty())
3588 return true;
3589
3591 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3592
3593 // FIXME: We are not allocating special input registers, so we will be
3594 // deciding based on incorrect register assignments.
3595 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3596
3597 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3598 // If the stack arguments for this call do not fit into our own save area then
3599 // the call cannot be made tail.
3600 // TODO: Is this really necessary?
3601 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3602 return false;
3603
3604 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3605 // FIXME: What about inreg arguments that end up passed in memory?
3606 if (!CCVA.isRegLoc())
3607 continue;
3608
3609 // If we are passing an argument in an SGPR, and the value is divergent,
3610 // this call requires a waterfall loop.
3611 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3612 LLVM_DEBUG(
3613 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3614 << printReg(CCVA.getLocReg(), TRI) << '\n');
3615 return false;
3616 }
3617 }
3618
3619 const MachineRegisterInfo &MRI = MF.getRegInfo();
3620 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3621}
3622
3624 if (!CI->isTailCall())
3625 return false;
3626
3627 const Function *ParentFn = CI->getParent()->getParent();
3629 return false;
3630 return true;
3631}
3632
3633// The wave scratch offset register is used as the global base pointer.
3635 SmallVectorImpl<SDValue> &InVals) const {
3636 CallingConv::ID CallConv = CLI.CallConv;
3637 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3638
3639 SelectionDAG &DAG = CLI.DAG;
3640
3641 TargetLowering::ArgListEntry RequestedExec;
3642 if (IsChainCallConv) {
3643 // The last argument should be the value that we need to put in EXEC.
3644 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3645 // don't treat it like the rest of the arguments.
3646 RequestedExec = CLI.Args.back();
3647 assert(RequestedExec.Node && "No node for EXEC");
3648
3649 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3650 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3651
3652 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3653 CLI.Outs.pop_back();
3654 CLI.OutVals.pop_back();
3655
3656 if (RequestedExec.Ty->isIntegerTy(64)) {
3657 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3658 CLI.Outs.pop_back();
3659 CLI.OutVals.pop_back();
3660 }
3661
3662 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3663 "Haven't popped all the pieces of the EXEC mask");
3664 }
3665
3666   const SDLoc &DL = CLI.DL;
3667   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3668   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3669   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3670   SDValue Chain = CLI.Chain;
3671   SDValue Callee = CLI.Callee;
3672   bool &IsTailCall = CLI.IsTailCall;
3673   bool IsVarArg = CLI.IsVarArg;
3674   bool IsSibCall = false;
3675   MachineFunction &MF = DAG.getMachineFunction();
3676
3677 if (Callee.isUndef() || isNullConstant(Callee)) {
3678 if (!CLI.IsTailCall) {
3679 for (ISD::InputArg &Arg : CLI.Ins)
3680 InVals.push_back(DAG.getUNDEF(Arg.VT));
3681 }
3682
3683 return Chain;
3684 }
3685
3686 if (IsVarArg) {
3687 return lowerUnhandledCall(CLI, InVals,
3688 "unsupported call to variadic function ");
3689 }
3690
3691 if (!CLI.CB)
3692 report_fatal_error("unsupported libcall legalization");
3693
3694 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3695 return lowerUnhandledCall(CLI, InVals,
3696 "unsupported required tail call to function ");
3697 }
3698
3699 if (IsTailCall) {
3700 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3701 Outs, OutVals, Ins, DAG);
3702 if (!IsTailCall &&
3703 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3704 report_fatal_error("failed to perform tail call elimination on a call "
3705 "site marked musttail or on llvm.amdgcn.cs.chain");
3706 }
3707
3708 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3709
3710 // A sibling call is one where we're under the usual C ABI and not planning
3711 // to change that but can still do a tail call:
3712 if (!TailCallOpt && IsTailCall)
3713 IsSibCall = true;
3714
3715 if (IsTailCall)
3716 ++NumTailCalls;
3717 }
3718
3719   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3720   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3721   SmallVector<SDValue, 8> MemOpChains;
3722
3723 // Analyze operands of the call, assigning locations to each operand.
3724   SmallVector<CCValAssign, 16> ArgLocs;
3725   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3726 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3727
3728 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3729 // With a fixed ABI, allocate fixed registers before user arguments.
3730 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3731 }
3732
3733 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3734
3735 // Get a count of how many bytes are to be pushed on the stack.
3736 unsigned NumBytes = CCInfo.getStackSize();
3737
3738 if (IsSibCall) {
3739 // Since we're not changing the ABI to make this a tail call, the memory
3740 // operands are already available in the caller's incoming argument space.
3741 NumBytes = 0;
3742 }
3743
3744 // FPDiff is the byte offset of the call's argument area from the callee's.
3745 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3746 // by this amount for a tail call. In a sibling call it must be 0 because the
3747 // caller will deallocate the entire stack and the callee still expects its
3748 // arguments to begin at SP+0. Completely unused for non-tail calls.
3749 int32_t FPDiff = 0;
3750 MachineFrameInfo &MFI = MF.getFrameInfo();
3751 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3752
3753 // Adjust the stack pointer for the new arguments...
3754 // These operations are automatically eliminated by the prolog/epilog pass
3755 if (!IsSibCall)
3756 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3757
3758 if (!IsSibCall || IsChainCallConv) {
3759 if (!Subtarget->enableFlatScratch()) {
3760 SmallVector<SDValue, 4> CopyFromChains;
3761
3762 // In the HSA case, this should be an identity copy.
3763 SDValue ScratchRSrcReg =
3764 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3765 RegsToPass.emplace_back(IsChainCallConv
3766 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3767 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3768 ScratchRSrcReg);
3769 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3770 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3771 }
3772 }
3773
3774 const unsigned NumSpecialInputs = RegsToPass.size();
3775
3776 MVT PtrVT = MVT::i32;
3777
3778 // Walk the register/memloc assignments, inserting copies/loads.
3779 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3780 CCValAssign &VA = ArgLocs[i];
3781 SDValue Arg = OutVals[i];
3782
3783 // Promote the value if needed.
3784 switch (VA.getLocInfo()) {
3785 case CCValAssign::Full:
3786 break;
3787 case CCValAssign::BCvt:
3788 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3789 break;
3790 case CCValAssign::ZExt:
3791 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3792 break;
3793 case CCValAssign::SExt:
3794 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3795 break;
3796 case CCValAssign::AExt:
3797 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3798 break;
3799 case CCValAssign::FPExt:
3800 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3801 break;
3802 default:
3803 llvm_unreachable("Unknown loc info!");
3804 }
3805
3806 if (VA.isRegLoc()) {
3807 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3808 } else {
3809 assert(VA.isMemLoc());
3810
3811 SDValue DstAddr;
3812 MachinePointerInfo DstInfo;
3813
3814 unsigned LocMemOffset = VA.getLocMemOffset();
3815 int32_t Offset = LocMemOffset;
3816
3817 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3818 MaybeAlign Alignment;
3819
3820 if (IsTailCall) {
3821 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3822 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3823 : VA.getValVT().getStoreSize();
3824
3825 // FIXME: We can have better than the minimum byval required alignment.
3826 Alignment =
3827 Flags.isByVal()
3828 ? Flags.getNonZeroByValAlign()
3829 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3830
3831 Offset = Offset + FPDiff;
3832 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3833
3834 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3835 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3836
3837 // Make sure any stack arguments overlapping with where we're storing
3838 // are loaded before this eventual operation. Otherwise they'll be
3839 // clobbered.
3840
3841 // FIXME: Why is this really necessary? This seems to just result in a
3842 // lot of code to copy the stack and write them back to the same
3843 // locations, which are supposed to be immutable?
3844 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3845 } else {
3846 // Stores to the argument stack area are relative to the stack pointer.
3847 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3848 MVT::i32);
3849 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3850 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3851 Alignment =
3852 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3853 }
3854
3855 if (Outs[i].Flags.isByVal()) {
3856 SDValue SizeNode =
3857 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3858 SDValue Cpy =
3859 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3860 Outs[i].Flags.getNonZeroByValAlign(),
3861 /*isVol = */ false, /*AlwaysInline = */ true,
3862                       /*CI=*/nullptr, std::nullopt, DstInfo,
3863                       MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3864 
3865 MemOpChains.push_back(Cpy);
3866 } else {
3867 SDValue Store =
3868 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3869 MemOpChains.push_back(Store);
3870 }
3871 }
3872 }
3873
3874 if (!MemOpChains.empty())
3875 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3876
3877 SDValue ReadFirstLaneID =
3878 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3879
3880 SDValue TokenGlue;
3881 if (CLI.ConvergenceControlToken) {
3882 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3883                             CLI.ConvergenceControlToken);
3884   }
3885
3886 // Build a sequence of copy-to-reg nodes chained together with token chain
3887 // and flag operands which copy the outgoing args into the appropriate regs.
3888 SDValue InGlue;
3889
3890 unsigned ArgIdx = 0;
3891 for (auto [Reg, Val] : RegsToPass) {
3892 if (ArgIdx++ >= NumSpecialInputs &&
3893 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3894 // For chain calls, the inreg arguments are required to be
3895       // uniform. Speculatively insert a readfirstlane in case we cannot prove
3896 // they are uniform.
3897 //
3898       // For other calls, if an inreg argument is known to be uniform,
3899 // speculatively insert a readfirstlane in case it is in a VGPR.
3900 //
3901       // FIXME: If the value is divergent, this really needs a waterfall loop;
3902       // for now we let that case continue to produce invalid code.
3903
3904 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3905 if (TokenGlue)
3906 ReadfirstlaneArgs.push_back(TokenGlue);
3907       Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3908                         ReadfirstlaneArgs);
3909 }
3910
3911 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3912 InGlue = Chain.getValue(1);
3913 }
3914
3915 // We don't usually want to end the call-sequence here because we would tidy
3916 // the frame up *after* the call, however in the ABI-changing tail-call case
3917 // we've carefully laid out the parameters so that when sp is reset they'll be
3918 // in the correct location.
3919 if (IsTailCall && !IsSibCall) {
3920 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3921 InGlue = Chain.getValue(1);
3922 }
3923
3924 std::vector<SDValue> Ops({Chain});
3925
3926 // Add a redundant copy of the callee global which will not be legalized, as
3927 // we need direct access to the callee later.
3928 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3929 const GlobalValue *GV = GSD->getGlobal();
3930 Ops.push_back(Callee);
3931 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3932 } else {
3933 if (IsTailCall) {
3934 // isEligibleForTailCallOptimization considered whether the call target is
3935 // divergent, but we may still end up with a uniform value in a VGPR.
3936 // Insert a readfirstlane just in case.
3937 SDValue ReadFirstLaneID =
3938 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3939
3940 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3941 if (TokenGlue)
3942 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3943 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3944 ReadfirstlaneArgs);
3945 }
3946
3947 Ops.push_back(Callee);
3948 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3949 }
3950
3951 if (IsTailCall) {
3952 // Each tail call may have to adjust the stack by a different amount, so
3953 // this information must travel along with the operation for eventual
3954 // consumption by emitEpilogue.
3955 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3956 }
3957
3958 if (IsChainCallConv)
3959 Ops.push_back(RequestedExec.Node);
3960
3961 // Add argument registers to the end of the list so that they are known live
3962 // into the call.
3963 for (auto &[Reg, Val] : RegsToPass)
3964 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3965
3966 // Add a register mask operand representing the call-preserved registers.
3967 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3968 assert(Mask && "Missing call preserved mask for calling convention");
3969 Ops.push_back(DAG.getRegisterMask(Mask));
3970
3971 if (SDValue Token = CLI.ConvergenceControlToken) {
3972     SmallVector<SDValue, 2> GlueOps;
3973     GlueOps.push_back(Token);
3974 if (InGlue)
3975 GlueOps.push_back(InGlue);
3976
3977 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3978 MVT::Glue, GlueOps),
3979 0);
3980 }
3981
3982 if (InGlue)
3983 Ops.push_back(InGlue);
3984
3985   // If we're doing a tail call, use a TC_RETURN here rather than an
3986 // actual call instruction.
3987 if (IsTailCall) {
3988 MFI.setHasTailCall();
3989 unsigned OPC = AMDGPUISD::TC_RETURN;
3990 switch (CallConv) {
3991     case CallingConv::AMDGPU_Gfx:
3992       OPC = AMDGPUISD::TC_RETURN_GFX;
3993       break;
3994     case CallingConv::AMDGPU_CS_Chain:
3995     case CallingConv::AMDGPU_CS_ChainPreserve:
3996       OPC = AMDGPUISD::TC_RETURN_CHAIN;
3997       break;
3998 }
3999
4000 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4001 }
4002
4003 // Returns a chain and a flag for retval copy to use.
4004 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4005 Chain = Call.getValue(0);
4006 InGlue = Call.getValue(1);
4007
4008 uint64_t CalleePopBytes = NumBytes;
4009 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4010 if (!Ins.empty())
4011 InGlue = Chain.getValue(1);
4012
4013 // Handle result values, copying them out of physregs into vregs that we
4014 // return.
4015 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4016 InVals, /*IsThisReturn=*/false, SDValue());
4017}
4018
4019// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020 // except for stack growth direction (default: downwards, AMDGPU: upwards) and
4021 // applying the wave size scale to the increment amount.
4022 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
4023                                                       SelectionDAG &DAG) const {
4024   const MachineFunction &MF = DAG.getMachineFunction();
4025   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4026 
4027 SDLoc dl(Op);
4028 EVT VT = Op.getValueType();
4029 SDValue Tmp1 = Op;
4030 SDValue Tmp2 = Op.getValue(1);
4031 SDValue Tmp3 = Op.getOperand(2);
4032 SDValue Chain = Tmp1.getOperand(0);
4033
4034 Register SPReg = Info->getStackPtrOffsetReg();
4035
4036 // Chain the dynamic stack allocation so that it doesn't modify the stack
4037 // pointer when other instructions are using the stack.
4038 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4039
4040 SDValue Size = Tmp2.getOperand(1);
4041 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4042 Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
4043
4044 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4045   assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4046          "Stack grows upwards for AMDGPU");
4047
4048 Chain = BaseAddr.getValue(1);
4049 Align StackAlign = TFL->getStackAlign();
4050 if (Alignment > StackAlign) {
4051 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4052 << Subtarget->getWavefrontSizeLog2();
4053 uint64_t StackAlignMask = ScaledAlignment - 1;
4054 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4055 DAG.getConstant(StackAlignMask, dl, VT));
4056 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4057 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4058 }
4059
4060 SDValue ScaledSize = DAG.getNode(
4061 ISD::SHL, dl, VT, Size,
4062 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4063
4064 SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4065
4066 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4067 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4068
4069 return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
4070}
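// Illustrative aside (not part of SIISelLowering.cpp): the ADD/AND sequence
// above is ordinary align-up arithmetic, except that the requested per-lane
// alignment is first scaled by the wavefront size, since the stack pointer
// handled here is a wave-level value while the alloca size and alignment are
// per lane. A scalar sketch, e.g. with WavefrontSizeLog2 == 6 for wave64:
#include <cstdint>

static uint64_t alignWaveScaledSP(uint64_t BaseAddr, uint64_t PerLaneAlign,
                                  unsigned WavefrontSizeLog2) {
  uint64_t ScaledAlignment = PerLaneAlign << WavefrontSizeLog2;
  uint64_t StackAlignMask = ScaledAlignment - 1;
  // Round BaseAddr up to the next multiple of the scaled alignment; the DAG
  // form ANDs with -ScaledAlignment, which equals ~StackAlignMask.
  return (BaseAddr + StackAlignMask) & ~StackAlignMask;
}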
4071
4072 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4073                                                   SelectionDAG &DAG) const {
4074 // We only handle constant sizes here to allow non-entry block, static sized
4075 // allocas. A truly dynamic value is more difficult to support because we
4076 // don't know if the size value is uniform or not. If the size isn't uniform,
4077 // we would need to do a wave reduction to get the maximum size to know how
4078 // much to increment the uniform stack pointer.
4079 SDValue Size = Op.getOperand(1);
4080 if (isa<ConstantSDNode>(Size))
4081 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4082
4083   return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
4084 }
4085
4086 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4087   if (Op.getValueType() != MVT::i32)
4088 return Op; // Defer to cannot select error.
4089
4091 SDLoc SL(Op);
4092
4093 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4094
4095 // Convert from wave uniform to swizzled vector address. This should protect
4096 // from any edge cases where the stacksave result isn't directly used with
4097 // stackrestore.
4098 SDValue VectorAddress =
4099 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4100 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4101}
4102
4103 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4104                                             SelectionDAG &DAG) const {
4105 SDLoc SL(Op);
4106 assert(Op.getValueType() == MVT::i32);
4107
4108   uint32_t BothRoundHwReg =
4109       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4110 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4111
4112 SDValue IntrinID =
4113 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4114 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4115 Op.getOperand(0), IntrinID, GetRoundBothImm);
4116
4117 // There are two rounding modes, one for f32 and one for f64/f16. We only
4118 // report in the standard value range if both are the same.
4119 //
4120 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4121 // ties away from zero is not supported, and the other values are rotated by
4122 // 1.
4123 //
4124 // If the two rounding modes are not the same, report a target defined value.
4125
4126 // Mode register rounding mode fields:
4127 //
4128 // [1:0] Single-precision round mode.
4129 // [3:2] Double/Half-precision round mode.
4130 //
4131 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4132 //
4133   //                Hardware   Spec
4134   // Toward-0           3        0
4135   // Nearest Even       0        1
4136   // +Inf               1        2
4137   // -Inf               2        3
4138   // NearestAway0      N/A       4
4139 //
4140 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4141 // table we can index by the raw hardware mode.
4142 //
4143 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4144
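  //
  // Reader's trace (not generated code): if both MODE.fp_round fields are 0,
  // i.e. f32 and f64/f16 both round to nearest even, the raw 4-bit value is 0,
  // so the lookup selects nibble 0 of the table, which per the mapping above
  // holds the spec value 1 (FLT_ROUNDS "to nearest").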
4145   SDValue BitTable =
4146       DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4147 
4148 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4149 SDValue RoundModeTimesNumBits =
4150 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4151
4152 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4153 // knew only one mode was demanded.
4154 SDValue TableValue =
4155 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4156 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4157
4158 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4159 SDValue TableEntry =
4160 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4161
4162 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4163 // if it's an extended value.
4164 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4165 SDValue IsStandardValue =
4166 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4167 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4168 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4169 TableEntry, EnumOffset);
4170
4171 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4172}
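// Illustrative aside (not part of SIISelLowering.cpp): a scalar model of the
// nibble-table decode built above. The table argument stands in for
// AMDGPU::FltRoundConversionTable; only the indexing scheme is shown.
#include <cstdint>

static uint32_t decodeFltRounds(uint64_t ConversionTable, uint32_t HwRoundMode) {
  // Each 4-bit hardware MODE.fp_round value selects one nibble of the table.
  uint32_t Entry = (ConversionTable >> (HwRoundMode * 4)) & 0xf;
  // Entries below 4 are the standard FLT_ROUNDS values; larger entries are
  // target defined and are biased by 4 to skip the gap in the enum.
  return Entry < 4 ? Entry : Entry + 4;
}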
4173
4174 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4175                                             SelectionDAG &DAG) const {
4176 SDLoc SL(Op);
4177
4178 SDValue NewMode = Op.getOperand(1);
4179 assert(NewMode.getValueType() == MVT::i32);
4180
4181 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4182 // hardware MODE.fp_round values.
4183 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4184 uint32_t ClampedVal = std::min(
4185         static_cast<uint32_t>(ConstMode->getZExtValue()),
4186         static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4187 NewMode = DAG.getConstant(
4188 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4189 } else {
4190 // If we know the input can only be one of the supported standard modes in
4191 // the range 0-3, we can use a simplified mapping to hardware values.
4192 KnownBits KB = DAG.computeKnownBits(NewMode);
4193 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4194 // The supported standard values are 0-3. The extended values start at 8. We
4195 // need to offset by 4 if the value is in the extended range.
4196
4197 if (UseReducedTable) {
4198 // Truncate to the low 32-bits.
4199 SDValue BitTable = DAG.getConstant(
4200 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4201
4202 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4203 SDValue RoundModeTimesNumBits =
4204 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4205
4206 NewMode =
4207 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4208
4209 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4210 // the table extracted bits into inline immediates.
4211 } else {
4212 // table_index = umin(value, value - 4)
4213 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
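      //
      // Reader's note (not generated code): for the standard values 0-3 the
      // unsigned "value - 4" wraps around to a huge number, so the umin keeps
      // the value itself; for the extended values (8 and up) it picks
      // "value - 4", packing both ranges into contiguous table indices.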
4214       SDValue BitTable =
4215           DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4216 
4217 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4218 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4219 SDValue IndexVal =
4220 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4221
4222 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4223 SDValue RoundModeTimesNumBits =
4224 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4225
4226 SDValue TableValue =
4227 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4228 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4229
4230 // No need to mask out the high bits since the setreg will ignore them
4231 // anyway.
4232 NewMode = TruncTable;
4233 }
4234
4235 // Insert a readfirstlane in case the value is a VGPR. We could do this
4236 // earlier and keep more operations scalar, but that interferes with
4237 // combining the source.
4238 SDValue ReadFirstLaneID =
4239 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4240 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4241 ReadFirstLaneID, NewMode);
4242 }
4243
4244 // N.B. The setreg will be later folded into s_round_mode on supported
4245 // targets.
4246 SDValue IntrinID =
4247 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4248   uint32_t BothRoundHwReg =
4249       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4250 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4251
4252 SDValue SetReg =
4253 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4254 IntrinID, RoundBothImm, NewMode);
4255
4256 return SetReg;
4257}
4258
4259 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4260   if (Op->isDivergent())
4261 return SDValue();
4262
4263 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4264   case AMDGPUAS::FLAT_ADDRESS:
4265   case AMDGPUAS::GLOBAL_ADDRESS:
4266   case AMDGPUAS::CONSTANT_ADDRESS:
4267   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4268     break;
4269 default:
4270 return SDValue();
4271 }
4272
4273 return Op;
4274}
4275
4276// Work around DAG legality rules only based on the result type.
4277 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4278   bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4279 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4280 EVT SrcVT = Src.getValueType();
4281
4282 if (SrcVT.getScalarType() != MVT::bf16)
4283 return Op;
4284
4285 SDLoc SL(Op);
4286 SDValue BitCast =
4287 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4288
4289 EVT DstVT = Op.getValueType();
4290 if (IsStrict)
4291 llvm_unreachable("Need STRICT_BF16_TO_FP");
4292
4293 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4294}
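// Illustrative aside (not part of SIISelLowering.cpp): bf16 is the upper half
// of an IEEE binary32, so the ISD::BF16_TO_FP node produced above is
// conventionally expanded to "shift the 16 bits into the high half and
// bitcast". A scalar sketch:
#include <cstdint>
#include <cstring>

static float bf16BitsToFloat(uint16_t Bits) {
  uint32_t Wide = static_cast<uint32_t>(Bits) << 16; // bf16 -> f32 bit pattern
  float F;
  std::memcpy(&F, &Wide, sizeof(F)); // bit-exact reinterpretation
  return F;
}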
4295
4296 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4297   SDLoc SL(Op);
4298 if (Op.getValueType() != MVT::i64)
4299 return Op;
4300
4301   uint32_t ModeHwReg =
4302       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4303   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4304   uint32_t TrapHwReg =
4305       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4306 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4307
4308 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4309 SDValue IntrinID =
4310 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4311 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4312 Op.getOperand(0), IntrinID, ModeHwRegImm);
4313 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4314 Op.getOperand(0), IntrinID, TrapHwRegImm);
4315 SDValue TokenReg =
4316 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4317 GetTrapReg.getValue(1));
4318
4319 SDValue CvtPtr =
4320 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4321 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4322
4323 return DAG.getMergeValues({Result, TokenReg}, SL);
4324}
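// Illustrative aside (not part of SIISelLowering.cpp): the BUILD_VECTOR plus
// BITCAST above packs the two 32-bit getreg results into one i64 FP
// environment value. Assuming the little-endian element order used for the
// v2i32 -> i64 bitcast here, MODE lands in the low half and TRAPSTS in the
// high half:
#include <cstdint>

static uint64_t packFPEnv(uint32_t ModeReg, uint32_t TrapReg) {
  return static_cast<uint64_t>(ModeReg) |
         (static_cast<uint64_t>(TrapReg) << 32);
}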
4325
4326 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4327   SDLoc SL(Op);
4328 if (Op.getOperand(1).getValueType() != MVT::i64)
4329 return Op;