1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
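// Editor's sketch (not part of the upstream file): a minimal illustration of
// how this helper pairs with CCState. The wrapper name and the idea of
// immediately reserving the returned register are assumptions made purely for
// illustration.
#if 0
static unsigned reserveFirstFreeSGPR(CCState &CCInfo) {
  // Pick the first SGPR the calling convention has not handed out yet...
  unsigned Reg = findFirstFreeSGPR(CCInfo);
  // ...and mark it allocated so later arguments cannot be assigned to it.
  CCInfo.AllocateReg(Reg);
  return Reg;
}
#endif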
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101
102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104
105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107
108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110
111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113
114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116
117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119
120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122
123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125
126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128
129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131
132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134
135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137
138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140
141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143
144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146
147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149
150 if (Subtarget->has16BitInsts()) {
151 if (Subtarget->useRealTrue16Insts()) {
152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154 } else {
155 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
156 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
157 }
158
159    // Unless there are also VOP3P operations, none of these operations are really legal.
160 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
161 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
162 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
163 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
164 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
165 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
166 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
167 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
168 }
169
170 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
171 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
172
174
175 // The boolean content concept here is too inflexible. Compares only ever
176 // really produce a 1-bit result. Any copy/extend from these will turn into a
177 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
178 // it's what most targets use.
181
182 // We need to custom lower vector stores from local memory
184 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
185 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
186 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
187 MVT::i1, MVT::v32i32},
188 Custom);
189
191 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
192 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
193 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
194 MVT::i1, MVT::v32i32},
195 Custom);
196
197 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
198 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
199 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
200 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
201 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
202 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
203 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
204 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
205 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
206 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
207 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
208 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
209 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
210 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
211 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
212 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
213
214 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
215 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
216 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
217 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
218 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
219 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
220 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
221
222 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
223
227 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
228
229 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
230
232 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
233
235 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
236 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
237
239 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
240 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
241 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
242 Expand);
244 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
245 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
246 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
247 Expand);
248
250 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
251 MVT::v3i16, MVT::v4i16, MVT::Other},
252 Custom);
253
256 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
257
259
261
263 Expand);
264
265#if 0
267#endif
268
269 // We only support LOAD/STORE and vector manipulation ops for vectors
270 // with > 4 elements.
271 for (MVT VT :
272 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
273 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
274 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
275 MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
276 MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
277 MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
278 MVT::v32i32, MVT::v32f32}) {
279 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
280 switch (Op) {
281 case ISD::LOAD:
282 case ISD::STORE:
284 case ISD::BITCAST:
285 case ISD::UNDEF:
289 case ISD::IS_FPCLASS:
290 break;
295 break;
296 default:
298 break;
299 }
300 }
301 }
302
304
305 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
306 // is expanded to avoid having two separate loops in case the index is a VGPR.
307
308 // Most operations are naturally 32-bit vector operations. We only support
309 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
310 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
312 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
313
315 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
316
318 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
319
321 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
322 }
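  // Editor's note (illustration, not upstream text): the Promote action plus
  // AddPromotedToType means these 64-bit element nodes are legalized in terms
  // of the equivalent v4i32 operation, with bitcasts between v2i64 and v4i32
  // around it, so only 32-bit vector operations reach instruction selection.
  // The loops below apply the same scheme to the wider 64-bit element vectors.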
323
324 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
326 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
327
329 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
330
332 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
333
335 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
336 }
337
338 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
340 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
341
343 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
344
346 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
347
349 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
350 }
351
352 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
354 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
355
357 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
358
360 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
361
363 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
364 }
365
366 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
378 }
379
381 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
382 Expand);
383
384 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
385
386 // Avoid stack access for these.
387 // TODO: Generalize to more vector types.
389 {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
390 MVT::v4i16, MVT::v4f16},
391 Custom);
392
393 // Deal with vec3 vector operations when widened to vec4.
395 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
396
397 // Deal with vec5/6/7 vector operations when widened to vec8.
399 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
400 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
401 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
402 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
403 Custom);
404
405 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
406 // and output demarshalling
407 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
408
409 // We can't return success/failure, only the old value,
410 // let LLVM add the comparison
412 Expand);
413
414 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
415
416 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
417
418 // FIXME: This should be narrowed to i32, but that only happens if i64 is
419 // illegal.
420 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
421 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
422
423  // On SI this is s_memtime; on VI it is s_memrealtime.
426
427 if (Subtarget->has16BitInsts()) {
430 } else {
432 }
433
434 if (Subtarget->hasMadMacF32Insts())
436
437 if (!Subtarget->hasBFI())
438 // fcopysign can be done in a single instruction with BFI.
439 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
440
441 if (!Subtarget->hasBCNT(32))
443
444 if (!Subtarget->hasBCNT(64))
446
447 if (Subtarget->hasFFBH())
449
450 if (Subtarget->hasFFBL())
452
453 // We only really have 32-bit BFE instructions (and 16-bit on VI).
454 //
455 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
456 // effort to match them now. We want this to be false for i64 cases when the
457 // extraction isn't restricted to the upper or lower half. Ideally we would
458 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
459 // span the midpoint are probably relatively rare, so don't worry about them
460 // for now.
461 if (Subtarget->hasBFE())
463
464 // Clamp modifier on add/sub
465 if (Subtarget->hasIntClamp())
467
468 if (Subtarget->hasAddNoCarry())
469 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
470 Legal);
471
472 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
473 Custom);
474
475 // These are really only legal for ieee_mode functions. We should be avoiding
476 // them for functions that don't have ieee_mode enabled, so just say they are
477 // legal.
479 {MVT::f32, MVT::f64}, Legal);
480
481 if (Subtarget->haveRoundOpsF64())
483 else
485 MVT::f64, Custom);
486
488 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
489 Legal);
490 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
491
494
495 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
496 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
497
498 if (Subtarget->has16BitInsts()) {
501 MVT::i16, Legal);
502
503 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
504
506 MVT::i16, Expand);
507
511 ISD::CTPOP},
512 MVT::i16, Promote);
513
515
516 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
517
519 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
521 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
522
524
525 // F16 - Constant Actions.
527
528 // F16 - Load/Store Actions.
530 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
532 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
533
534 // F16 - VOP1 Actions.
537 MVT::f16, Custom);
538
540
543 MVT::f16, Promote);
544
545 // F16 - VOP2 Actions.
550
551 // F16 - VOP3 Actions.
553 if (STI.hasMadF16())
555
556 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
557 MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
558 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
559 switch (Op) {
560 case ISD::LOAD:
561 case ISD::STORE:
563 case ISD::BITCAST:
564 case ISD::UNDEF:
570 case ISD::IS_FPCLASS:
571 break;
574 break;
575 default:
577 break;
578 }
579 }
580 }
581
582 // v_perm_b32 can handle either of these.
583 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
585
586 // XXX - Do these do anything? Vector constants turn into build_vector.
587 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
588
589 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
590
592 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
594 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
595
597 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
599 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
600
601 setOperationAction(ISD::AND, MVT::v2i16, Promote);
602 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
603 setOperationAction(ISD::OR, MVT::v2i16, Promote);
604 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
605 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
606 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
607
609 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
611 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
612
614 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
616 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
617
619 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
621 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
622
624 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
626 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
627
629 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
631 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
632
633 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
634 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
635 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
636 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
637
639 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
641 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
642
644 MVT::v2i32, Expand);
646
648 MVT::v4i32, Expand);
649
651 MVT::v8i32, Expand);
652
653 if (!Subtarget->hasVOP3PInsts())
654 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
655
656 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
657 // This isn't really legal, but this avoids the legalizer unrolling it (and
658 // allows matching fneg (fabs x) patterns)
659 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
660
663
665 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
666
668 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
669
670 for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
673 Vec16, Custom);
675 }
676 }
677
678 if (Subtarget->hasVOP3PInsts()) {
682 MVT::v2i16, Legal);
683
686 MVT::v2f16, Legal);
687
688 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
689 Custom);
690
692 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
693 MVT::v16f16, MVT::v16i16},
694 Custom);
695
696 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
697 // Split vector operations.
702 VT, Custom);
703
704 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
705 // Split vector operations.
707 VT, Custom);
708
709 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
710 Custom);
711
712 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
713 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
714
715 if (Subtarget->hasPackedFP32Ops()) {
717 MVT::v2f32, Legal);
719 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
720 Custom);
721 }
722 }
723
725
726 if (Subtarget->has16BitInsts()) {
728 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
730 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
731 } else {
732 // Legalization hack.
733 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
734
736 }
737
739 {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
740 MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
741 Custom);
742
744
745 if (Subtarget->hasMad64_32())
747
749 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
750 MVT::v2i16, MVT::v2f16, MVT::i128},
751 Custom);
752
754 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
755 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
756 MVT::i16, MVT::i8, MVT::i128},
757 Custom);
758
760 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
761 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
762 MVT::i8, MVT::i128},
763 Custom);
764
767
768 // TODO: Could move this to custom lowering, could benefit from combines on
769 // extract of relevant bits.
771
774 ISD::SUB,
776 ISD::FADD,
777 ISD::FSUB,
778 ISD::FDIV,
783 ISD::FMA,
784 ISD::SMIN,
785 ISD::SMAX,
786 ISD::UMIN,
787 ISD::UMAX,
789 ISD::AND,
790 ISD::OR,
791 ISD::XOR,
792 ISD::FSHR,
802
803 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
805
806  // All memory operations. Some folding on the pointer operand is done to help
807  // match the constant offsets in the addressing modes.
830
831 // FIXME: In other contexts we pretend this is a per-function property.
833
835}
836
838 return Subtarget;
839}
840
841//===----------------------------------------------------------------------===//
842// TargetLowering queries
843//===----------------------------------------------------------------------===//
844
845// v_mad_mix* support a conversion from f16 to f32.
846//
847// There is only one special case, when denormals are enabled, where this is
848// still OK to use, but we don't currently handle it.
849bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
850 EVT DestVT, EVT SrcVT) const {
851 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
852 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
853 DestVT.getScalarType() == MVT::f32 &&
854 SrcVT.getScalarType() == MVT::f16 &&
855 // TODO: This probably only requires no input flushing?
857}
858
860 LLT DestTy, LLT SrcTy) const {
861 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
862 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
863 DestTy.getScalarSizeInBits() == 32 &&
864 SrcTy.getScalarSizeInBits() == 16 &&
865 // TODO: This probably only requires no input flushing?
867}
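// Editor's note (illustration, not upstream text): both overloads above answer
// the same question for the SelectionDAG and GlobalISel paths. Assuming
// mad-mix/fma-mix support, IR such as
//   %x = fpext half %a to float
//   %r = call float @llvm.fma.f32(float %x, float %y, float %z)
// can keep %a as a mixed-precision f16 source of v_fma_mix_f32 instead of
// first emitting a separate v_cvt_f32_f16, which is the fold this hook allows.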
868
870 // SI has some legal vector types, but no legal vector operations. Say no
871 // shuffles are legal in order to prefer scalarizing some vector operations.
872 return false;
873}
874
877 EVT VT) const {
880
881 if (VT.isVector()) {
882 EVT ScalarVT = VT.getScalarType();
883 unsigned Size = ScalarVT.getSizeInBits();
884 if (Size == 16) {
885 if (Subtarget->has16BitInsts()) {
886 if (VT.isInteger())
887 return MVT::v2i16;
888 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
889 }
890 return VT.isInteger() ? MVT::i32 : MVT::f32;
891 }
892
893 if (Size < 16)
894 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
895 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
896 }
897
898 if (VT.getSizeInBits() > 32)
899 return MVT::i32;
900
902}
903
906 EVT VT) const {
909
910 if (VT.isVector()) {
911 unsigned NumElts = VT.getVectorNumElements();
912 EVT ScalarVT = VT.getScalarType();
913 unsigned Size = ScalarVT.getSizeInBits();
914
915 // FIXME: Should probably promote 8-bit vectors to i16.
916 if (Size == 16 && Subtarget->has16BitInsts())
917 return (NumElts + 1) / 2;
918
919 if (Size <= 32)
920 return NumElts;
921
922 if (Size > 32)
923 return NumElts * ((Size + 31) / 32);
924 } else if (VT.getSizeInBits() > 32)
925 return (VT.getSizeInBits() + 31) / 32;
926
928}
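// Editor's worked example (not upstream text): with 16-bit instructions
// available, a v5f16 argument reports (5 + 1) / 2 = 3 registers of the v2f16
// register type chosen above; without 16-bit instructions each element is
// instead carried in its own 32-bit register.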
929
932 EVT VT, EVT &IntermediateVT,
933 unsigned &NumIntermediates, MVT &RegisterVT) const {
934 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
935 unsigned NumElts = VT.getVectorNumElements();
936 EVT ScalarVT = VT.getScalarType();
937 unsigned Size = ScalarVT.getSizeInBits();
938 // FIXME: We should fix the ABI to be the same on targets without 16-bit
939    // support, but unless we can properly handle 3-vectors, it will still be
940    // inconsistent.
941 if (Size == 16 && Subtarget->has16BitInsts()) {
942 if (ScalarVT == MVT::bf16) {
943 RegisterVT = MVT::i32;
944 IntermediateVT = MVT::v2bf16;
945 } else {
946 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
947 IntermediateVT = RegisterVT;
948 }
949 NumIntermediates = (NumElts + 1) / 2;
950 return NumIntermediates;
951 }
952
953 if (Size == 32) {
954 RegisterVT = ScalarVT.getSimpleVT();
955 IntermediateVT = RegisterVT;
956 NumIntermediates = NumElts;
957 return NumIntermediates;
958 }
959
960 if (Size < 16 && Subtarget->has16BitInsts()) {
961 // FIXME: Should probably form v2i16 pieces
962 RegisterVT = MVT::i16;
963 IntermediateVT = ScalarVT;
964 NumIntermediates = NumElts;
965 return NumIntermediates;
966 }
967
968
969 if (Size != 16 && Size <= 32) {
970 RegisterVT = MVT::i32;
971 IntermediateVT = ScalarVT;
972 NumIntermediates = NumElts;
973 return NumIntermediates;
974 }
975
976 if (Size > 32) {
977 RegisterVT = MVT::i32;
978 IntermediateVT = RegisterVT;
979 NumIntermediates = NumElts * ((Size + 31) / 32);
980 return NumIntermediates;
981 }
982 }
983
985 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
986}
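// Editor's worked example (not upstream text): for a non-kernel calling
// convention with 16-bit instructions, v3i16 breaks down into
// NumIntermediates = (3 + 1) / 2 = 2 pieces of v2i16, while v3i64 takes the
// Size > 32 path and becomes 3 * (64 / 32) = 6 i32 registers.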
987
988static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
989 assert(MaxNumLanes != 0);
990
991 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
992 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
993 return EVT::getVectorVT(Ty->getContext(),
994 EVT::getEVT(VT->getElementType()),
995 NumElts);
996 }
997
998 return EVT::getEVT(Ty);
999}
1000
1001// Peek through TFE struct returns to only use the data size.
1002static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1003 auto *ST = dyn_cast<StructType>(Ty);
1004 if (!ST)
1005 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1006
1007 // TFE intrinsics return an aggregate type.
1008 assert(ST->getNumContainedTypes() == 2 &&
1009 ST->getContainedType(1)->isIntegerTy(32));
1010 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1011}
1012
1013/// Map address space 7 to MVT::v5i32 because that's its in-memory
1014/// representation. This return value is vector-typed because there is no
1015/// MVT::i160 and it is not clear if one can be added. While this could
1016/// cause issues during codegen, these address space 7 pointers will be
1017/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1018/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1019/// modeling, to work.
1021 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1022 return MVT::v5i32;
1024}
1025/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1026/// v8i32 when padding is added.
1028 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1029 return MVT::v8i32;
1031}
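// Editor's note (illustration, not upstream text): a buffer fat pointer (p7)
// value is therefore 160 bits, modeled as v5i32 in registers, while its
// in-memory form is the padded {p8, i32} pair, i.e. 256 bits modeled as
// v8i32, which is why the two hooks above return different types for the same
// address space.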
1032
1034 const CallInst &CI,
1035 MachineFunction &MF,
1036 unsigned IntrID) const {
1038 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1040
1041 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1044 (Intrinsic::ID)IntrID);
1045 MemoryEffects ME = Attr.getMemoryEffects();
1046 if (ME.doesNotAccessMemory())
1047 return false;
1048
1049 // TODO: Should images get their own address space?
1050 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1051
1052 if (RsrcIntr->IsImage)
1053 Info.align.reset();
1054
1055 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1056 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1057 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1058 // We conservatively set the memory operand of a buffer intrinsic to the
1059 // base resource pointer, so that we can access alias information about
1060 // those pointers. Cases like "this points at the same value
1061 // but with a different offset" are handled in
1062 // areMemAccessesTriviallyDisjoint.
1063 Info.ptrVal = RsrcArg;
1064 }
1065
1067 if (ME.onlyReadsMemory()) {
1068 unsigned MaxNumLanes = 4;
1069
1070 if (RsrcIntr->IsImage) {
1073 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1075
1076 if (!BaseOpcode->Gather4) {
1077 // If this isn't a gather, we may have excess loaded elements in the
1078 // IR type. Check the dmask for the real number of elements loaded.
1079 unsigned DMask
1080 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1081 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1082 }
1083 }
1084
1085 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1086
1087 // FIXME: What does alignment mean for an image?
1090 } else if (ME.onlyWritesMemory()) {
1092
1093 Type *DataTy = CI.getArgOperand(0)->getType();
1094 if (RsrcIntr->IsImage) {
1095 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1096 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1097 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1098 } else
1099 Info.memVT = EVT::getEVT(DataTy);
1100
1102 } else {
1103 // Atomic
1104 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1106 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1110
1111 // XXX - Should this be volatile without known ordering?
1113
1114 switch (IntrID) {
1115 default:
1116 break;
1117 case Intrinsic::amdgcn_raw_buffer_load_lds:
1118 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1119 case Intrinsic::amdgcn_struct_buffer_load_lds:
1120 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1121 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1122 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1123 return true;
1124 }
1125 }
1126 }
1127 return true;
1128 }
1129
1130 switch (IntrID) {
1131 case Intrinsic::amdgcn_ds_ordered_add:
1132 case Intrinsic::amdgcn_ds_ordered_swap:
1133 case Intrinsic::amdgcn_ds_fadd:
1134 case Intrinsic::amdgcn_ds_fmin:
1135 case Intrinsic::amdgcn_ds_fmax: {
1137 Info.memVT = MVT::getVT(CI.getType());
1138 Info.ptrVal = CI.getOperand(0);
1139 Info.align.reset();
1141
1142 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1143 if (!Vol->isZero())
1145
1146 return true;
1147 }
1148 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1150 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1151 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1152 Info.align.reset();
1154
1155 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1156 if (!Vol || !Vol->isZero())
1158
1159 return true;
1160 }
1161 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1162 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1164 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1165 Info.ptrVal = nullptr;
1166 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1168 return true;
1169 }
1170 case Intrinsic::amdgcn_ds_append:
1171 case Intrinsic::amdgcn_ds_consume: {
1173 Info.memVT = MVT::getVT(CI.getType());
1174 Info.ptrVal = CI.getOperand(0);
1175 Info.align.reset();
1177
1178 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1179 if (!Vol->isZero())
1181
1182 return true;
1183 }
1184 case Intrinsic::amdgcn_global_atomic_csub: {
1186 Info.memVT = MVT::getVT(CI.getType());
1187 Info.ptrVal = CI.getOperand(0);
1188 Info.align.reset();
1192 return true;
1193 }
1194 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1196 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1197
1198 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1199 Info.align.reset();
1202 return true;
1203 }
1204 case Intrinsic::amdgcn_global_atomic_fadd:
1205 case Intrinsic::amdgcn_global_atomic_fmin:
1206 case Intrinsic::amdgcn_global_atomic_fmax:
1207 case Intrinsic::amdgcn_flat_atomic_fadd:
1208 case Intrinsic::amdgcn_flat_atomic_fmin:
1209 case Intrinsic::amdgcn_flat_atomic_fmax:
1210 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1211 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1213 Info.memVT = MVT::getVT(CI.getType());
1214 Info.ptrVal = CI.getOperand(0);
1215 Info.align.reset();
1220 return true;
1221 }
1222 case Intrinsic::amdgcn_ds_gws_init:
1223 case Intrinsic::amdgcn_ds_gws_barrier:
1224 case Intrinsic::amdgcn_ds_gws_sema_v:
1225 case Intrinsic::amdgcn_ds_gws_sema_br:
1226 case Intrinsic::amdgcn_ds_gws_sema_p:
1227 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1229
1230 const GCNTargetMachine &TM =
1231 static_cast<const GCNTargetMachine &>(getTargetMachine());
1232
1234 Info.ptrVal = MFI->getGWSPSV(TM);
1235
1236 // This is an abstract access, but we need to specify a type and size.
1237 Info.memVT = MVT::i32;
1238 Info.size = 4;
1239 Info.align = Align(4);
1240
1241 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1243 else
1245 return true;
1246 }
1247 case Intrinsic::amdgcn_global_load_lds: {
1249 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1250 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1253 return true;
1254 }
1255 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1257
1258 const GCNTargetMachine &TM =
1259 static_cast<const GCNTargetMachine &>(getTargetMachine());
1260
1262 Info.ptrVal = MFI->getGWSPSV(TM);
1263
1264 // This is an abstract access, but we need to specify a type and size.
1265 Info.memVT = MVT::i32;
1266 Info.size = 4;
1267 Info.align = Align(4);
1268
1270 return true;
1271 }
1272 default:
1273 return false;
1274 }
1275}
1276
1279 Type *&AccessTy) const {
1280 switch (II->getIntrinsicID()) {
1281 case Intrinsic::amdgcn_ds_ordered_add:
1282 case Intrinsic::amdgcn_ds_ordered_swap:
1283 case Intrinsic::amdgcn_ds_append:
1284 case Intrinsic::amdgcn_ds_consume:
1285 case Intrinsic::amdgcn_ds_fadd:
1286 case Intrinsic::amdgcn_ds_fmin:
1287 case Intrinsic::amdgcn_ds_fmax:
1288 case Intrinsic::amdgcn_global_atomic_fadd:
1289 case Intrinsic::amdgcn_flat_atomic_fadd:
1290 case Intrinsic::amdgcn_flat_atomic_fmin:
1291 case Intrinsic::amdgcn_flat_atomic_fmax:
1292 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1293 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1294 case Intrinsic::amdgcn_global_atomic_csub: {
1295 Value *Ptr = II->getArgOperand(0);
1296 AccessTy = II->getType();
1297 Ops.push_back(Ptr);
1298 return true;
1299 }
1300 default:
1301 return false;
1302 }
1303}
1304
1305bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1306 if (!Subtarget->hasFlatInstOffsets()) {
1307 // Flat instructions do not have offsets, and only have the register
1308 // address.
1309 return AM.BaseOffs == 0 && AM.Scale == 0;
1310 }
1311
1312 return AM.Scale == 0 &&
1313 (AM.BaseOffs == 0 ||
1314 Subtarget->getInstrInfo()->isLegalFLATOffset(
1316}
1317
1319 if (Subtarget->hasFlatGlobalInsts())
1320 return AM.Scale == 0 &&
1321 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1324
1325 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1326    // Assume that we will use FLAT for all global memory accesses
1327 // on VI.
1328 // FIXME: This assumption is currently wrong. On VI we still use
1329 // MUBUF instructions for the r + i addressing mode. As currently
1330 // implemented, the MUBUF instructions only work on buffer < 4GB.
1331 // It may be possible to support > 4GB buffers with MUBUF instructions,
1332 // by setting the stride value in the resource descriptor which would
1333 // increase the size limit to (stride * 4GB). However, this is risky,
1334 // because it has never been validated.
1335 return isLegalFlatAddressingMode(AM);
1336 }
1337
1338 return isLegalMUBUFAddressingMode(AM);
1339}
1340
1341bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1342 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1343 // additionally can do r + r + i with addr64. 32-bit has more addressing
1344 // mode options. Depending on the resource constant, it can also do
1345 // (i64 r0) + (i32 r1) * (i14 i).
1346 //
1347 // Private arrays end up using a scratch buffer most of the time, so also
1348 // assume those use MUBUF instructions. Scratch loads / stores are currently
1349 // implemented as mubuf instructions with offen bit set, so slightly
1350 // different than the normal addr64.
1351 if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1352 return false;
1353
1354 // FIXME: Since we can split immediate into soffset and immediate offset,
1355 // would it make sense to allow any immediate?
1356
1357 switch (AM.Scale) {
1358 case 0: // r + i or just i, depending on HasBaseReg.
1359 return true;
1360 case 1:
1361 return true; // We have r + r or r + i.
1362 case 2:
1363 if (AM.HasBaseReg) {
1364 // Reject 2 * r + r.
1365 return false;
1366 }
1367
1368 // Allow 2 * r as r + r
1369 // Or 2 * r + i is allowed as r + r + i.
1370 return true;
1371 default: // Don't allow n * r
1372 return false;
1373 }
1374}
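// Editor's note (illustration, not upstream text): under these rules an
// address of the form "base + 16" or "base + index + 16" is accepted as long
// as the immediate passes isLegalMUBUFImmOffset, and "2 * index + 16" is
// accepted because it can be re-associated as index + index + 16, but
// "2 * index + base" is rejected since it would need three register operands.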
1375
1377 const AddrMode &AM, Type *Ty,
1378 unsigned AS, Instruction *I) const {
1379 // No global is ever allowed as a base.
1380 if (AM.BaseGV)
1381 return false;
1382
1383 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1384 return isLegalGlobalAddressingMode(AM);
1385
1386 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1389 // If the offset isn't a multiple of 4, it probably isn't going to be
1390 // correctly aligned.
1391 // FIXME: Can we get the real alignment here?
1392 if (AM.BaseOffs % 4 != 0)
1393 return isLegalMUBUFAddressingMode(AM);
1394
1395 // There are no SMRD extloads, so if we have to do a small type access we
1396 // will use a MUBUF load.
1397 // FIXME?: We also need to do this if unaligned, but we don't know the
1398 // alignment here.
1399 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1400 return isLegalGlobalAddressingMode(AM);
1401
1403 // SMRD instructions have an 8-bit, dword offset on SI.
1404 if (!isUInt<8>(AM.BaseOffs / 4))
1405 return false;
1406 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1407 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1408 // in 8-bits, it can use a smaller encoding.
1409 if (!isUInt<32>(AM.BaseOffs / 4))
1410 return false;
1411 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1412 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1413 if (!isUInt<20>(AM.BaseOffs))
1414 return false;
1415 } else {
1416 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1417 // for S_BUFFER_* instructions).
1418 if (!isInt<21>(AM.BaseOffs))
1419 return false;
1420 }
1421
1422 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1423 return true;
1424
1425 if (AM.Scale == 1 && AM.HasBaseReg)
1426 return true;
1427
1428 return false;
1429 }
1430
1431 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1432 return isLegalMUBUFAddressingMode(AM);
1433
1434 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1435 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1436 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1437 // field.
1438 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1439 // an 8-bit dword offset but we don't know the alignment here.
1440 if (!isUInt<16>(AM.BaseOffs))
1441 return false;
1442
1443 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1444 return true;
1445
1446 if (AM.Scale == 1 && AM.HasBaseReg)
1447 return true;
1448
1449 return false;
1450 }
1451
1453 // For an unknown address space, this usually means that this is for some
1454 // reason being used for pure arithmetic, and not based on some addressing
1455 // computation. We don't have instructions that compute pointers with any
1456 // addressing modes, so treat them as having no offset like flat
1457 // instructions.
1458 return isLegalFlatAddressingMode(AM);
1459 }
1460
1461 // Assume a user alias of global for unknown address spaces.
1462 return isLegalGlobalAddressingMode(AM);
1463}
1464
1466 const MachineFunction &MF) const {
1468 return (MemVT.getSizeInBits() <= 4 * 32);
1469 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1470 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1471 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1472 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1473 return (MemVT.getSizeInBits() <= 2 * 32);
1474 }
1475 return true;
1476}
1477
1479 unsigned Size, unsigned AddrSpace, Align Alignment,
1480 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1481 if (IsFast)
1482 *IsFast = 0;
1483
1484 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1485 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1486 // Check if alignment requirements for ds_read/write instructions are
1487 // disabled.
1488 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1489 return false;
1490
1491 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1492 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1493 Alignment < RequiredAlignment)
1494 return false;
1495
1496    // Either the alignment requirements are "enabled", or there is an
1497    // unaligned LDS access related hardware bug even though alignment requirements
1498 // are "disabled". In either case, we need to check for proper alignment
1499 // requirements.
1500 //
1501 switch (Size) {
1502 case 64:
1503 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1504 // address is negative, then the instruction is incorrectly treated as
1505 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1506 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1507 // load later in the SILoadStoreOptimizer.
1508 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1509 return false;
1510
1511      // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1512 // can do a 4 byte aligned, 8 byte access in a single operation using
1513 // ds_read2/write2_b32 with adjacent offsets.
1514 RequiredAlignment = Align(4);
1515
1516 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1517 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1518 // ds_write2_b32 depending on the alignment. In either case with either
1519 // alignment there is no faster way of doing this.
1520
1521 // The numbers returned here and below are not additive, it is a 'speed
1522 // rank'. They are just meant to be compared to decide if a certain way
1523 // of lowering an operation is faster than another. For that purpose
1524        // a naturally aligned operation gets its bitsize to indicate that "it
1525 // operates with a speed comparable to N-bit wide load". With the full
1526 // alignment ds128 is slower than ds96 for example. If underaligned it
1527 // is comparable to a speed of a single dword access, which would then
1528 // mean 32 < 128 and it is faster to issue a wide load regardless.
1529        // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1530        // wider load which will not be aligned anymore, the latter is slower.
1531 if (IsFast)
1532 *IsFast = (Alignment >= RequiredAlignment) ? 64
1533 : (Alignment < Align(4)) ? 32
1534 : 1;
1535 return true;
1536 }
1537
1538 break;
1539 case 96:
1540 if (!Subtarget->hasDS96AndDS128())
1541 return false;
1542
1543      // 12 byte accesses via ds_read/write_b96 require 16-byte alignment on
1544 // gfx8 and older.
1545
1546 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1547 // Naturally aligned access is fastest. However, also report it is Fast
1548 // if memory is aligned less than DWORD. A narrow load or store will be
1549        // equally slow as a single ds_read_b96/ds_write_b96, but there will
1550 // be more of them, so overall we will pay less penalty issuing a single
1551 // instruction.
1552
1553 // See comment on the values above.
1554 if (IsFast)
1555 *IsFast = (Alignment >= RequiredAlignment) ? 96
1556 : (Alignment < Align(4)) ? 32
1557 : 1;
1558 return true;
1559 }
1560
1561 break;
1562 case 128:
1563 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1564 return false;
1565
1566      // 16 byte accesses via ds_read/write_b128 require 16-byte alignment on
1567      // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1568 // single operation using ds_read2/write2_b64.
1569 RequiredAlignment = Align(8);
1570
1571 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1572 // Naturally aligned access is fastest. However, also report it is Fast
1573 // if memory is aligned less than DWORD. A narrow load or store will be
1574        // equally slow as a single ds_read_b128/ds_write_b128, but there
1575 // will be more of them, so overall we will pay less penalty issuing a
1576 // single instruction.
1577
1578 // See comment on the values above.
1579 if (IsFast)
1580 *IsFast = (Alignment >= RequiredAlignment) ? 128
1581 : (Alignment < Align(4)) ? 32
1582 : 1;
1583 return true;
1584 }
1585
1586 break;
1587 default:
1588 if (Size > 32)
1589 return false;
1590
1591 break;
1592 }
1593
1594 // See comment on the values above.
1595 // Note that we have a single-dword or sub-dword here, so if underaligned
1596    // it is the slowest possible access, hence the returned value is 0.
1597 if (IsFast)
1598 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1599
1600 return Alignment >= RequiredAlignment ||
1601 Subtarget->hasUnalignedDSAccessEnabled();
1602 }
1603
1604 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1605 bool AlignedBy4 = Alignment >= Align(4);
1606 if (IsFast)
1607 *IsFast = AlignedBy4;
1608
1609 return AlignedBy4 ||
1610 Subtarget->enableFlatScratch() ||
1611 Subtarget->hasUnalignedScratchAccess();
1612 }
1613
1614 // FIXME: We have to be conservative here and assume that flat operations
1615 // will access scratch. If we had access to the IR function, then we
1616 // could determine if any private memory was used in the function.
1617 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1618 !Subtarget->hasUnalignedScratchAccess()) {
1619 bool AlignedBy4 = Alignment >= Align(4);
1620 if (IsFast)
1621 *IsFast = AlignedBy4;
1622
1623 return AlignedBy4;
1624 }
1625
1626 // So long as they are correct, wide global memory operations perform better
1627 // than multiple smaller memory ops -- even when misaligned
1628 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1629 if (IsFast)
1630 *IsFast = Size;
1631
1632 return Alignment >= Align(4) ||
1634 }
1635
1636  // Values smaller than a dword must be aligned.
1637 if (Size < 32)
1638 return false;
1639
1640 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1641 // byte-address are ignored, thus forcing Dword alignment.
1642 // This applies to private, global, and constant memory.
1643 if (IsFast)
1644 *IsFast = 1;
1645
1646 return Size >= 32 && Alignment >= Align(4);
1647}
1648
1650 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1651 unsigned *IsFast) const {
1653 Alignment, Flags, IsFast);
1654}
1655
1657 const MemOp &Op, const AttributeList &FuncAttributes) const {
1658 // FIXME: Should account for address space here.
1659
1660 // The default fallback uses the private pointer size as a guess for a type to
1661 // use. Make sure we switch these to 64-bit accesses.
1662
1663 if (Op.size() >= 16 &&
1664 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1665 return MVT::v4i32;
1666
1667 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1668 return MVT::v2i32;
1669
1670 // Use the default.
1671 return MVT::Other;
1672}
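// Editor's note (illustration, not upstream text): a memcpy/memset expansion
// of at least 16 bytes whose destination is known to be 4-byte aligned is
// steered to v4i32 (dwordx4) chunks, an 8..15 byte one to v2i32, and anything
// else falls back to the target-independent default type choice.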
1673
1675 const MemSDNode *MemNode = cast<MemSDNode>(N);
1676 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1677}
1678
1680 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1682}
1683
1685 unsigned DestAS) const {
1686 // Flat -> private/local is a simple truncate.
1687  // Flat -> global is a no-op.
1688 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1689 return true;
1690
1691 const GCNTargetMachine &TM =
1692 static_cast<const GCNTargetMachine &>(getTargetMachine());
1693 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1694}
1695
1697 const MemSDNode *MemNode = cast<MemSDNode>(N);
1698
1700}
1701
1704 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1705 VT.getScalarType().bitsLE(MVT::i16))
1708}
1709
1711 Type *Ty) const {
1712 // FIXME: Could be smarter if called for vector constants.
1713 return true;
1714}
1715
1717 unsigned Index) const {
1719 return false;
1720
1721 // TODO: Add more cases that are cheap.
1722 return Index == 0;
1723}
1724
1726 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1727 switch (Op) {
1728 case ISD::LOAD:
1729 case ISD::STORE:
1730
1731 // These operations are done with 32-bit instructions anyway.
1732 case ISD::AND:
1733 case ISD::OR:
1734 case ISD::XOR:
1735 case ISD::SELECT:
1736 // TODO: Extensions?
1737 return true;
1738 default:
1739 return false;
1740 }
1741 }
1742
1743 // SimplifySetCC uses this function to determine whether or not it should
1744 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1745 if (VT == MVT::i1 && Op == ISD::SETCC)
1746 return false;
1747
1749}
1750
1751SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1752 const SDLoc &SL,
1753 SDValue Chain,
1754 uint64_t Offset) const {
1755 const DataLayout &DL = DAG.getDataLayout();
1758
1759 const ArgDescriptor *InputPtrReg;
1760 const TargetRegisterClass *RC;
1761 LLT ArgTy;
1763
1764 std::tie(InputPtrReg, RC, ArgTy) =
1766
1767 // We may not have the kernarg segment argument if we have no kernel
1768 // arguments.
1769 if (!InputPtrReg)
1770 return DAG.getConstant(0, SL, PtrVT);
1771
1773 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1774 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1775
1776 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1777}
1778
1779SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1780 const SDLoc &SL) const {
1783 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1784}
1785
1786SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1787 const SDLoc &SL) const {
1788
1790 std::optional<uint32_t> KnownSize =
1792 if (KnownSize.has_value())
1793 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1794 return SDValue();
1795}
1796
1797SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1798 const SDLoc &SL, SDValue Val,
1799 bool Signed,
1800 const ISD::InputArg *Arg) const {
1801 // First, if it is a widened vector, narrow it.
1802 if (VT.isVector() &&
1804 EVT NarrowedVT =
1807 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1808 DAG.getConstant(0, SL, MVT::i32));
1809 }
1810
1811 // Then convert the vector elements or scalar value.
1812 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1813 VT.bitsLT(MemVT)) {
1814 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1815 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1816 }
1817
1818 if (MemVT.isFloatingPoint())
1819 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1820 else if (Signed)
1821 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1822 else
1823 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1824
1825 return Val;
1826}
1827
1828SDValue SITargetLowering::lowerKernargMemParameter(
1829 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1830 uint64_t Offset, Align Alignment, bool Signed,
1831 const ISD::InputArg *Arg) const {
1833
1834 // Try to avoid using an extload by loading earlier than the argument address,
1835 // and extracting the relevant bits. The load should hopefully be merged with
1836 // the previous argument.
1837 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1838 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1839 int64_t AlignDownOffset = alignDown(Offset, 4);
1840 int64_t OffsetDiff = Offset - AlignDownOffset;
1841
1842 EVT IntVT = MemVT.changeTypeToInteger();
1843
1844 // TODO: If we passed in the base kernel offset we could have a better
1845 // alignment than 4, but we don't really need it.
1846 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1847 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1850
1851 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1852 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1853
1854 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1855 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1856 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1857
1858
1859 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1860 }
1861
1862 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1863 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1866
1867 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1868 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1869}
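// Editor's worked example (not upstream text): an i16 kernel argument at byte
// offset 2 with 2-byte alignment takes the first path above: the containing
// dword at offset 0 is loaded with 4-byte alignment, shifted right by
// OffsetDiff * 8 = 16 bits, truncated to i16 and then extended or converted
// according to the argument flags, so no under-aligned extload is emitted.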
1870
1871SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1872 const SDLoc &SL, SDValue Chain,
1873 const ISD::InputArg &Arg) const {
1875 MachineFrameInfo &MFI = MF.getFrameInfo();
1876
1877 if (Arg.Flags.isByVal()) {
1878 unsigned Size = Arg.Flags.getByValSize();
1879 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1880 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1881 }
1882
1883 unsigned ArgOffset = VA.getLocMemOffset();
1884 unsigned ArgSize = VA.getValVT().getStoreSize();
1885
1886 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1887
1888 // Create load nodes to retrieve arguments from the stack.
1889 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1890 SDValue ArgValue;
1891
1892  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1894 MVT MemVT = VA.getValVT();
1895
1896 switch (VA.getLocInfo()) {
1897 default:
1898 break;
1899 case CCValAssign::BCvt:
1900 MemVT = VA.getLocVT();
1901 break;
1902 case CCValAssign::SExt:
1903 ExtType = ISD::SEXTLOAD;
1904 break;
1905 case CCValAssign::ZExt:
1906 ExtType = ISD::ZEXTLOAD;
1907 break;
1908 case CCValAssign::AExt:
1909 ExtType = ISD::EXTLOAD;
1910 break;
1911 }
1912
1913 ArgValue = DAG.getExtLoad(
1914 ExtType, SL, VA.getLocVT(), Chain, FIN,
1916 MemVT);
1917 return ArgValue;
1918}
1919
1920SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1921 const SIMachineFunctionInfo &MFI,
1922 EVT VT,
1924 const ArgDescriptor *Reg;
1925 const TargetRegisterClass *RC;
1926 LLT Ty;
1927
1928 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1929 if (!Reg) {
1931 // It's possible for a kernarg intrinsic call to appear in a kernel with
1932 // no allocated segment, in which case we do not add the user sgpr
1933 // argument, so just return null.
1934 return DAG.getConstant(0, SDLoc(), VT);
1935 }
1936
1937 // It's undefined behavior if a function marked with the amdgpu-no-*
1938 // attributes uses the corresponding intrinsic.
1939 return DAG.getUNDEF(VT);
1940 }
1941
1942 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
1943}
1944
1946 CallingConv::ID CallConv,
1947 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
1948 FunctionType *FType,
1949 SIMachineFunctionInfo *Info) {
1950 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1951 const ISD::InputArg *Arg = &Ins[I];
1952
1953 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1954 "vector type argument should have been split");
1955
1956 // First check if it's a PS input addr.
1957 if (CallConv == CallingConv::AMDGPU_PS &&
1958 !Arg->Flags.isInReg() && PSInputNum <= 15) {
1959 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1960
1961 // Inconveniently only the first part of the split is marked as isSplit,
1962 // so skip to the end. We only want to increment PSInputNum once for the
1963 // entire split argument.
1964 if (Arg->Flags.isSplit()) {
1965 while (!Arg->Flags.isSplitEnd()) {
1966 assert((!Arg->VT.isVector() ||
1967 Arg->VT.getScalarSizeInBits() == 16) &&
1968 "unexpected vector split in ps argument type");
1969 if (!SkipArg)
1970 Splits.push_back(*Arg);
1971 Arg = &Ins[++I];
1972 }
1973 }
1974
1975 if (SkipArg) {
1976 // We can safely skip PS inputs.
1977 Skipped.set(Arg->getOrigArgIndex());
1978 ++PSInputNum;
1979 continue;
1980 }
1981
1982 Info->markPSInputAllocated(PSInputNum);
1983 if (Arg->Used)
1984 Info->markPSInputEnabled(PSInputNum);
1985
1986 ++PSInputNum;
1987 }
1988
1989 Splits.push_back(*Arg);
1990 }
1991}
1992
1993// Allocate special inputs passed in VGPRs.
1995 MachineFunction &MF,
1996 const SIRegisterInfo &TRI,
1997 SIMachineFunctionInfo &Info) const {
1998 const LLT S32 = LLT::scalar(32);
2000
2001 if (Info.hasWorkItemIDX()) {
2002 Register Reg = AMDGPU::VGPR0;
2003 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2004
2005 CCInfo.AllocateReg(Reg);
2006 unsigned Mask = (Subtarget->hasPackedTID() &&
2007 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2008 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2009 }
2010
2011 if (Info.hasWorkItemIDY()) {
2012 assert(Info.hasWorkItemIDX());
2013 if (Subtarget->hasPackedTID()) {
2014 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2015 0x3ff << 10));
2016 } else {
2017 unsigned Reg = AMDGPU::VGPR1;
2018 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2019
2020 CCInfo.AllocateReg(Reg);
2021 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2022 }
2023 }
2024
2025 if (Info.hasWorkItemIDZ()) {
2026 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2027 if (Subtarget->hasPackedTID()) {
2028 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2029 0x3ff << 20));
2030 } else {
2031 unsigned Reg = AMDGPU::VGPR2;
2032 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2033
2034 CCInfo.AllocateReg(Reg);
2035 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2036 }
2037 }
2038}
2039
2040 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2041 // VGPRs are left, allocate a stack slot instead.
2042 // If \p Mask is given it indicates the bitfield position in the register.
2043 // If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2044static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2045 ArgDescriptor Arg = ArgDescriptor()) {
2046 if (Arg.isSet())
2047 return ArgDescriptor::createArg(Arg, Mask);
2048
2049 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2050 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2051 if (RegIdx == ArgVGPRs.size()) {
2052 // Spill to stack required.
2053 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2054
2055 return ArgDescriptor::createStack(Offset, Mask);
2056 }
2057
2058 unsigned Reg = ArgVGPRs[RegIdx];
2059 Reg = CCInfo.AllocateReg(Reg);
2060 assert(Reg != AMDGPU::NoRegister);
2061
2062 MachineFunction &MF = CCInfo.getMachineFunction();
2063 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2064 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2065 return ArgDescriptor::createRegister(Reg, Mask);
2066}
2067
2069 const TargetRegisterClass *RC,
2070 unsigned NumArgRegs) {
2071 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2072 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2073 if (RegIdx == ArgSGPRs.size())
2074 report_fatal_error("ran out of SGPRs for arguments");
2075
2076 unsigned Reg = ArgSGPRs[RegIdx];
2077 Reg = CCInfo.AllocateReg(Reg);
2078 assert(Reg != AMDGPU::NoRegister);
2079
2080 MachineFunction &MF = CCInfo.getMachineFunction();
2081 MF.addLiveIn(Reg, RC);
2082 return ArgDescriptor::createRegister(Reg);
2083}
2084
2085// If this has a fixed position, we still should allocate the register in the
2086// CCInfo state. Technically we could get away with this for values passed
2087// outside of the normal argument range.
2089 const TargetRegisterClass *RC,
2090 MCRegister Reg) {
2091 Reg = CCInfo.AllocateReg(Reg);
2092 assert(Reg != AMDGPU::NoRegister);
2093 MachineFunction &MF = CCInfo.getMachineFunction();
2094 MF.addLiveIn(Reg, RC);
2095}
2096
2097static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2098 if (Arg) {
2099 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2100 Arg.getRegister());
2101 } else
2102 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2103}
2104
2105static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2106 if (Arg) {
2107 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2108 Arg.getRegister());
2109 } else
2110 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2111}
2112
2113/// Allocate implicit function VGPR arguments at the end of allocated user
2114/// arguments.
2116 CCState &CCInfo, MachineFunction &MF,
2117 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2118 const unsigned Mask = 0x3ff;
2119 ArgDescriptor Arg;
2120
2121 if (Info.hasWorkItemIDX()) {
2122 Arg = allocateVGPR32Input(CCInfo, Mask);
2123 Info.setWorkItemIDX(Arg);
2124 }
2125
2126 if (Info.hasWorkItemIDY()) {
2127 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2128 Info.setWorkItemIDY(Arg);
2129 }
2130
2131 if (Info.hasWorkItemIDZ())
2132 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2133}
2134
2135/// Allocate implicit function VGPR arguments in fixed registers.
2137 CCState &CCInfo, MachineFunction &MF,
2138 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2139 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2140 if (!Reg)
2141 report_fatal_error("failed to allocate VGPR for implicit arguments");
2142
2143 const unsigned Mask = 0x3ff;
2144 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2145 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2146 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2147}
2148
2150 CCState &CCInfo,
2151 MachineFunction &MF,
2152 const SIRegisterInfo &TRI,
2153 SIMachineFunctionInfo &Info) const {
2154 auto &ArgInfo = Info.getArgInfo();
2155 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2156
2157 // TODO: Unify handling with private memory pointers.
2158 if (UserSGPRInfo.hasDispatchPtr())
2159 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2160
2161 const Module *M = MF.getFunction().getParent();
2162 if (UserSGPRInfo.hasQueuePtr() &&
2164 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2165
2166 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2167 // constant offset from the kernarg segment.
2168 if (Info.hasImplicitArgPtr())
2169 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2170
2171 if (UserSGPRInfo.hasDispatchID())
2172 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2173
2174 // flat_scratch_init is not applicable for non-kernel functions.
2175
2176 if (Info.hasWorkGroupIDX())
2177 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2178
2179 if (Info.hasWorkGroupIDY())
2180 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2181
2182 if (Info.hasWorkGroupIDZ())
2183 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2184
2185 if (Info.hasLDSKernelId())
2186 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2187}
2188
2189// Allocate special inputs passed in user SGPRs.
2191 MachineFunction &MF,
2192 const SIRegisterInfo &TRI,
2193 SIMachineFunctionInfo &Info) const {
2194 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2195 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2196 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2197 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2198 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2199 }
2200
2201 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2202 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2203 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2204 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2205 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2206 }
2207
2208 if (UserSGPRInfo.hasDispatchPtr()) {
2209 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2210 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2211 CCInfo.AllocateReg(DispatchPtrReg);
2212 }
2213
2214 const Module *M = MF.getFunction().getParent();
2215 if (UserSGPRInfo.hasQueuePtr() &&
2217 Register QueuePtrReg = Info.addQueuePtr(TRI);
2218 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2219 CCInfo.AllocateReg(QueuePtrReg);
2220 }
2221
2222 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2224 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2225 CCInfo.AllocateReg(InputPtrReg);
2226
2227 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2228 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2229 }
2230
2231 if (UserSGPRInfo.hasDispatchID()) {
2232 Register DispatchIDReg = Info.addDispatchID(TRI);
2233 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2234 CCInfo.AllocateReg(DispatchIDReg);
2235 }
2236
2237 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2238 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2239 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2240 CCInfo.AllocateReg(FlatScratchInitReg);
2241 }
2242
2243 if (Info.hasLDSKernelId()) {
2244 Register Reg = Info.addLDSKernelId();
2245 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2246 CCInfo.AllocateReg(Reg);
2247 }
2248
2249 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2250 // these from the dispatch pointer.
2251}
2252
2253// Allocate special input registers that are initialized per-wave.
2255 MachineFunction &MF,
2257 CallingConv::ID CallConv,
2258 bool IsShader) const {
2259 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2260 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2261 // Note: user SGPRs are handled by the front-end for graphics shaders
2262 // Pad up the used user SGPRs with dead inputs.
2263
2264 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2265 // before enabling architected SGPRs for workgroup IDs.
2266 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2267
2268 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2269 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2270 // rely on it to reach 16 since if we end up having no stack usage, it will
2271 // not really be added.
2272 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2273 Info.hasWorkGroupIDY() +
2274 Info.hasWorkGroupIDZ() +
2275 Info.hasWorkGroupInfo();
2276 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2277 Register Reg = Info.addReservedUserSGPR();
2278 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2279 CCInfo.AllocateReg(Reg);
2280 }
2281 }
2282
2283 if (Info.hasWorkGroupIDX()) {
2284 Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
2285 if (!HasArchitectedSGPRs)
2286 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2287
2288 CCInfo.AllocateReg(Reg);
2289 }
2290
2291 if (Info.hasWorkGroupIDY()) {
2292 Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
2293 if (!HasArchitectedSGPRs)
2294 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2295
2296 CCInfo.AllocateReg(Reg);
2297 }
2298
2299 if (Info.hasWorkGroupIDZ()) {
2300 Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
2301 if (!HasArchitectedSGPRs)
2302 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2303
2304 CCInfo.AllocateReg(Reg);
2305 }
2306
2307 if (Info.hasWorkGroupInfo()) {
2308 Register Reg = Info.addWorkGroupInfo();
2309 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2310 CCInfo.AllocateReg(Reg);
2311 }
2312
2313 if (Info.hasPrivateSegmentWaveByteOffset()) {
2314 // Scratch wave offset passed in system SGPR.
2315 unsigned PrivateSegmentWaveByteOffsetReg;
2316
2317 if (IsShader) {
2318 PrivateSegmentWaveByteOffsetReg =
2319 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2320
2321 // This is true if the scratch wave byte offset doesn't have a fixed
2322 // location.
2323 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2324 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2325 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2326 }
2327 } else
2328 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2329
2330 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2331 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2332 }
2333
2334 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2335 Info.getNumPreloadedSGPRs() >= 16);
2336}
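// Worked example for the hasUserSGPRInit16Bug() padding above (hypothetical
// counts, for illustration only): with 12 user SGPRs already added and only
// workgroup IDs X and Y required (2 system SGPRs, the wave byte offset not
// counted), the loop runs for i = 14 and i = 15 and adds 16 - (12 + 2) = 2
// dead user SGPRs so that at least 16 SGPRs end up initialized.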
2337
2339 MachineFunction &MF,
2340 const SIRegisterInfo &TRI,
2341 SIMachineFunctionInfo &Info) {
2342 // Now that we've figured out where the scratch register inputs are, see if we
2343 // should reserve the arguments and use them directly.
2344 MachineFrameInfo &MFI = MF.getFrameInfo();
2345 bool HasStackObjects = MFI.hasStackObjects();
2346 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2347
2348 // Record that we know we have non-spill stack objects so we don't need to
2349 // check all stack objects later.
2350 if (HasStackObjects)
2351 Info.setHasNonSpillStackObjects(true);
2352
2353 // Everything live out of a block is spilled with fast regalloc, so it's
2354 // almost certain that spilling will be required.
2355 if (TM.getOptLevel() == CodeGenOptLevel::None)
2356 HasStackObjects = true;
2357
2358 // For now assume stack access is needed in any callee functions, so we need
2359 // the scratch registers to pass in.
2360 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2361
2362 if (!ST.enableFlatScratch()) {
2363 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2364 // If we have stack objects, we unquestionably need the private buffer
2365 // resource. For the Code Object V2 ABI, this will be the first 4 user
2366 // SGPR inputs. We can reserve those and use them directly.
2367
2368 Register PrivateSegmentBufferReg =
2370 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2371 } else {
2372 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2373 // We tentatively reserve the last registers (skipping the last registers
2374 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2375 // we'll replace these with the ones immediately after those which were
2376 // really allocated. In the prologue copies will be inserted from the
2377 // argument to these reserved registers.
2378
2379 // Without HSA, relocations are used for the scratch pointer and the
2380 // buffer resource setup is always inserted in the prologue. Scratch wave
2381 // offset is still in an input SGPR.
2382 Info.setScratchRSrcReg(ReservedBufferReg);
2383 }
2384 }
2385
2387
2388 // For entry functions we have to set up the stack pointer if we use it,
2389 // whereas non-entry functions get this "for free". This means there is no
2390 // intrinsic advantage to using S32 over S34 in cases where we do not have
2391 // calls but do need a frame pointer (i.e. if we are requested to have one
2392 // because frame pointer elimination is disabled). To keep things simple we
2393 // only ever use S32 as the call ABI stack pointer, and so using it does not
2394 // imply we need a separate frame pointer.
2395 //
2396 // Try to use s32 as the SP, but move it if it would interfere with input
2397 // arguments. This won't work with calls though.
2398 //
2399 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2400 // registers.
2401 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2402 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2403 } else {
2405
2406 if (MFI.hasCalls())
2407 report_fatal_error("call in graphics shader with too many input SGPRs");
2408
2409 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2410 if (!MRI.isLiveIn(Reg)) {
2411 Info.setStackPtrOffsetReg(Reg);
2412 break;
2413 }
2414 }
2415
2416 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2417 report_fatal_error("failed to find register for SP");
2418 }
2419
2420 // hasFP should be accurate for entry functions even before the frame is
2421 // finalized, because it does not rely on the known stack size, only
2422 // properties like whether variable sized objects are present.
2423 if (ST.getFrameLowering()->hasFP(MF)) {
2424 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2425 }
2426}
2427
2430 return !Info->isEntryFunction();
2431}
2432
2434
2435}
2436
2438 MachineBasicBlock *Entry,
2439 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2441
2442 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2443 if (!IStart)
2444 return;
2445
2446 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2447 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2448 MachineBasicBlock::iterator MBBI = Entry->begin();
2449 for (const MCPhysReg *I = IStart; *I; ++I) {
2450 const TargetRegisterClass *RC = nullptr;
2451 if (AMDGPU::SReg_64RegClass.contains(*I))
2452 RC = &AMDGPU::SGPR_64RegClass;
2453 else if (AMDGPU::SReg_32RegClass.contains(*I))
2454 RC = &AMDGPU::SGPR_32RegClass;
2455 else
2456 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2457
2458 Register NewVR = MRI->createVirtualRegister(RC);
2459 // Create copy from CSR to a virtual register.
2460 Entry->addLiveIn(*I);
2461 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2462 .addReg(*I);
2463
2464 // Insert the copy-back instructions right before the terminator.
2465 for (auto *Exit : Exits)
2466 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2467 TII->get(TargetOpcode::COPY), *I)
2468 .addReg(NewVR);
2469 }
2470}
2471
2473 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2474 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2475 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2477
2479 const Function &Fn = MF.getFunction();
2482
2483 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2484 DiagnosticInfoUnsupported NoGraphicsHSA(
2485 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2486 DAG.getContext()->diagnose(NoGraphicsHSA);
2487 return DAG.getEntryNode();
2488 }
2489
2492 BitVector Skipped(Ins.size());
2493 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2494 *DAG.getContext());
2495
2496 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2497 bool IsKernel = AMDGPU::isKernel(CallConv);
2498 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2499
2500 if (IsGraphics) {
2501 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2502 assert(!UserSGPRInfo.hasDispatchPtr() &&
2503 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2504 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2505 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2506 (void)UserSGPRInfo;
2507 if (!Subtarget->enableFlatScratch())
2508 assert(!UserSGPRInfo.hasFlatScratchInit());
2509 if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
2510 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2511 !Info->hasWorkGroupIDZ());
2512 }
2513
2514 if (CallConv == CallingConv::AMDGPU_PS) {
2515 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2516
2517 // At least one interpolation mode must be enabled or else the GPU will
2518 // hang.
2519 //
2520 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2521 // set PSInputAddr, the user wants to enable some bits after the compilation
2522 // based on run-time states. Since we can't know what the final PSInputEna
2523 // will look like, we shouldn't do anything here and the user should take
2524 // responsibility for the correct programming.
2525 //
2526 // Otherwise, the following restrictions apply:
2527 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2528 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2529 // enabled too.
2530 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2531 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2532 CCInfo.AllocateReg(AMDGPU::VGPR0);
2533 CCInfo.AllocateReg(AMDGPU::VGPR1);
2534 Info->markPSInputAllocated(0);
2535 Info->markPSInputEnabled(0);
2536 }
2537 if (Subtarget->isAmdPalOS()) {
2538 // For isAmdPalOS, the user does not enable some bits after compilation
2539 // based on run-time states; the register values being generated here are
2540 // the final ones set in hardware. Therefore we need to apply the
2541 // workaround to PSInputAddr and PSInputEnable together. (The case where
2542 // a bit is set in PSInputAddr but not PSInputEnable is where the
2543 // frontend set up an input arg for a particular interpolation mode, but
2544 // nothing uses that input arg. Really we should have an earlier pass
2545 // that removes such an arg.)
2546 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2547 if ((PsInputBits & 0x7F) == 0 ||
2548 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2549 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2550 }
2551 } else if (IsKernel) {
2552 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2553 } else {
2554 Splits.append(Ins.begin(), Ins.end());
2555 }
2556
2557 if (IsEntryFunc) {
2558 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2559 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2560 } else if (!IsGraphics) {
2561 // For the fixed ABI, pass workitem IDs in the last argument register.
2562 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2563 }
2564
2565 if (IsKernel) {
2566 analyzeFormalArgumentsCompute(CCInfo, Ins);
2567 } else {
2568 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2569 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2570 }
2571
2573
2574 // FIXME: This is the minimum kernel argument alignment. We should improve
2575 // this to the maximum alignment of the arguments.
2576 //
2577 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2578 // kern arg offset.
2579 const Align KernelArgBaseAlign = Align(16);
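// Illustration of the alignment computed below: commonAlignment(Align(16),
// Offset) is the largest power of two dividing both 16 and the byte offset,
// so an argument at offset 0 or 32 loads with 16-byte alignment, one at
// offset 4 with 4-byte alignment, and one at offset 8 with 8-byte alignment.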
2580
2581 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2582 const ISD::InputArg &Arg = Ins[i];
2583 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2584 InVals.push_back(DAG.getUNDEF(Arg.VT));
2585 continue;
2586 }
2587
2588 CCValAssign &VA = ArgLocs[ArgIdx++];
2589 MVT VT = VA.getLocVT();
2590
2591 if (IsEntryFunc && VA.isMemLoc()) {
2592 VT = Ins[i].VT;
2593 EVT MemVT = VA.getLocVT();
2594
2595 const uint64_t Offset = VA.getLocMemOffset();
2596 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2597
2598 if (Arg.Flags.isByRef()) {
2599 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2600
2601 const GCNTargetMachine &TM =
2602 static_cast<const GCNTargetMachine &>(getTargetMachine());
2603 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2604 Arg.Flags.getPointerAddrSpace())) {
2607 }
2608
2609 InVals.push_back(Ptr);
2610 continue;
2611 }
2612
2613 SDValue Arg = lowerKernargMemParameter(
2614 DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2615 Chains.push_back(Arg.getValue(1));
2616
2617 auto *ParamTy =
2618 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2620 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2621 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2622 // On SI local pointers are just offsets into LDS, so they are always
2623 // less than 16-bits. On CI and newer they could potentially be
2624 // real pointers, so we can't guarantee their size.
2625 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2626 DAG.getValueType(MVT::i16));
2627 }
2628
2629 InVals.push_back(Arg);
2630 continue;
2631 } else if (!IsEntryFunc && VA.isMemLoc()) {
2632 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2633 InVals.push_back(Val);
2634 if (!Arg.Flags.isByVal())
2635 Chains.push_back(Val.getValue(1));
2636 continue;
2637 }
2638
2639 assert(VA.isRegLoc() && "Parameter must be in a register!");
2640
2641 Register Reg = VA.getLocReg();
2642 const TargetRegisterClass *RC = nullptr;
2643 if (AMDGPU::VGPR_32RegClass.contains(Reg))
2644 RC = &AMDGPU::VGPR_32RegClass;
2645 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2646 RC = &AMDGPU::SGPR_32RegClass;
2647 else
2648 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2649 EVT ValVT = VA.getValVT();
2650
2651 Reg = MF.addLiveIn(Reg, RC);
2652 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2653
2654 if (Arg.Flags.isSRet()) {
2655 // The return object should be reasonably addressable.
2656
2657 // FIXME: This helps when the return is a real sret. If it is an
2658 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2659 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2660 unsigned NumBits
2662 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2663 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2664 }
2665
2666 // If this is an 8 or 16-bit value, it is really passed promoted
2667 // to 32 bits. Insert an assert[sz]ext to capture this, then
2668 // truncate to the right size.
2669 switch (VA.getLocInfo()) {
2670 case CCValAssign::Full:
2671 break;
2672 case CCValAssign::BCvt:
2673 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2674 break;
2675 case CCValAssign::SExt:
2676 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2677 DAG.getValueType(ValVT));
2678 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2679 break;
2680 case CCValAssign::ZExt:
2681 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2682 DAG.getValueType(ValVT));
2683 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2684 break;
2685 case CCValAssign::AExt:
2686 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2687 break;
2688 default:
2689 llvm_unreachable("Unknown loc info!");
2690 }
2691
2692 InVals.push_back(Val);
2693 }
2694
2695 // Start adding system SGPRs.
2696 if (IsEntryFunc) {
2697 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2698 } else {
2699 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2700 if (!IsGraphics)
2701 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2702 }
2703
2704 auto &ArgUsageInfo =
2706 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2707
2708 unsigned StackArgSize = CCInfo.getStackSize();
2709 Info->setBytesInStackArgArea(StackArgSize);
2710
2711 return Chains.empty() ? Chain :
2712 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2713}
2714
2715// TODO: If return values can't fit in registers, we should return as many as
2716// possible in registers before passing on stack.
2718 CallingConv::ID CallConv,
2719 MachineFunction &MF, bool IsVarArg,
2721 LLVMContext &Context) const {
2722 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2723 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2724 // for shaders. Vector types should be explicitly handled by CC.
2725 if (AMDGPU::isEntryFunctionCC(CallConv))
2726 return true;
2727
2729 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2730 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
2731 return false;
2732
2733 // We must use the stack if return would require unavailable registers.
2734 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
2735 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
2736 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
2737 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
2738 return false;
2739
2740 return true;
2741}
2742
2743SDValue
2745 bool isVarArg,
2747 const SmallVectorImpl<SDValue> &OutVals,
2748 const SDLoc &DL, SelectionDAG &DAG) const {
2751
2752 if (AMDGPU::isKernel(CallConv)) {
2753 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2754 OutVals, DL, DAG);
2755 }
2756
2757 bool IsShader = AMDGPU::isShader(CallConv);
2758
2759 Info->setIfReturnsVoid(Outs.empty());
2760 bool IsWaveEnd = Info->returnsVoid() && IsShader;
2761
2762 // CCValAssign - represent the assignment of the return value to a location.
2765
2766 // CCState - Info about the registers and stack slots.
2767 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2768 *DAG.getContext());
2769
2770 // Analyze outgoing return values.
2771 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2772
2773 SDValue Glue;
2775 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2776
2777 // Copy the result values into the output registers.
2778 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2779 ++I, ++RealRVLocIdx) {
2780 CCValAssign &VA = RVLocs[I];
2781 assert(VA.isRegLoc() && "Can only return in registers!");
2782 // TODO: Partially return in registers if return values don't fit.
2783 SDValue Arg = OutVals[RealRVLocIdx];
2784
2785 // Copied from other backends.
2786 switch (VA.getLocInfo()) {
2787 case CCValAssign::Full:
2788 break;
2789 case CCValAssign::BCvt:
2790 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2791 break;
2792 case CCValAssign::SExt:
2793 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2794 break;
2795 case CCValAssign::ZExt:
2796 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2797 break;
2798 case CCValAssign::AExt:
2799 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2800 break;
2801 default:
2802 llvm_unreachable("Unknown loc info!");
2803 }
2804
2805 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
2806 Glue = Chain.getValue(1);
2807 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2808 }
2809
2810 // FIXME: Does sret work properly?
2811 if (!Info->isEntryFunction()) {
2812 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2813 const MCPhysReg *I =
2814 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2815 if (I) {
2816 for (; *I; ++I) {
2817 if (AMDGPU::SReg_64RegClass.contains(*I))
2818 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2819 else if (AMDGPU::SReg_32RegClass.contains(*I))
2820 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2821 else
2822 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2823 }
2824 }
2825 }
2826
2827 // Update chain and glue.
2828 RetOps[0] = Chain;
2829 if (Glue.getNode())
2830 RetOps.push_back(Glue);
2831
2832 unsigned Opc = AMDGPUISD::ENDPGM;
2833 if (!IsWaveEnd)
2835 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2836}
2837
2839 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
2840 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2841 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2842 SDValue ThisVal) const {
2843 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2844
2845 // Assign locations to each value returned by this call.
2847 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2848 *DAG.getContext());
2849 CCInfo.AnalyzeCallResult(Ins, RetCC);
2850
2851 // Copy all of the result registers out of their specified physreg.
2852 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2853 CCValAssign VA = RVLocs[i];
2854 SDValue Val;
2855
2856 if (VA.isRegLoc()) {
2857 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
2858 Chain = Val.getValue(1);
2859 InGlue = Val.getValue(2);
2860 } else if (VA.isMemLoc()) {
2861 report_fatal_error("TODO: return values in memory");
2862 } else
2863 llvm_unreachable("unknown argument location type");
2864
2865 switch (VA.getLocInfo()) {
2866 case CCValAssign::Full:
2867 break;
2868 case CCValAssign::BCvt:
2869 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2870 break;
2871 case CCValAssign::ZExt:
2872 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2873 DAG.getValueType(VA.getValVT()));
2874 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2875 break;
2876 case CCValAssign::SExt:
2877 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2878 DAG.getValueType(VA.getValVT()));
2879 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2880 break;
2881 case CCValAssign::AExt:
2882 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2883 break;
2884 default:
2885 llvm_unreachable("Unknown loc info!");
2886 }
2887
2888 InVals.push_back(Val);
2889 }
2890
2891 return Chain;
2892}
2893
2894 // Add code to pass the special inputs required by the features in use, separate
2895 // from the explicit user arguments present in the IR.
2897 CallLoweringInfo &CLI,
2898 CCState &CCInfo,
2899 const SIMachineFunctionInfo &Info,
2900 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2901 SmallVectorImpl<SDValue> &MemOpChains,
2902 SDValue Chain) const {
2903 // If we don't have a call site, this was a call inserted by
2904 // legalization. These can never use special inputs.
2905 if (!CLI.CB)
2906 return;
2907
2908 SelectionDAG &DAG = CLI.DAG;
2909 const SDLoc &DL = CLI.DL;
2910 const Function &F = DAG.getMachineFunction().getFunction();
2911
2912 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2913 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2914
2915 const AMDGPUFunctionArgInfo *CalleeArgInfo
2917 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2918 auto &ArgUsageInfo =
2920 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2921 }
2922
2923 // TODO: Unify with private memory register handling. This is complicated by
2924 // the fact that at least in kernels, the input argument is not necessarily
2925 // in the same location as the input.
2926 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2928 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2929 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2930 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2931 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2932 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2933 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
2934 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
2935 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
2936 };
2937
2938 for (auto Attr : ImplicitAttrs) {
2939 const ArgDescriptor *OutgoingArg;
2940 const TargetRegisterClass *ArgRC;
2941 LLT ArgTy;
2942
2943 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2944
2945 // If the callee does not use the attribute value, skip copying the value.
2946 if (CLI.CB->hasFnAttr(Attr.second))
2947 continue;
2948
2949 std::tie(OutgoingArg, ArgRC, ArgTy) =
2950 CalleeArgInfo->getPreloadedValue(InputID);
2951 if (!OutgoingArg)
2952 continue;
2953
2954 const ArgDescriptor *IncomingArg;
2955 const TargetRegisterClass *IncomingArgRC;
2956 LLT Ty;
2957 std::tie(IncomingArg, IncomingArgRC, Ty) =
2958 CallerArgInfo.getPreloadedValue(InputID);
2959 assert(IncomingArgRC == ArgRC);
2960
2961 // All special arguments are ints for now.
2962 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2963 SDValue InputReg;
2964
2965 if (IncomingArg) {
2966 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2967 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2968 // The implicit arg ptr is special because it doesn't have a corresponding
2969 // input for kernels, and is computed from the kernarg segment pointer.
2970 InputReg = getImplicitArgPtr(DAG, DL);
2971 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
2972 std::optional<uint32_t> Id =
2974 if (Id.has_value()) {
2975 InputReg = DAG.getConstant(*Id, DL, ArgVT);
2976 } else {
2977 InputReg = DAG.getUNDEF(ArgVT);
2978 }
2979 } else {
2980 // We may have proven the input wasn't needed, although the ABI is
2981 // requiring it. We just need to allocate the register appropriately.
2982 InputReg = DAG.getUNDEF(ArgVT);
2983 }
2984
2985 if (OutgoingArg->isRegister()) {
2986 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2987 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2988 report_fatal_error("failed to allocate implicit input argument");
2989 } else {
2990 unsigned SpecialArgOffset =
2991 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2992 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2993 SpecialArgOffset);
2994 MemOpChains.push_back(ArgStore);
2995 }
2996 }
2997
2998 // Pack workitem IDs into a single register, or pass them as-is if already
2999 // packed.
3000 const ArgDescriptor *OutgoingArg;
3001 const TargetRegisterClass *ArgRC;
3002 LLT Ty;
3003
3004 std::tie(OutgoingArg, ArgRC, Ty) =
3006 if (!OutgoingArg)
3007 std::tie(OutgoingArg, ArgRC, Ty) =
3009 if (!OutgoingArg)
3010 std::tie(OutgoingArg, ArgRC, Ty) =
3012 if (!OutgoingArg)
3013 return;
3014
3015 const ArgDescriptor *IncomingArgX = std::get<0>(
3017 const ArgDescriptor *IncomingArgY = std::get<0>(
3019 const ArgDescriptor *IncomingArgZ = std::get<0>(
3021
3022 SDValue InputReg;
3023 SDLoc SL;
3024
3025 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3026 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3027 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3028
3029 // If incoming ids are not packed we need to pack them.
3030 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3031 NeedWorkItemIDX) {
3032 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3033 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3034 } else {
3035 InputReg = DAG.getConstant(0, DL, MVT::i32);
3036 }
3037 }
3038
3039 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3040 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3041 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3042 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3043 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3044 InputReg = InputReg.getNode() ?
3045 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3046 }
3047
3048 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3049 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3050 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3051 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3052 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3053 InputReg = InputReg.getNode() ?
3054 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3055 }
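// Worked example of the workitem-ID packing performed above (hypothetical IDs,
// for illustration): with X = 5, Y = 3, Z = 1 all required by the callee, the
// single i32 passed in the workitem-ID argument is
//   5 | (3 << 10) | (1 << 20) = 0x100C05,
// i.e. the same 10-bit fields the entry-point lowering uses for packed TIDs.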
3056
3057 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3058 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3059 // We're in a situation where the outgoing function requires the workitem
3060 // ID, but the calling function does not have it (e.g. a graphics function
3061 // calling a C calling convention function). This is illegal, but we need
3062 // to produce something.
3063 InputReg = DAG.getUNDEF(MVT::i32);
3064 } else {
3065 // Workitem IDs are already packed; any of the present incoming arguments
3066 // will carry all required fields.
3068 IncomingArgX ? *IncomingArgX :
3069 IncomingArgY ? *IncomingArgY :
3070 *IncomingArgZ, ~0u);
3071 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3072 }
3073 }
3074
3075 if (OutgoingArg->isRegister()) {
3076 if (InputReg)
3077 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3078
3079 CCInfo.AllocateReg(OutgoingArg->getRegister());
3080 } else {
3081 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3082 if (InputReg) {
3083 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3084 SpecialArgOffset);
3085 MemOpChains.push_back(ArgStore);
3086 }
3087 }
3088}
3089
3091 return CC == CallingConv::Fast;
3092}
3093
3094/// Return true if we might ever do TCO for calls with this calling convention.
3096 switch (CC) {
3097 case CallingConv::C:
3099 return true;
3100 default:
3101 return canGuaranteeTCO(CC);
3102 }
3103}
3104
3106 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3108 const SmallVectorImpl<SDValue> &OutVals,
3109 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3110 if (!mayTailCallThisCC(CalleeCC))
3111 return false;
3112
3113 // For a divergent call target, we need to do a waterfall loop over the
3114 // possible callees which precludes us from using a simple jump.
3115 if (Callee->isDivergent())
3116 return false;
3117
3119 const Function &CallerF = MF.getFunction();
3120 CallingConv::ID CallerCC = CallerF.getCallingConv();
3122 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3123
3124 // Kernels aren't callable, and don't have a live-in return address, so it
3125 // doesn't make sense to do a tail call with entry functions.
3126 if (!CallerPreserved)
3127 return false;
3128
3129 bool CCMatch = CallerCC == CalleeCC;
3130
3132 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3133 return true;
3134 return false;
3135 }
3136
3137 // TODO: Can we handle var args?
3138 if (IsVarArg)
3139 return false;
3140
3141 for (const Argument &Arg : CallerF.args()) {
3142 if (Arg.hasByValAttr())
3143 return false;
3144 }
3145
3146 LLVMContext &Ctx = *DAG.getContext();
3147
3148 // Check that the call results are passed in the same way.
3149 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3150 CCAssignFnForCall(CalleeCC, IsVarArg),
3151 CCAssignFnForCall(CallerCC, IsVarArg)))
3152 return false;
3153
3154 // The callee has to preserve all registers the caller needs to preserve.
3155 if (!CCMatch) {
3156 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3157 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3158 return false;
3159 }
3160
3161 // Nothing more to check if the callee is taking no arguments.
3162 if (Outs.empty())
3163 return true;
3164
3166 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3167
3168 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3169
3170 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3171 // If the stack arguments for this call do not fit into our own save area then
3172 // the call cannot be made tail.
3173 // TODO: Is this really necessary?
3174 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3175 return false;
3176
3177 const MachineRegisterInfo &MRI = MF.getRegInfo();
3178 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3179}
3180
3182 if (!CI->isTailCall())
3183 return false;
3184
3185 const Function *ParentFn = CI->getParent()->getParent();
3187 return false;
3188 return true;
3189}
3190
3191// The wave scratch offset register is used as the global base pointer.
3193 SmallVectorImpl<SDValue> &InVals) const {
3194 SelectionDAG &DAG = CLI.DAG;
3195 const SDLoc &DL = CLI.DL;
3197 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3199 SDValue Chain = CLI.Chain;
3200 SDValue Callee = CLI.Callee;
3201 bool &IsTailCall = CLI.IsTailCall;
3202 CallingConv::ID CallConv = CLI.CallConv;
3203 bool IsVarArg = CLI.IsVarArg;
3204 bool IsSibCall = false;
3205 bool IsThisReturn = false;
3207
3208 if (Callee.isUndef() || isNullConstant(Callee)) {
3209 if (!CLI.IsTailCall) {
3210 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3211 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3212 }
3213
3214 return Chain;
3215 }
3216
3217 if (IsVarArg) {
3218 return lowerUnhandledCall(CLI, InVals,
3219 "unsupported call to variadic function ");
3220 }
3221
3222 if (!CLI.CB)
3223 report_fatal_error("unsupported libcall legalization");
3224
3225 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3226 return lowerUnhandledCall(CLI, InVals,
3227 "unsupported required tail call to function ");
3228 }
3229
3230 if (IsTailCall) {
3232 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3233 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3234 report_fatal_error("failed to perform tail call elimination on a call "
3235 "site marked musttail");
3236 }
3237
3238 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3239
3240 // A sibling call is one where we're under the usual C ABI and not planning
3241 // to change that but can still do a tail call:
3242 if (!TailCallOpt && IsTailCall)
3243 IsSibCall = true;
3244
3245 if (IsTailCall)
3246 ++NumTailCalls;
3247 }
3248
3251 SmallVector<SDValue, 8> MemOpChains;
3252
3253 // Analyze operands of the call, assigning locations to each operand.
3255 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3256 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3257
3258 if (CallConv != CallingConv::AMDGPU_Gfx) {
3259 // With a fixed ABI, allocate fixed registers before user arguments.
3260 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3261 }
3262
3263 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3264
3265 // Get a count of how many bytes are to be pushed on the stack.
3266 unsigned NumBytes = CCInfo.getStackSize();
3267
3268 if (IsSibCall) {
3269 // Since we're not changing the ABI to make this a tail call, the memory
3270 // operands are already available in the caller's incoming argument space.
3271 NumBytes = 0;
3272 }
3273
3274 // FPDiff is the byte offset of the call's argument area from the callee's.
3275 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3276 // by this amount for a tail call. In a sibling call it must be 0 because the
3277 // caller will deallocate the entire stack and the callee still expects its
3278 // arguments to begin at SP+0. Completely unused for non-tail calls.
3279 int32_t FPDiff = 0;
3280 MachineFrameInfo &MFI = MF.getFrameInfo();
3281
3282 // Adjust the stack pointer for the new arguments...
3283 // These operations are automatically eliminated by the prolog/epilog pass
3284 if (!IsSibCall) {
3285 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3286
3287 if (!Subtarget->enableFlatScratch()) {
3288 SmallVector<SDValue, 4> CopyFromChains;
3289
3290 // In the HSA case, this should be an identity copy.
3291 SDValue ScratchRSrcReg
3292 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3293 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3294 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3295 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3296 }
3297 }
3298
3299 MVT PtrVT = MVT::i32;
3300
3301 // Walk the register/memloc assignments, inserting copies/loads.
3302 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3303 CCValAssign &VA = ArgLocs[i];
3304 SDValue Arg = OutVals[i];
3305
3306 // Promote the value if needed.
3307 switch (VA.getLocInfo()) {
3308 case CCValAssign::Full:
3309 break;
3310 case CCValAssign::BCvt:
3311 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3312 break;
3313 case CCValAssign::ZExt:
3314 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3315 break;
3316 case CCValAssign::SExt:
3317 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3318 break;
3319 case CCValAssign::AExt:
3320 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3321 break;
3322 case CCValAssign::FPExt:
3323 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3324 break;
3325 default:
3326 llvm_unreachable("Unknown loc info!");
3327 }
3328
3329 if (VA.isRegLoc()) {
3330 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3331 } else {
3332 assert(VA.isMemLoc());
3333
3334 SDValue DstAddr;
3335 MachinePointerInfo DstInfo;
3336
3337 unsigned LocMemOffset = VA.getLocMemOffset();
3338 int32_t Offset = LocMemOffset;
3339
3340 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3341 MaybeAlign Alignment;
3342
3343 if (IsTailCall) {
3344 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3345 unsigned OpSize = Flags.isByVal() ?
3346 Flags.getByValSize() : VA.getValVT().getStoreSize();
3347
3348 // FIXME: We can have better than the minimum byval required alignment.
3349 Alignment =
3350 Flags.isByVal()
3351 ? Flags.getNonZeroByValAlign()
3352 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3353
3354 Offset = Offset + FPDiff;
3355 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3356
3357 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3358 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3359
3360 // Make sure any stack arguments overlapping with where we're storing
3361 // are loaded before this eventual operation. Otherwise they'll be
3362 // clobbered.
3363
3364 // FIXME: Why is this really necessary? This seems to just result in a
3365 // lot of code to copy the stack arguments and write them back to the same
3366 // locations, which are supposed to be immutable?
3367 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3368 } else {
3369 // Stores to the argument stack area are relative to the stack pointer.
3370 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3371 MVT::i32);
3372 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3373 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3374 Alignment =
3375 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3376 }
3377
3378 if (Outs[i].Flags.isByVal()) {
3379 SDValue SizeNode =
3380 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3381 SDValue Cpy =
3382 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3383 Outs[i].Flags.getNonZeroByValAlign(),
3384 /*isVol = */ false, /*AlwaysInline = */ true,
3385 /*isTailCall = */ false, DstInfo,
3387
3388 MemOpChains.push_back(Cpy);
3389 } else {
3390 SDValue Store =
3391 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3392 MemOpChains.push_back(Store);
3393 }
3394 }
3395 }
3396
3397 if (!MemOpChains.empty())
3398 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3399
3400 // Build a sequence of copy-to-reg nodes chained together with token chain
3401 // and flag operands which copy the outgoing args into the appropriate regs.
3402 SDValue InGlue;
3403 for (auto &RegToPass : RegsToPass) {
3404 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3405 RegToPass.second, InGlue);
3406 InGlue = Chain.getValue(1);
3407 }
3408
3409
3410 // We don't usually want to end the call-sequence here because we would tidy
3411 // the frame up *after* the call. However, in the ABI-changing tail-call case
3412 // we've carefully laid out the parameters so that when sp is reset they'll be
3413 // in the correct location.
3414 if (IsTailCall && !IsSibCall) {
3415 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3416 InGlue = Chain.getValue(1);
3417 }
3418
3419 std::vector<SDValue> Ops;
3420 Ops.push_back(Chain);
3421 Ops.push_back(Callee);
3422 // Add a redundant copy of the callee global which will not be legalized, as
3423 // we need direct access to the callee later.
3424 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3425 const GlobalValue *GV = GSD->getGlobal();
3426 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3427 } else {
3428 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3429 }
3430
3431 if (IsTailCall) {
3432 // Each tail call may have to adjust the stack by a different amount, so
3433 // this information must travel along with the operation for eventual
3434 // consumption by emitEpilogue.
3435 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3436 }
3437
3438 // Add argument registers to the end of the list so that they are known live
3439 // into the call.
3440 for (auto &RegToPass : RegsToPass) {
3441 Ops.push_back(DAG.getRegister(RegToPass.first,
3442 RegToPass.second.getValueType()));
3443 }
3444
3445 // Add a register mask operand representing the call-preserved registers.
3446
3447 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3448 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3449 assert(Mask && "Missing call preserved mask for calling convention");
3450 Ops.push_back(DAG.getRegisterMask(Mask));
3451
3452 if (InGlue.getNode())
3453 Ops.push_back(InGlue);
3454
3455 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3456
3457 // If we're doing a tail call, use a TC_RETURN here rather than an
3458 // actual call instruction.
3459 if (IsTailCall) {
3460 MFI.setHasTailCall();
3461 unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
3463 return DAG.getNode(OPC, DL, NodeTys, Ops);
3464 }
3465
3466 // Returns a chain and a flag for retval copy to use.
3467 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3468 Chain = Call.getValue(0);
3469 InGlue = Call.getValue(1);
3470
3471 uint64_t CalleePopBytes = NumBytes;
3472 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3473 if (!Ins.empty())
3474 InGlue = Chain.getValue(1);
3475
3476 // Handle result values, copying them out of physregs into vregs that we
3477 // return.
3478 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3479 InVals, IsThisReturn,
3480 IsThisReturn ? OutVals[0] : SDValue());
3481}
3482
3483// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3484// except for applying the wave size scale to the increment amount.
3486 SDValue Op, SelectionDAG &DAG) const {
3487 const MachineFunction &MF = DAG.getMachineFunction();
3489
3490 SDLoc dl(Op);
3491 EVT VT = Op.getValueType();
3492 SDValue Tmp1 = Op;
3493 SDValue Tmp2 = Op.getValue(1);
3494 SDValue Tmp3 = Op.getOperand(2);
3495 SDValue Chain = Tmp1.getOperand(0);
3496
3497 Register SPReg = Info->getStackPtrOffsetReg();
3498
3499 // Chain the dynamic stack allocation so that it doesn't modify the stack
3500 // pointer when other instructions are using the stack.
3501 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3502
3503 SDValue Size = Tmp2.getOperand(1);
3504 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3505 Chain = SP.getValue(1);
3506 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3507 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3508 unsigned Opc =
3511
3512 SDValue ScaledSize = DAG.getNode(
3513 ISD::SHL, dl, VT, Size,
3514 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3515
3516 Align StackAlign = TFL->getStackAlign();
3517 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3518 if (Alignment && *Alignment > StackAlign) {
3519 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3520 DAG.getConstant(-(uint64_t)Alignment->value()
3521 << Subtarget->getWavefrontSizeLog2(),
3522 dl, VT));
3523 }
3524
3525 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3526 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3527
3528 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3529}
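// Worked example of the wave scaling above (hypothetical sizes): on a wave64
// subtarget getWavefrontSizeLog2() == 6, so a 16-byte-per-lane alloca bumps
// the per-wave scratch SP by 16 << 6 = 1024 bytes, and a requested 32-byte
// per-lane alignment becomes an AND with -(32 << 6) = -2048 on the scaled
// pointer.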
3530
3532 SelectionDAG &DAG) const {
3533 // We only handle constant sizes here to allow non-entry block, static sized
3534 // allocas. A truly dynamic value is more difficult to support because we
3535 // don't know if the size value is uniform or not. If the size isn't uniform,
3536 // we would need to do a wave reduction to get the maximum size to know how
3537 // much to increment the uniform stack pointer.
3538 SDValue Size = Op.getOperand(1);
3539 if (isa<ConstantSDNode>(Size))
3540 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3541
3543}
3544
3546 if (Op.getValueType() != MVT::i32)
3547 return Op; // Defer to cannot select error.
3548
3550 SDLoc SL(Op);
3551
3552 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3553
3554 // Convert from wave uniform to swizzled vector address. This should protect
3555 // from any edge cases where the stacksave result isn't directly used with
3556 // stackrestore.
3557 SDValue VectorAddress =
3558 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3559 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3560}
3561
3563 SelectionDAG &DAG) const {
3564 SDLoc SL(Op);
3565 assert(Op.getValueType() == MVT::i32);
3566
3567 uint32_t BothRoundHwReg =
3569 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3570
3571 SDValue IntrinID =
3572 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
3573 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
3574 Op.getOperand(0), IntrinID, GetRoundBothImm);
3575
3576 // There are two rounding modes, one for f32 and one for f64/f16. We only
3577 // report in the standard value range if both are the same.
3578 //
3579 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
3580 // ties away from zero is not supported, and the other values are rotated by
3581 // 1.
3582 //
3583 // If the two rounding modes are not the same, report a target defined value.
3584
3585 // Mode register rounding mode fields:
3586 //
3587 // [1:0] Single-precision round mode.
3588 // [3:2] Double/Half-precision round mode.
3589 //
3590 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
3591 //
3592 //                 Hardware   Spec
3593 //   Toward-0         3        0
3594 //   Nearest Even     0        1
3595 //   +Inf             1        2
3596 //   -Inf             2        3
3597 //   NearestAway0    N/A       4
3598 //
3599 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
3600 // table we can index by the raw hardware mode.
3601 //
3602 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
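// Worked example of that lookup (illustration; the concrete table constant is
// defined elsewhere): if MODE.fp_round is 0b0000, both fields are nearest
// even, the shift amount is 0 * 4 = 0, and the extracted nibble should be the
// standard FLT_ROUNDS value 1. Nibbles of 4 or more denote mixed or extended
// modes, which the select at the end reports as TableEntry + 4.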
3603
3604 SDValue BitTable =
3606
3607 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
3608 SDValue RoundModeTimesNumBits =
3609 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
3610
3611 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
3612 // knew only one mode was demanded.
3613 SDValue TableValue =
3614 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
3615 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
3616
3617 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
3618 SDValue TableEntry =
3619 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
3620
3621 // There's a gap between the 4-bit encoded table and the actual enum values, so offset
3622 // if it's an extended value.
3623 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
3624 SDValue IsStandardValue =
3625 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
3626 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
3627 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
3628 TableEntry, EnumOffset);
3629
3630 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
3631}
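// [Illustrative sketch, not part of SIISelLowering.cpp] A scalar model of the
// lookup built above: the raw 4-bit MODE.fp_round value selects a 4-bit entry
// from a 64-bit table, and entries >= 4 (target-defined results) are shifted
// past the gap in the FLT_ROUNDS enumeration. The table contents here are an
// assumption: only the four entries where both round fields agree are filled
// in from the hardware->spec mapping in the comment (0->1, 1->2, 2->3, 3->0);
// every other entry uses a placeholder extended code.
static unsigned decodeFltRoundsSketch(unsigned HwFpRound) {
  uint64_t Table = 0;
  for (unsigned RawMode = 0; RawMode != 16; ++RawMode) {
    uint64_t Entry = 0x4; // Placeholder for a target-defined (extended) value.
    switch (RawMode) {
    case 0x0: Entry = 1; break; // Both fields nearest-even -> FLT_ROUNDS 1.
    case 0x5: Entry = 2; break; // Both fields +infinity    -> FLT_ROUNDS 2.
    case 0xA: Entry = 3; break; // Both fields -infinity    -> FLT_ROUNDS 3.
    case 0xF: Entry = 0; break; // Both fields toward-zero  -> FLT_ROUNDS 0.
    }
    Table |= Entry << (RawMode * 4);
  }
  // (trunc (Table >> (MODE.fp_round * 4))) & 0xf, then offset extended values.
  unsigned Result = unsigned(Table >> (HwFpRound * 4)) & 0xf;
  return Result < 4 ? Result : Result + 4;
}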
3632
3633 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
3634 const MachineFunction &MF) const {
3635 Register Reg = StringSwitch<Register>(RegName)
3636 .Case("m0", AMDGPU::M0)
3637 .Case("exec", AMDGPU::EXEC)
3638 .Case("exec_lo", AMDGPU::EXEC_LO)
3639 .Case("exec_hi", AMDGPU::EXEC_HI)
3640 .Case("flat_scratch", AMDGPU::FLAT_SCR)
3641 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3642 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3643 .Default(Register());
3644
3645 if (Reg == AMDGPU::NoRegister) {
3646 report_fatal_error(Twine("invalid register name \""
3647 + StringRef(RegName) + "\"."));
3648
3649 }
3650
3651 if (!Subtarget->hasFlatScrRegister() &&
3652 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3653 report_fatal_error(Twine("invalid register \""
3654 + StringRef(RegName) + "\" for subtarget."));
3655 }
3656
3657 switch (Reg) {
3658 case AMDGPU::M0:
3659 case AMDGPU::EXEC_LO:
3660 case AMDGPU::EXEC_HI:
3661 case AMDGPU::FLAT_SCR_LO:
3662 case AMDGPU::FLAT_SCR_HI:
3663 if (VT.getSizeInBits() == 32)
3664 return Reg;
3665 break;
3666 case AMDGPU::EXEC:
3667 case AMDGPU::FLAT_SCR:
3668 if (VT.getSizeInBits() == 64)
3669 return Reg;
3670 break;
3671 default:
3672 llvm_unreachable("missing register type checking");
3673 }
3674
3675 report_fatal_error(Twine("invalid type for register \""
3676 + StringRef(RegName) + "\"."));
3677}
3678
3679// If kill is not the last instruction, split the block so kill is always a
3680// proper terminator.
3681 MachineBasicBlock *
3682 SITargetLowering::splitKillBlock(MachineInstr &MI,
3683 MachineBasicBlock *BB) const {
3684 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3685 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3686 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3687 return SplitBB;
3688}
3689
3690// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
3691// \p MI will be the only instruction in the loop body block. Otherwise, it will
3692// be the first instruction in the remainder block.
3693//
3694/// \returns { LoopBody, Remainder }
3695static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3696 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3697 MachineFunction *MF = MBB.getParent();
3698 MachineBasicBlock::iterator I(&MI);
3699
3700 // To insert the loop we need to split the block. Move everything after this
3701 // point to a new block, and insert a new empty block between the two.
3702 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3703 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3704 MachineFunction::iterator MBBI(MBB);
3705 ++MBBI;
3706
3707 MF->insert(MBBI, LoopBB);
3708 MF->insert(MBBI, RemainderBB);
3709
3710 LoopBB->addSuccessor(LoopBB);
3711 LoopBB->addSuccessor(RemainderBB);
3712
3713 // Move the rest of the block into a new block.
3714 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3715
3716 if (InstInLoop) {
3717 auto Next = std::next(I);
3718
3719 // Move instruction to loop body.
3720 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3721
3722 // Move the rest of the block.
3723 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3724 } else {
3725 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3726 }
3727
3728 MBB.addSuccessor(LoopBB);
3729
3730 return std::pair(LoopBB, RemainderBB);
3731}
3732
3733/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3734 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3735 MachineBasicBlock *MBB = MI.getParent();
3736 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3737 auto I = MI.getIterator();
3738 auto E = std::next(I);
3739
3740 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3741 .addImm(0);
3742
3743 MIBundleBuilder Bundler(*MBB, I, E);
3744 finalizeBundle(*MBB, Bundler.begin());
3745}
3746
3747 MachineBasicBlock *
3748 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3749 MachineBasicBlock *BB) const {
3750 const DebugLoc &DL = MI.getDebugLoc();
3751
3752 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3753
3754 MachineBasicBlock *LoopBB;
3755 MachineBasicBlock *RemainderBB;
3756 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3757
3758 // Apparently kill flags are only valid if the def is in the same block?
3759 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3760 Src->setIsKill(false);
3761
3762 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3763
3764 MachineBasicBlock::iterator I = LoopBB->end();
3765
3766 const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3767 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3768
3769 // Clear TRAP_STS.MEM_VIOL
3770 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3771 .addImm(0)
3772 .addImm(EncodedReg);
3773
3774 bundleInstWithWaitcnt(MI);
3775
3776 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3777
3778 // Load and check TRAP_STS.MEM_VIOL
3779 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3780 .addImm(EncodedReg);
3781
3782 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3783 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3784 .addReg(Reg, RegState::Kill)
3785 .addImm(0);
3786 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3787 .addMBB(LoopBB);
3788
3789 return RemainderBB;
3790}
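// [Illustrative sketch, not part of SIISelLowering.cpp] The loop constructed
// above amounts to the following retry structure. ClearMemViol,
// IssueGWSAndWait and ReadMemViol are hypothetical stand-ins for the
// S_SETREG_IMM32_B32, the bundled GWS op + S_WAITCNT 0, and the
// S_GETREG_B32/S_CMP_LG_U32/S_CBRANCH_SCC1 sequence emitted above.
static void gwsMemViolRetrySketch(void (*ClearMemViol)(),
                                  void (*IssueGWSAndWait)(),
                                  unsigned (*ReadMemViol)()) {
  do {
    ClearMemViol();    // Clear TRAP_STS.MEM_VIOL.
    IssueGWSAndWait(); // Run the GWS operation and wait for it to complete.
  } while (ReadMemViol() != 0); // Retry while a memory violation was recorded.
}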
3791
3792// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3793// wavefront. If the value is uniform and just happens to be in a VGPR, this
3794// will only do one iteration. In the worst case, this will loop 64 times.
3795//
3796// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3797 static MachineBasicBlock::iterator
3798 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
3799 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3800 const DebugLoc &DL, const MachineOperand &Idx,
3801 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3802 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3803 Register &SGPRIdxReg) {
3804
3805 MachineFunction *MF = OrigBB.getParent();
3806 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3807 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3808 MachineBasicBlock::iterator I = LoopBB.begin();
3809
3810 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3811 Register PhiExec = MRI.createVirtualRegister(BoolRC);
3812 Register NewExec = MRI.createVirtualRegister(BoolRC);
3813 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3814 Register CondReg = MRI.createVirtualRegister(BoolRC);
3815
3816 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3817 .addReg(InitReg)
3818 .addMBB(&OrigBB)
3819 .addReg(ResultReg)
3820 .addMBB(&LoopBB);
3821
3822 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3823 .addReg(InitSaveExecReg)
3824 .addMBB(&OrigBB)
3825 .addReg(NewExec)
3826 .addMBB(&LoopBB);
3827
3828 // Read the next variant <- also loop target.
3829 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3830 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3831
3832 // Compare the just read M0 value to all possible Idx values.
3833 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3834 .addReg(CurrentIdxReg)
3835 .addReg(Idx.getReg(), 0, Idx.getSubReg());
3836
3837 // Update EXEC, save the original EXEC value to VCC.
3838 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3839 : AMDGPU::S_AND_SAVEEXEC_B64),
3840 NewExec)
3841 .addReg(CondReg, RegState::Kill);
3842
3843 MRI.setSimpleHint(NewExec, CondReg);
3844
3845 if (UseGPRIdxMode) {
3846 if (Offset == 0) {
3847 SGPRIdxReg = CurrentIdxReg;
3848 } else {
3849 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3850 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3851 .addReg(CurrentIdxReg, RegState::Kill)
3852 .addImm(Offset);
3853 }
3854 } else {
3855 // Move index from VCC into M0
3856 if (Offset == 0) {
3857 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3858 .addReg(CurrentIdxReg, RegState::Kill);
3859 } else {
3860 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3861 .addReg(CurrentIdxReg, RegState::Kill)
3862 .addImm(Offset);
3863 }
3864 }
3865
3866 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3867 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3868 MachineInstr *InsertPt =
3869 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3870 : AMDGPU::S_XOR_B64_term), Exec)
3871 .addReg(Exec)
3872 .addReg(NewExec);
3873
3874 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3875 // s_cbranch_scc0?
3876
3877 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3878 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3879 .addMBB(&LoopBB);
3880
3881 return InsertPt->getIterator();
3882}
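// [Illustrative sketch, not part of SIISelLowering.cpp] A scalar model of the
// waterfall loop built above: each iteration takes the index of the first
// still-active lane (V_READFIRSTLANE_B32), handles every active lane that
// shares that index under a restricted exec mask (V_CMP_EQ_U32 +
// S_AND_SAVEEXEC), and then removes those lanes from the mask (S_XOR) until
// no lanes remain. ProcessLanes is a hypothetical callback standing in for
// the indexed access performed while the mask is restricted.
static void waterfallLoopSketch(const int LaneIdx[], unsigned NumLanes,
                                uint64_t ExecMask,
                                void (*ProcessLanes)(uint64_t Mask, int Idx)) {
  while (ExecMask != 0) {
    // readfirstlane: index value held by the lowest still-active lane.
    unsigned FirstLane = 0;
    while (((ExecMask >> FirstLane) & 1) == 0)
      ++FirstLane;
    int CurrentIdx = LaneIdx[FirstLane];

    // v_cmp_eq + s_and_saveexec: all active lanes using the same index.
    uint64_t SameIdxMask = 0;
    for (unsigned L = 0; L != NumLanes; ++L)
      if (((ExecMask >> L) & 1) && LaneIdx[L] == CurrentIdx)
        SameIdxMask |= uint64_t(1) << L;

    ProcessLanes(SameIdxMask, CurrentIdx); // Indexed move for these lanes.

    ExecMask &= ~SameIdxMask; // s_xor: retire the lanes just handled.
  }
}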
3883
3884// This has slightly sub-optimal regalloc when the source vector is killed by
3885// the read. The register allocator does not understand that the kill is
3886 // per-workitem, so the source register is kept alive for the whole loop and we
3887 // end up not re-using a subregister from it, using 1 more VGPR than necessary.
3888 // This VGPR was saved back when this was expanded after register allocation.
3889 static MachineBasicBlock::iterator
3890 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
3891 unsigned InitResultReg, unsigned PhiReg, int Offset,
3892 bool UseGPRIdxMode, Register &SGPRIdxReg) {
3893 MachineFunction *MF = MBB.getParent();
3894 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3895 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3896 MachineRegisterInfo &MRI = MF->getRegInfo();
3897 const DebugLoc &DL = MI.getDebugLoc();
3898 MachineBasicBlock::iterator I(&MI);
3899
3900 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3901 Register DstReg = MI.getOperand(0).getReg();
3902 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3903 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3904 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3905 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3906
3907 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3908
3909 // Save the EXEC mask
3910 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3911 .addReg(Exec);
3912
3913 MachineBasicBlock *LoopBB;
3914 MachineBasicBlock *RemainderBB;
3915 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3916
3917 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3918
3919 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3920 InitResultReg, DstReg, PhiReg, TmpExec,
3921 Offset, UseGPRIdxMode, SGPRIdxReg);
3922
3923 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3924 MachineFunction::iterator MBBI(LoopBB);
3925 ++MBBI;
3926 MF->insert(MBBI, LandingPad);
3927 LoopBB->removeSuccessor(RemainderBB);
3928 LandingPad->addSuccessor(RemainderBB);
3929 LoopBB->addSuccessor(LandingPad);
3930 MachineBasicBlock::iterator First = LandingPad->begin();
3931 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3932 .addReg(SaveExec);
3933
3934 return InsPt;
3935}
3936
3937// Returns subreg index, offset
3938static std::pair<unsigned, int>
3939 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3940 const TargetRegisterClass *SuperRC,
3941 unsigned VecReg,
3942 int Offset) {
3943 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3944
3945 // Skip out of bounds offsets, or else we would end up using an undefined
3946 // register.
3947 if (Offset >= NumElts || Offset < 0)
3948 return std::pair(AMDGPU::sub0, Offset);
3949
3950 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3951}
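// [Illustrative sketch, not part of SIISelLowering.cpp] The mapping above in
// terms of 32-bit channels: an in-range constant offset is folded into the
// subregister index, while an out-of-range (or negative) offset is left as a
// runtime offset for the dynamic-index path. For a 128-bit (4 x 32-bit)
// vector: Offset 2 -> (channel 2, 0); Offset 5 -> (channel 0, 5);
// Offset -1 -> (channel 0, -1).
static std::pair<unsigned, int> indirectChannelSketch(int NumElts, int Offset) {
  if (Offset >= NumElts || Offset < 0)
    return {0u, Offset};        // Channel 0; offset handled at runtime.
  return {unsigned(Offset), 0}; // Channel == offset; no residual offset.
}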
3952
3953 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3954 MachineRegisterInfo &MRI, MachineInstr &MI,
3955 int Offset) {
3956 MachineBasicBlock *MBB = MI.getParent();
3957 const DebugLoc &DL = MI.getDebugLoc();
3958 MachineBasicBlock::iterator I(&MI);
3959
3960 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3961
3962 assert(Idx->getReg() != AMDGPU::NoRegister);
3963
3964 if (Offset == 0) {
3965 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3966 } else {
3967 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3968 .add(*Idx)
3969 .addImm(Offset);
3970 }
3971}
3972
3973 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
3974 MachineRegisterInfo &MRI, MachineInstr &MI,
3975 int Offset) {
3976 MachineBasicBlock *MBB = MI.getParent();
3977 const DebugLoc &DL = MI.getDebugLoc();
3978 MachineBasicBlock::iterator I(&MI);
3979
3980 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3981
3982 if (Offset == 0)
3983 return Idx->getReg();
3984
3985 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3986 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3987 .add(*Idx)
3988 .addImm(Offset);
3989 return Tmp;
3990}
3991
3992 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3993 MachineBasicBlock &MBB,
3994 const GCNSubtarget &ST) {
3995 const SIInstrInfo *TII = ST.getInstrInfo();
3996 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3997 MachineFunction *MF = MBB.getParent();
3998 MachineRegisterInfo &MRI = MF->getRegInfo();
3999
4000 Register Dst = MI.getOperand(0).getReg();
4001 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4002 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4003 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4004
4005 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4006 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4007
4008 unsigned SubReg;
4009 std::tie(SubReg, Offset)
4010 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4011
4012 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4013
4014 // Check for a SGPR index.
4015 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4016 MachineBasicBlock::iterator I(&MI);
4017 const DebugLoc &DL = MI.getDebugLoc();
4018
4019 if (UseGPRIdxMode) {
4020 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4021 // to avoid interfering with other uses, so probably requires a new
4022 // optimization pass.
4023 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4024
4025 const MCInstrDesc &GPRIDXDesc =
4026 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4027 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4028 .addReg(SrcReg)
4029 .addReg(Idx)
4030 .addImm(SubReg);
4031 } else {
4032 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4033
4034 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4035 .addReg(SrcReg, 0, SubReg)
4036 .addReg(SrcReg, RegState::Implicit);
4037 }
4038
4039 MI.eraseFromParent();
4040
4041 return &MBB;
4042 }
4043
4044 // Control flow needs to be inserted if indexing with a VGPR.
4045 const DebugLoc &DL = MI.getDebugLoc();
4046 MachineBasicBlock::iterator I(&MI);
4047
4048 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4049 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4050
4051 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4052
4053 Register SGPRIdxReg;
4054 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4055 UseGPRIdxMode, SGPRIdxReg);
4056
4057 MachineBasicBlock *LoopBB = InsPt->getParent();
4058
4059 if (UseGPRIdxMode) {
4060 const MCInstrDesc &GPRIDXDesc =
4061 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4062
4063 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4064 .addReg(SrcReg)
4065 .addReg(SGPRIdxReg)
4066 .addImm(SubReg);
4067 } else {
4068 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4069 .addReg(SrcReg, 0, SubReg)
4070 .addReg(SrcReg, RegState::Implicit);
4071 }
4072
4073 MI.eraseFromParent();
4074
4075 return LoopBB;
4076}
4077
4078 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4079 MachineBasicBlock &MBB,
4080 const GCNSubtarget &ST) {
4081 const SIInstrInfo *TII = ST.getInstrInfo();
4082 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4083 MachineFunction *MF = MBB.getParent();
4084 MachineRegisterInfo &MRI = MF->getRegInfo();
4085
4086 Register Dst = MI.getOperand(0).getReg();
4087 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4088 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4089 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4090 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4091 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4092 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4093
4094 // This can be an immediate, but will be folded later.
4095 assert(Val->getReg());
4096
4097 unsigned SubReg;
4098 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4099 SrcVec->getReg(),
4100 Offset);
4101 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4102
4103 if (Idx->getReg() == AMDGPU::NoRegister) {
4104 MachineBasicBlock::iterator I(&MI);
4105 const DebugLoc &DL = MI.getDebugLoc();
4106
4107 assert(Offset == 0);
4108
4109 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4110 .add(*SrcVec)
4111 .add(*Val)
4112 .addImm(SubReg);
4113
4114 MI.eraseFromParent();
4115 return &MBB;
4116 }
4117
4118 // Check for a SGPR index.
4119 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4120 MachineBasicBlock::iterator I(&MI);
4121 const DebugLoc &DL = MI.getDebugLoc();
4122
4123 if (UseGPRIdxMode) {
4124 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4125
4126 const MCInstrDesc &GPRIDXDesc =
4127 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4128 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4129 .addReg(SrcVec->getReg())
4130 .add(*Val)
4131 .addReg(Idx)
4132 .addImm(SubReg);
4133 } else {
4134 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4135
4136 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4137 TRI.getRegSizeInBits(*VecRC), 32, false);
4138 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4139 .addReg(SrcVec->getReg())
4140 .add(*Val)
4141 .addImm(SubReg);
4142 }
4143 MI.eraseFromParent();
4144 return &MBB;
4145 }
4146
4147 // Control flow needs to be inserted if indexing with a VGPR.
4148 if (Val->isReg())
4149 MRI.clearKillFlags(Val->getReg());
4150
4151 const DebugLoc &DL = MI.getDebugLoc();
4152
4153 Register PhiReg = MRI.createVirtualRegister(VecRC);
4154
4155 Register SGPRIdxReg;
4156 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4157 UseGPRIdxMode, SGPRIdxReg);
4158 MachineBasicBlock *LoopBB = InsPt->getParent();
4159
4160 if (UseGPRIdxMode) {
4161 const MCInstrDesc &GPRIDXDesc =
4162 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4163
4164 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4165 .addReg(PhiReg)
4166 .add(*Val)
4167 .addReg(SGPRIdxReg)
4168 .addImm(AMDGPU::sub0);
4169 } else {
4170 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4171 TRI.getRegSizeInBits(*VecRC), 32, false);
4172 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4173 .addReg(PhiReg)
4174 .add(*Val)
4175 .addImm(AMDGPU::sub0);
4176 }
4177
4178 MI.eraseFromParent();
4179 return LoopBB;
4180}
4181
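// [Illustrative sketch, not part of SIISelLowering.cpp] A scalar model of the
// iterative wave reduction constructed below with S_FF1_I32 / V_READLANE_B32 /
// S_BITSET0: starting from a copy of EXEC, each iteration picks the lowest
// still-set lane, folds that lane's value into the accumulator, and clears the
// lane bit until no active lanes remain. ReduceOp and Identity are assumptions
// standing in for the opcode operand and its neutral element (e.g. UINT32_MAX
// for an unsigned min).
static uint32_t waveReduceSketch(const uint32_t LaneValue[], uint64_t ExecMask,
                                 uint32_t Identity,
                                 uint32_t (*ReduceOp)(uint32_t, uint32_t)) {
  uint32_t Accumulator = Identity;
  while (ExecMask != 0) {
    // s_ff1: position of the lowest set bit in the live-lane mask.
    unsigned Lane = 0;
    while (((ExecMask >> Lane) & 1) == 0)
      ++Lane;
    // v_readlane + the reduction opcode.
    Accumulator = ReduceOp(Accumulator, LaneValue[Lane]);
    // s_bitset0: retire this lane and move on to the next active one.
    ExecMask &= ~(uint64_t(1) << Lane);
  }
  return Accumulator;
}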
4182 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4183 MachineBasicBlock &BB,
4184 const GCNSubtarget &ST,
4185 unsigned Opc) {
4186 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4187 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4188 const DebugLoc &DL = MI.getDebugLoc();
4189 const SIInstrInfo *TII = ST.getInstrInfo();
4190
4191 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4192 Register SrcReg = MI.getOperand(1).getReg();
4193 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4194 Register DstReg = MI.getOperand(0).getReg();
4195 MachineBasicBlock *RetBB = nullptr;
4196 if (isSGPR) {
4197 // These operations with a uniform value (i.e. an SGPR) are idempotent:
4198 // the reduced value will be the same as the given SGPR.
4199 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4200 RetBB = &BB;
4201 } else {
4202 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4203 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4204 // iterative approach by default.
4205
4206 // To reduce the VGPR using the iterative approach, we need to iterate
4207 // over all the active lanes. Lowering consists of ComputeLoop,
4208 // which iterates over only the active lanes. We use a copy of the EXEC
4209 // register as the induction variable, and every active lane clears its bit
4210 // with bitset0 so that we will get the next active lane for the next iteration.
4211 MachineBasicBlock::iterator I = BB.end();
4212 Register SrcReg = MI.getOperand(1).getReg();
4213
4214 // Create control flow for the loop:
4215 // split MI's machine basic block into a for-loop structure.
4216 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4217
4218 // Create virtual registers required for lowering.
4219 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4220 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4221 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4222 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4223
4224 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4225 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4226 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4227
4228 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4229 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4230
4231 bool IsWave32 = ST.isWave32();
4232 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4233 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4234
4235 // Create the initial values of the induction variable (from EXEC) and the
4236 // accumulator, and insert a branch to the newly created ComputeLoop block.
4237 uint32_t InitalValue =
4238 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4239 auto TmpSReg =
4240 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4241 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4242 .addImm(InitalValue);
4243 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4244
4245 // Start constructing ComputeLoop
4246 I = ComputeLoop->end();
4247 auto Accumulator =
4248 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4249 .addReg(InitalValReg)
4250 .addMBB(&BB);
4251 auto ActiveBits =
4252 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4253 .addReg(TmpSReg->getOperand(0).getReg())
4254 .addMBB(&BB);
4255
4256 // Perform the computations
4257 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4258 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4259 .addReg(ActiveBits->getOperand(0).getReg());
4260 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4261 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4262 .addReg(SrcReg)
4263 .addReg(FF1->getOperand(0).getReg());
4264 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4265 .addReg(Accumulator->getOperand(0).getReg())
4266 .addReg(LaneValue->getOperand(0).getReg());
4267
4268 // Manipulate the iterator to get the next active lane
4269 unsigned BITSETOpc =
4270 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4271 auto NewActiveBits =
4272 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4273 .addReg(FF1->getOperand(0).getReg())
4274 .addReg(ActiveBits->getOperand(0).getReg());
4275
4276 // Add phi nodes
4277 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4278 .addMBB(ComputeLoop);
4279 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4280 .addMBB(ComputeLoop);
4281
4282 // Create the branch back to ComputeLoop.
4283 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4284 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4285 .addReg(NewActiveBits->getOperand(0).getReg())
4286 .addImm(0);
4287 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4288 .addMBB(ComputeLoop);
4289
4290