1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
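  // Note: every legal MVT is associated with a default register class below.
  // Whether a particular value ultimately lives in an SGPR or a VGPR is
  // decided later, during instruction selection, based on divergence.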
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
 162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
 218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
228 // TODO: Could make these legal
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237
239 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
240 }
241
242 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
243 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
248 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
253 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
258
259 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
260 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
261 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
264 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
265 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
266
267 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
268
272 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
273
274 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
275
277 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
278
280 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
281 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
282
284 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
285 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
286 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 Expand);
289 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
290 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
291 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
292 Expand);
293
295 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
296 MVT::v3i16, MVT::v4i16, MVT::Other},
297 Custom);
298
301 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
302
304
306
308 Expand);
309
310#if 0
312#endif
313
314 // We only support LOAD/STORE and vector manipulation ops for vectors
315 // with > 4 elements.
316 for (MVT VT :
317 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
318 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
319 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
320 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
321 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
322 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
323 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
324 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
325 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
326 switch (Op) {
327 case ISD::LOAD:
328 case ISD::STORE:
330 case ISD::BITCAST:
331 case ISD::UNDEF:
335 case ISD::IS_FPCLASS:
336 break;
341 break;
342 default:
344 break;
345 }
346 }
347 }
348
350
351 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
352 // is expanded to avoid having two separate loops in case the index is a VGPR.
353
354 // Most operations are naturally 32-bit vector operations. We only support
355 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
356 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
358 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
368 }
369
370 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
382 }
383
384 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
396 }
397
398 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
410 }
411
412 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
424 }
425
427 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
428 Expand);
429
430 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
431 Custom);
432
433 // Avoid stack access for these.
434 // TODO: Generalize to more vector types.
436 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
437 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
438 Custom);
439
440 // Deal with vec3 vector operations when widened to vec4.
442 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
443
444 // Deal with vec5/6/7 vector operations when widened to vec8.
446 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
452 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
453 // and output demarshalling
454 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
455
 456 // We can't return success/failure, only the old value;
 457 // let LLVM add the comparison.
459 Expand);
460
461 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
462
463 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
464
465 // FIXME: This should be narrowed to i32, but that only happens if i64 is
466 // illegal.
467 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
468 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
469
 470 // On SI this is s_memtime; on VI it is s_memrealtime.
472
473 if (Subtarget->hasSMemRealTime() ||
477
478 if (Subtarget->has16BitInsts()) {
481 } else {
483 }
484
485 if (Subtarget->hasMadMacF32Insts())
487
488 if (!Subtarget->hasBFI())
489 // fcopysign can be done in a single instruction with BFI.
490 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
491
492 if (!Subtarget->hasBCNT(32))
494
495 if (!Subtarget->hasBCNT(64))
497
498 if (Subtarget->hasFFBH())
500
501 if (Subtarget->hasFFBL())
503
504 // We only really have 32-bit BFE instructions (and 16-bit on VI).
505 //
506 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
507 // effort to match them now. We want this to be false for i64 cases when the
508 // extraction isn't restricted to the upper or lower half. Ideally we would
509 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
510 // span the midpoint are probably relatively rare, so don't worry about them
511 // for now.
512 if (Subtarget->hasBFE())
514
515 // Clamp modifier on add/sub
516 if (Subtarget->hasIntClamp())
518
519 if (Subtarget->hasAddNoCarry())
520 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
521 Legal);
522
523 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
524 Custom);
525
526 // These are really only legal for ieee_mode functions. We should be avoiding
527 // them for functions that don't have ieee_mode enabled, so just say they are
528 // legal.
530 {MVT::f32, MVT::f64}, Legal);
531
532 if (Subtarget->haveRoundOpsF64())
534 Legal);
535 else
537 MVT::f64, Custom);
538
540 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
541 Legal);
542 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
543
546
547 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
548 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
549
550 // Custom lower these because we can't specify a rule based on an illegal
551 // source bf16.
554
555 if (Subtarget->has16BitInsts()) {
558 MVT::i16, Legal);
559
560 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
561
563 MVT::i16, Expand);
564
568 ISD::CTPOP},
569 MVT::i16, Promote);
570
572
573 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
574
576 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
578 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
579
583
585
586 // F16 - Constant Actions.
589
590 // F16 - Load/Store Actions.
592 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
594 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
595
596 // BF16 - Load/Store Actions.
598 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
600 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
601
602 // F16 - VOP1 Actions.
605 MVT::f16, Custom);
606
609
610 // F16 - VOP2 Actions.
611 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
612 Expand);
616
617 // F16 - VOP3 Actions.
619 if (STI.hasMadF16())
621
622 for (MVT VT :
623 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
624 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
625 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
626 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
627 switch (Op) {
628 case ISD::LOAD:
629 case ISD::STORE:
631 case ISD::BITCAST:
632 case ISD::UNDEF:
638 case ISD::IS_FPCLASS:
639 break;
642 break;
643 default:
645 break;
646 }
647 }
648 }
649
650 // v_perm_b32 can handle either of these.
651 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
653
654 // XXX - Do these do anything? Vector constants turn into build_vector.
655 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
656
657 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
658 Legal);
659
661 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
664
666 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
668 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
669
670 setOperationAction(ISD::AND, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
672 setOperationAction(ISD::OR, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
675 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
676
678 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
680 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
681 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
682 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
683
685 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
687 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
689 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
690
692 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
694 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
695 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
701 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
702
704 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
706 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
708 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
709
710 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
712 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
714 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
720 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
721 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
723
724 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
726 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
728 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
730
732 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
734 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
735 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
737
739 MVT::v2i32, Expand);
741
743 MVT::v4i32, Expand);
744
746 MVT::v8i32, Expand);
747
748 if (!Subtarget->hasVOP3PInsts())
750 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
751
752 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
753 // This isn't really legal, but this avoids the legalizer unrolling it (and
754 // allows matching fneg (fabs x) patterns)
755 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
756
759
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Custom);
763
765 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
766 Expand);
767
768 for (MVT Vec16 :
769 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
770 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
773 Vec16, Custom);
775 }
776 }
777
778 if (Subtarget->hasVOP3PInsts()) {
782 MVT::v2i16, Legal);
783
786 MVT::v2f16, Legal);
787
788 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
789 Custom);
790
792 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
793 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
794 Custom);
795
796 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
797 // Split vector operations.
802 VT, Custom);
803
804 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
805 // Split vector operations.
807 VT, Custom);
808
809 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
810 Custom);
811
812 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
813 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
814 Custom);
815
816 if (Subtarget->hasPackedFP32Ops()) {
818 MVT::v2f32, Legal);
820 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
821 Custom);
822 }
823 }
824
826
827 if (Subtarget->has16BitInsts()) {
829 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
831 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
832 } else {
833 // Legalization hack.
834 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
835
837 }
838
840 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
841 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
842 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
843 MVT::v32f16, MVT::v32bf16},
844 Custom);
845
847
848 if (Subtarget->hasScalarSMulU64())
850
851 if (Subtarget->hasMad64_32())
853
854 if (Subtarget->hasPrefetch())
856
857 if (Subtarget->hasIEEEMinMax())
859 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
868 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
869 MVT::i16, MVT::i8, MVT::i128},
870 Custom);
871
873 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
874 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
875 MVT::i8, MVT::i128},
876 Custom);
877
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
886
888
891 ISD::SUB,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
924
925 // All memory operations. Some folding on the pointer operand is done to help
926 // matching the constant offsets in the addressing modes.
949
950 // FIXME: In other contexts we pretend this is a per-function property.
952
954}
955
957 return Subtarget;
958}
959
960//===----------------------------------------------------------------------===//
961// TargetLowering queries
962//===----------------------------------------------------------------------===//
963
964// v_mad_mix* support a conversion from f16 to f32.
965//
966// The only special case we do not currently handle is when denormals are
967// enabled, where this would still be OK to use.
968bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
969 EVT DestVT, EVT SrcVT) const {
970 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
971 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
972 DestVT.getScalarType() == MVT::f32 &&
973 SrcVT.getScalarType() == MVT::f16 &&
974 // TODO: This probably only requires no input flushing?
976}
977
979 LLT DestTy, LLT SrcTy) const {
980 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
982 DestTy.getScalarSizeInBits() == 32 &&
983 SrcTy.getScalarSizeInBits() == 16 &&
984 // TODO: This probably only requires no input flushing?
986}
987
989 // SI has some legal vector types, but no legal vector operations. Say no
990 // shuffles are legal in order to prefer scalarizing some vector operations.
991 return false;
992}
993
996 EVT VT) const {
999
1000 if (VT.isVector()) {
1001 EVT ScalarVT = VT.getScalarType();
1002 unsigned Size = ScalarVT.getSizeInBits();
1003 if (Size == 16) {
1004 if (Subtarget->has16BitInsts()) {
1005 if (VT.isInteger())
1006 return MVT::v2i16;
1007 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1008 }
1009 return VT.isInteger() ? MVT::i32 : MVT::f32;
1010 }
1011
1012 if (Size < 16)
1013 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1014 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1015 }
1016
1017 if (VT.getSizeInBits() > 32)
1018 return MVT::i32;
1019
1021}
1022
1025 EVT VT) const {
1028
1029 if (VT.isVector()) {
1030 unsigned NumElts = VT.getVectorNumElements();
1031 EVT ScalarVT = VT.getScalarType();
1032 unsigned Size = ScalarVT.getSizeInBits();
1033
1034 // FIXME: Should probably promote 8-bit vectors to i16.
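    // 16-bit elements are packed two per 32-bit register, so e.g. a v3f16
    // argument takes (3 + 1) / 2 = 2 registers.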
1035 if (Size == 16 && Subtarget->has16BitInsts())
1036 return (NumElts + 1) / 2;
1037
1038 if (Size <= 32)
1039 return NumElts;
1040
1041 if (Size > 32)
1042 return NumElts * ((Size + 31) / 32);
1043 } else if (VT.getSizeInBits() > 32)
1044 return (VT.getSizeInBits() + 31) / 32;
1045
1047}
1048
1050 LLVMContext &Context, CallingConv::ID CC,
1051 EVT VT, EVT &IntermediateVT,
1052 unsigned &NumIntermediates, MVT &RegisterVT) const {
1053 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1054 unsigned NumElts = VT.getVectorNumElements();
1055 EVT ScalarVT = VT.getScalarType();
1056 unsigned Size = ScalarVT.getSizeInBits();
1057 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1058 // support, but unless we can properly handle 3-vectors, it will still be
1059 // inconsistent.
1060 if (Size == 16 && Subtarget->has16BitInsts()) {
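      // bf16 vectors are split into v2bf16 pieces, each carried in an i32
      // register; other 16-bit element types use v2i16/v2f16 directly.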
1061 if (ScalarVT == MVT::bf16) {
1062 RegisterVT = MVT::i32;
1063 IntermediateVT = MVT::v2bf16;
1064 } else {
1065 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1066 IntermediateVT = RegisterVT;
1067 }
1068 NumIntermediates = (NumElts + 1) / 2;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size == 32) {
1073 RegisterVT = ScalarVT.getSimpleVT();
1074 IntermediateVT = RegisterVT;
1075 NumIntermediates = NumElts;
1076 return NumIntermediates;
1077 }
1078
1079 if (Size < 16 && Subtarget->has16BitInsts()) {
1080 // FIXME: Should probably form v2i16 pieces
1081 RegisterVT = MVT::i16;
1082 IntermediateVT = ScalarVT;
1083 NumIntermediates = NumElts;
1084 return NumIntermediates;
1085 }
1086
1087
1088 if (Size != 16 && Size <= 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = ScalarVT;
1091 NumIntermediates = NumElts;
1092 return NumIntermediates;
1093 }
1094
1095 if (Size > 32) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = RegisterVT;
1098 NumIntermediates = NumElts * ((Size + 31) / 32);
1099 return NumIntermediates;
1100 }
1101 }
1102
1104 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1105}
1106
1107static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1108 assert(MaxNumLanes != 0);
1109
1110 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1111 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1112 return EVT::getVectorVT(Ty->getContext(),
1113 EVT::getEVT(VT->getElementType()),
1114 NumElts);
1115 }
1116
1117 return EVT::getEVT(Ty);
1118}
1119
1120// Peek through TFE struct returns to only use the data size.
1121static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1122 auto *ST = dyn_cast<StructType>(Ty);
1123 if (!ST)
1124 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1125
1126 // TFE intrinsics return an aggregate type.
1127 assert(ST->getNumContainedTypes() == 2 &&
1128 ST->getContainedType(1)->isIntegerTy(32));
1129 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1130}
1131
1132/// Map address space 7 to MVT::v5i32 because that's its in-memory
1133/// representation. This return value is vector-typed because there is no
1134/// MVT::i160 and it is not clear if one can be added. While this could
1135/// cause issues during codegen, these address space 7 pointers will be
1136/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1137/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1138/// modeling, to work.
1140 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1141 return MVT::v5i32;
1143 DL.getPointerSizeInBits(AS) == 192)
1144 return MVT::v6i32;
1146}
1147/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1148/// v8i32 when padding is added.
1149/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1150/// also v8i32 with padding.
1152 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1153 DL.getPointerSizeInBits(AS) == 160) ||
1155 DL.getPointerSizeInBits(AS) == 192))
1156 return MVT::v8i32;
1158}
1159
1161 const CallInst &CI,
1162 MachineFunction &MF,
1163 unsigned IntrID) const {
1165 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1167
1168 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1171 (Intrinsic::ID)IntrID);
1172 MemoryEffects ME = Attr.getMemoryEffects();
1173 if (ME.doesNotAccessMemory())
1174 return false;
1175
1176 // TODO: Should images get their own address space?
1177 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1178
1179 if (RsrcIntr->IsImage)
1180 Info.align.reset();
1181
1182 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1183 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1184 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1185 // We conservatively set the memory operand of a buffer intrinsic to the
1186 // base resource pointer, so that we can access alias information about
1187 // those pointers. Cases like "this points at the same value
1188 // but with a different offset" are handled in
1189 // areMemAccessesTriviallyDisjoint.
1190 Info.ptrVal = RsrcArg;
1191 }
1192
1193 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1194 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1197 if (ME.onlyReadsMemory()) {
1198 unsigned MaxNumLanes = 4;
1199
1200 if (RsrcIntr->IsImage) {
1203 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1205
1206 if (!BaseOpcode->Gather4) {
1207 // If this isn't a gather, we may have excess loaded elements in the
1208 // IR type. Check the dmask for the real number of elements loaded.
1209 unsigned DMask
1210 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1211 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1212 }
1213 }
1214
1215 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1216
1217 // FIXME: What does alignment mean for an image?
1220 } else if (ME.onlyWritesMemory()) {
1222
1223 Type *DataTy = CI.getArgOperand(0)->getType();
1224 if (RsrcIntr->IsImage) {
1225 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1226 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1228 } else
1229 Info.memVT = EVT::getEVT(DataTy);
1230
1232 } else {
1233 // Atomic
1234 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1236 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1240
1241 switch (IntrID) {
1242 default:
1243 // XXX - Should this be volatile without known ordering?
1245 break;
1246 case Intrinsic::amdgcn_raw_buffer_load_lds:
1247 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1248 case Intrinsic::amdgcn_struct_buffer_load_lds:
1249 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1250 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1251 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1252 Info.ptrVal = CI.getArgOperand(1);
1253 return true;
1254 }
1255 }
1256 }
1257 return true;
1258 }
1259
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_ds_ordered_add:
1262 case Intrinsic::amdgcn_ds_ordered_swap:
1263 case Intrinsic::amdgcn_ds_fadd:
1264 case Intrinsic::amdgcn_ds_fmin:
1265 case Intrinsic::amdgcn_ds_fmax: {
1267 Info.memVT = MVT::getVT(CI.getType());
1268 Info.ptrVal = CI.getOperand(0);
1269 Info.align.reset();
1271
1272 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1273 if (!Vol->isZero())
1275
1276 return true;
1277 }
1278 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1280 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1281 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1282 Info.align.reset();
1284
1285 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1286 if (!Vol || !Vol->isZero())
1288
1289 return true;
1290 }
1291 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1292 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1294 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1295 Info.ptrVal = nullptr;
1296 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1298 return true;
1299 }
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1303 Info.memVT = MVT::getVT(CI.getType());
1304 Info.ptrVal = CI.getOperand(0);
1305 Info.align.reset();
1307
1308 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1309 if (!Vol->isZero())
1311
1312 return true;
1313 }
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1316 Info.memVT = MVT::getVT(CI.getType());
1317 Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1322 return true;
1323 }
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1326 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1327
1328 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1329 Info.align.reset();
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1349 Info.memVT = MVT::getVT(CI.getType());
1350 Info.ptrVal = CI.getOperand(0);
1351 Info.align.reset();
1356 return true;
1357 }
1358 case Intrinsic::amdgcn_global_load_tr: {
1360 Info.memVT = MVT::getVT(CI.getType());
1361 Info.ptrVal = CI.getOperand(0);
1362 Info.align.reset();
1364 return true;
1365 }
1366 case Intrinsic::amdgcn_ds_gws_init:
1367 case Intrinsic::amdgcn_ds_gws_barrier:
1368 case Intrinsic::amdgcn_ds_gws_sema_v:
1369 case Intrinsic::amdgcn_ds_gws_sema_br:
1370 case Intrinsic::amdgcn_ds_gws_sema_p:
1371 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1373
1374 const GCNTargetMachine &TM =
1375 static_cast<const GCNTargetMachine &>(getTargetMachine());
1376
1378 Info.ptrVal = MFI->getGWSPSV(TM);
1379
1380 // This is an abstract access, but we need to specify a type and size.
1381 Info.memVT = MVT::i32;
1382 Info.size = 4;
1383 Info.align = Align(4);
1384
1385 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1387 else
1389 return true;
1390 }
1391 case Intrinsic::amdgcn_global_load_lds: {
1393 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1394 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1395 Info.ptrVal = CI.getArgOperand(1);
1397 return true;
1398 }
1399 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1401
1402 const GCNTargetMachine &TM =
1403 static_cast<const GCNTargetMachine &>(getTargetMachine());
1404
1406 Info.ptrVal = MFI->getGWSPSV(TM);
1407
1408 // This is an abstract access, but we need to specify a type and size.
1409 Info.memVT = MVT::i32;
1410 Info.size = 4;
1411 Info.align = Align(4);
1412
1414 return true;
1415 }
1416 default:
1417 return false;
1418 }
1419}
1420
1422 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1423 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1424 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1425 // The DAG's ValueType loses the addrspaces.
1426 // Add them as 2 extra Constant operands "from" and "to".
1427 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1428 unsigned DstAS = I.getType()->getPointerAddressSpace();
1429 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1430 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1431 break;
1432 }
1433 default:
1434 break;
1435 }
1436}
1437
1440 Type *&AccessTy) const {
1441 Value *Ptr = nullptr;
1442 switch (II->getIntrinsicID()) {
1443 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1444 case Intrinsic::amdgcn_ds_append:
1445 case Intrinsic::amdgcn_ds_consume:
1446 case Intrinsic::amdgcn_ds_fadd:
1447 case Intrinsic::amdgcn_ds_fmax:
1448 case Intrinsic::amdgcn_ds_fmin:
1449 case Intrinsic::amdgcn_ds_ordered_add:
1450 case Intrinsic::amdgcn_ds_ordered_swap:
1451 case Intrinsic::amdgcn_flat_atomic_fadd:
1452 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1453 case Intrinsic::amdgcn_flat_atomic_fmax:
1454 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1455 case Intrinsic::amdgcn_flat_atomic_fmin:
1456 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1457 case Intrinsic::amdgcn_global_atomic_csub:
1458 case Intrinsic::amdgcn_global_atomic_fadd:
1459 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1460 case Intrinsic::amdgcn_global_atomic_fmax:
1461 case Intrinsic::amdgcn_global_atomic_fmax_num:
1462 case Intrinsic::amdgcn_global_atomic_fmin:
1463 case Intrinsic::amdgcn_global_atomic_fmin_num:
1464 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1465 case Intrinsic::amdgcn_global_load_tr:
1466 Ptr = II->getArgOperand(0);
1467 break;
1468 case Intrinsic::amdgcn_global_load_lds:
1469 Ptr = II->getArgOperand(1);
1470 break;
1471 default:
1472 return false;
1473 }
1474 AccessTy = II->getType();
1475 Ops.push_back(Ptr);
1476 return true;
1477}
1478
1479bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1480 unsigned AddrSpace,
1481 uint64_t FlatVariant) const {
1482 if (!Subtarget->hasFlatInstOffsets()) {
1483 // Flat instructions do not have offsets, and only have the register
1484 // address.
1485 return AM.BaseOffs == 0 && AM.Scale == 0;
1486 }
1487
1488 return AM.Scale == 0 &&
1489 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1490 AM.BaseOffs, AddrSpace, FlatVariant));
1491}
1492
1494 if (Subtarget->hasFlatGlobalInsts())
1495 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1497
1498 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
 1499 // Assume we will use FLAT for all global memory accesses
1500 // on VI.
1501 // FIXME: This assumption is currently wrong. On VI we still use
1502 // MUBUF instructions for the r + i addressing mode. As currently
1503 // implemented, the MUBUF instructions only work on buffer < 4GB.
1504 // It may be possible to support > 4GB buffers with MUBUF instructions,
1505 // by setting the stride value in the resource descriptor which would
1506 // increase the size limit to (stride * 4GB). However, this is risky,
1507 // because it has never been validated.
1508 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1510 }
1511
1512 return isLegalMUBUFAddressingMode(AM);
1513}
1514
1515bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1516 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1517 // additionally can do r + r + i with addr64. 32-bit has more addressing
1518 // mode options. Depending on the resource constant, it can also do
1519 // (i64 r0) + (i32 r1) * (i14 i).
1520 //
1521 // Private arrays end up using a scratch buffer most of the time, so also
1522 // assume those use MUBUF instructions. Scratch loads / stores are currently
1523 // implemented as mubuf instructions with offen bit set, so slightly
1524 // different than the normal addr64.
1525 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1526 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1527 return false;
1528
1529 // FIXME: Since we can split immediate into soffset and immediate offset,
1530 // would it make sense to allow any immediate?
1531
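  // For example, r + r + i is accepted via the Scale == 1 case below, while
  // n * r for any scale other than 0, 1 or 2 is always rejected.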
1532 switch (AM.Scale) {
1533 case 0: // r + i or just i, depending on HasBaseReg.
1534 return true;
1535 case 1:
1536 return true; // We have r + r or r + i.
1537 case 2:
1538 if (AM.HasBaseReg) {
1539 // Reject 2 * r + r.
1540 return false;
1541 }
1542
 1543 // Allow 2 * r as r + r,
 1544 // or 2 * r + i as r + r + i.
1545 return true;
1546 default: // Don't allow n * r
1547 return false;
1548 }
1549}
1550
1552 const AddrMode &AM, Type *Ty,
1553 unsigned AS, Instruction *I) const {
1554 // No global is ever allowed as a base.
1555 if (AM.BaseGV)
1556 return false;
1557
1558 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1559 return isLegalGlobalAddressingMode(AM);
1560
1561 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1565 // If the offset isn't a multiple of 4, it probably isn't going to be
1566 // correctly aligned.
1567 // FIXME: Can we get the real alignment here?
1568 if (AM.BaseOffs % 4 != 0)
1569 return isLegalMUBUFAddressingMode(AM);
1570
1571 if (!Subtarget->hasScalarSubwordLoads()) {
1572 // There are no SMRD extloads, so if we have to do a small type access we
1573 // will use a MUBUF load.
1574 // FIXME?: We also need to do this if unaligned, but we don't know the
1575 // alignment here.
1576 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1577 return isLegalGlobalAddressingMode(AM);
1578 }
1579
1581 // SMRD instructions have an 8-bit, dword offset on SI.
1582 if (!isUInt<8>(AM.BaseOffs / 4))
1583 return false;
1584 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1585 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1586 // in 8-bits, it can use a smaller encoding.
1587 if (!isUInt<32>(AM.BaseOffs / 4))
1588 return false;
1589 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1590 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1591 if (!isUInt<20>(AM.BaseOffs))
1592 return false;
1593 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1594 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1595 // for S_BUFFER_* instructions).
1596 if (!isInt<21>(AM.BaseOffs))
1597 return false;
1598 } else {
1599 // On GFX12, all offsets are signed 24-bit in bytes.
1600 if (!isInt<24>(AM.BaseOffs))
1601 return false;
1602 }
1603
1604 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1605 return true;
1606
1607 if (AM.Scale == 1 && AM.HasBaseReg)
1608 return true;
1609
1610 return false;
1611 }
1612
1613 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1614 return Subtarget->enableFlatScratch()
1615 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1617 : isLegalMUBUFAddressingMode(AM);
1618
1619 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1620 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1621 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1622 // field.
1623 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1624 // an 8-bit dword offset but we don't know the alignment here.
1625 if (!isUInt<16>(AM.BaseOffs))
1626 return false;
1627
1628 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1629 return true;
1630
1631 if (AM.Scale == 1 && AM.HasBaseReg)
1632 return true;
1633
1634 return false;
1635 }
1636
1638 // For an unknown address space, this usually means that this is for some
1639 // reason being used for pure arithmetic, and not based on some addressing
1640 // computation. We don't have instructions that compute pointers with any
1641 // addressing modes, so treat them as having no offset like flat
1642 // instructions.
1643 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1645 }
1646
1647 // Assume a user alias of global for unknown address spaces.
1648 return isLegalGlobalAddressingMode(AM);
1649}
1650
1652 const MachineFunction &MF) const {
1654 return (MemVT.getSizeInBits() <= 4 * 32);
1655 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1656 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1657 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1658 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1659 return (MemVT.getSizeInBits() <= 2 * 32);
1660 }
1661 return true;
1662}
1663
1665 unsigned Size, unsigned AddrSpace, Align Alignment,
1666 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1667 if (IsFast)
1668 *IsFast = 0;
1669
1670 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1671 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1672 // Check if alignment requirements for ds_read/write instructions are
1673 // disabled.
1674 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1675 return false;
1676
1677 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1678 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1679 Alignment < RequiredAlignment)
1680 return false;
1681
 1682 // Either the alignment requirements are "enabled", or there is an
 1683 // unaligned LDS access related hardware bug even though alignment requirements
1684 // are "disabled". In either case, we need to check for proper alignment
1685 // requirements.
1686 //
1687 switch (Size) {
1688 case 64:
1689 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1690 // address is negative, then the instruction is incorrectly treated as
1691 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1692 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1693 // load later in the SILoadStoreOptimizer.
1694 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1695 return false;
1696
 1697 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
 1698 // can do a 4-byte aligned, 8-byte access in a single operation using
1699 // ds_read2/write2_b32 with adjacent offsets.
1700 RequiredAlignment = Align(4);
1701
1702 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1703 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1704 // ds_write2_b32 depending on the alignment. In either case with either
1705 // alignment there is no faster way of doing this.
1706
1707 // The numbers returned here and below are not additive, it is a 'speed
1708 // rank'. They are just meant to be compared to decide if a certain way
1709 // of lowering an operation is faster than another. For that purpose
 1710 // a naturally aligned operation gets its bitsize to indicate that "it
1711 // operates with a speed comparable to N-bit wide load". With the full
1712 // alignment ds128 is slower than ds96 for example. If underaligned it
1713 // is comparable to a speed of a single dword access, which would then
1714 // mean 32 < 128 and it is faster to issue a wide load regardless.
 1715 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
 1716 // wider load which will no longer be aligned, the latter is slower.
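      // For this 64-bit case, RequiredAlignment is 4: a dword-aligned (or
      // better) access reports 64, anything narrower reports 32.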
1717 if (IsFast)
1718 *IsFast = (Alignment >= RequiredAlignment) ? 64
1719 : (Alignment < Align(4)) ? 32
1720 : 1;
1721 return true;
1722 }
1723
1724 break;
1725 case 96:
1726 if (!Subtarget->hasDS96AndDS128())
1727 return false;
1728
 1729 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1730 // gfx8 and older.
1731
1732 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1733 // Naturally aligned access is fastest. However, also report it is Fast
1734 // if memory is aligned less than DWORD. A narrow load or store will be
 1735 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1736 // be more of them, so overall we will pay less penalty issuing a single
1737 // instruction.
1738
1739 // See comment on the values above.
1740 if (IsFast)
1741 *IsFast = (Alignment >= RequiredAlignment) ? 96
1742 : (Alignment < Align(4)) ? 32
1743 : 1;
1744 return true;
1745 }
1746
1747 break;
1748 case 128:
1749 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1750 return false;
1751
 1752 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
 1753 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1754 // single operation using ds_read2/write2_b64.
1755 RequiredAlignment = Align(8);
1756
1757 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1758 // Naturally aligned access is fastest. However, also report it is Fast
1759 // if memory is aligned less than DWORD. A narrow load or store will be
 1760 // equally slow as a single ds_read_b128/ds_write_b128, but there
1761 // will be more of them, so overall we will pay less penalty issuing a
1762 // single instruction.
1763
1764 // See comment on the values above.
1765 if (IsFast)
1766 *IsFast = (Alignment >= RequiredAlignment) ? 128
1767 : (Alignment < Align(4)) ? 32
1768 : 1;
1769 return true;
1770 }
1771
1772 break;
1773 default:
1774 if (Size > 32)
1775 return false;
1776
1777 break;
1778 }
1779
1780 // See comment on the values above.
1781 // Note that we have a single-dword or sub-dword here, so if underaligned
 1782 // it is the slowest possible access, hence the returned value is 0.
1783 if (IsFast)
1784 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1785
1786 return Alignment >= RequiredAlignment ||
1787 Subtarget->hasUnalignedDSAccessEnabled();
1788 }
1789
1790 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1791 bool AlignedBy4 = Alignment >= Align(4);
1792 if (IsFast)
1793 *IsFast = AlignedBy4;
1794
1795 return AlignedBy4 ||
1796 Subtarget->enableFlatScratch() ||
1797 Subtarget->hasUnalignedScratchAccess();
1798 }
1799
1800 // FIXME: We have to be conservative here and assume that flat operations
1801 // will access scratch. If we had access to the IR function, then we
1802 // could determine if any private memory was used in the function.
1803 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1804 !Subtarget->hasUnalignedScratchAccess()) {
1805 bool AlignedBy4 = Alignment >= Align(4);
1806 if (IsFast)
1807 *IsFast = AlignedBy4;
1808
1809 return AlignedBy4;
1810 }
1811
1812 // So long as they are correct, wide global memory operations perform better
1813 // than multiple smaller memory ops -- even when misaligned
1814 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1815 if (IsFast)
1816 *IsFast = Size;
1817
1818 return Alignment >= Align(4) ||
1820 }
1821
 1822 // Smaller-than-dword values must be aligned.
1823 if (Size < 32)
1824 return false;
1825
1826 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1827 // byte-address are ignored, thus forcing Dword alignment.
1828 // This applies to private, global, and constant memory.
1829 if (IsFast)
1830 *IsFast = 1;
1831
1832 return Size >= 32 && Alignment >= Align(4);
1833}
1834
1836 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1837 unsigned *IsFast) const {
1839 Alignment, Flags, IsFast);
1840}
1841
1843 const MemOp &Op, const AttributeList &FuncAttributes) const {
1844 // FIXME: Should account for address space here.
1845
1846 // The default fallback uses the private pointer size as a guess for a type to
1847 // use. Make sure we switch these to 64-bit accesses.
1848
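  // For example, a 16-byte or larger copy whose destination is dword aligned
  // is widened to v4i32 accesses below.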
1849 if (Op.size() >= 16 &&
1850 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1851 return MVT::v4i32;
1852
1853 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1854 return MVT::v2i32;
1855
1856 // Use the default.
1857 return MVT::Other;
1858}
1859
1861 const MemSDNode *MemNode = cast<MemSDNode>(N);
1862 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1863}
1864
1866 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1868}
1869
1871 unsigned DestAS) const {
1872 // Flat -> private/local is a simple truncate.
 1873 // Flat -> global is a no-op.
1874 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1875 return true;
1876
1877 const GCNTargetMachine &TM =
1878 static_cast<const GCNTargetMachine &>(getTargetMachine());
1879 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1880}
1881
1883 const MemSDNode *MemNode = cast<MemSDNode>(N);
1884
1886}
1887
1890 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1891 VT.getScalarType().bitsLE(MVT::i16))
1894}
1895
1897 Type *Ty) const {
1898 // FIXME: Could be smarter if called for vector constants.
1899 return true;
1900}
1901
1903 unsigned Index) const {
1905 return false;
1906
1907 // TODO: Add more cases that are cheap.
1908 return Index == 0;
1909}
1910
1912 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1913 switch (Op) {
1914 case ISD::LOAD:
1915 case ISD::STORE:
1916
1917 // These operations are done with 32-bit instructions anyway.
1918 case ISD::AND:
1919 case ISD::OR:
1920 case ISD::XOR:
1921 case ISD::SELECT:
1922 // TODO: Extensions?
1923 return true;
1924 default:
1925 return false;
1926 }
1927 }
1928
1929 // SimplifySetCC uses this function to determine whether or not it should
1930 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1931 if (VT == MVT::i1 && Op == ISD::SETCC)
1932 return false;
1933
1935}
1936
1937SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1938 const SDLoc &SL,
1939 SDValue Chain,
1940 uint64_t Offset) const {
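  // Kernel arguments live in the kernarg segment; this returns a pointer to
  // kernarg_segment_ptr + Offset (or just the offset if the segment pointer
  // argument is absent).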
1941 const DataLayout &DL = DAG.getDataLayout();
1944
1945 const ArgDescriptor *InputPtrReg;
1946 const TargetRegisterClass *RC;
1947 LLT ArgTy;
1949
1950 std::tie(InputPtrReg, RC, ArgTy) =
1952
1953 // We may not have the kernarg segment argument if we have no kernel
1954 // arguments.
1955 if (!InputPtrReg)
1956 return DAG.getConstant(Offset, SL, PtrVT);
1957
1959 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1960 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1961
1962 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1963}
1964
1965SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1966 const SDLoc &SL) const {
1969 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1970}
1971
1972SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1973 const SDLoc &SL) const {
1974
1976 std::optional<uint32_t> KnownSize =
1978 if (KnownSize.has_value())
1979 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1980 return SDValue();
1981}
1982
1983SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1984 const SDLoc &SL, SDValue Val,
1985 bool Signed,
1986 const ISD::InputArg *Arg) const {
1987 // First, if it is a widened vector, narrow it.
1988 if (VT.isVector() &&
1990 EVT NarrowedVT =
1993 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1994 DAG.getConstant(0, SL, MVT::i32));
1995 }
1996
1997 // Then convert the vector elements or scalar value.
1998 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1999 VT.bitsLT(MemVT)) {
2000 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2001 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2002 }
2003
2004 if (MemVT.isFloatingPoint())
2005 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2006 else if (Signed)
2007 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2008 else
2009 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2010
2011 return Val;
2012}
2013
2014SDValue SITargetLowering::lowerKernargMemParameter(
2015 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2016 uint64_t Offset, Align Alignment, bool Signed,
2017 const ISD::InputArg *Arg) const {
2019
2020 // Try to avoid using an extload by loading earlier than the argument
2021 // address and extracting the relevant bits. The load should hopefully be
2022 // merged with the load for the previous argument.
2023 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2024 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2025 int64_t AlignDownOffset = alignDown(Offset, 4);
2026 int64_t OffsetDiff = Offset - AlignDownOffset;
2027
2028 EVT IntVT = MemVT.changeTypeToInteger();
2029
2030 // TODO: If we passed in the base kernel offset we could have a better
2031 // alignment than 4, but we don't really need it.
2032 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2033 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2036
2037 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2038 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2039
2040 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2041 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2042 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2043
2044
2045 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2046 }
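// Worked example for the aligned-down path above: a 2-byte argument at
// Offset = 6 gives AlignDownOffset = 4 and OffsetDiff = 2. A single dword is
// loaded from offset 4, shifted right by OffsetDiff * 8 = 16 bits, truncated
// to the 16-bit integer type, and bitcast back to MemVT, so no extending load
// is needed.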
2047
2048 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2049 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2052
2053 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2054 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2055}
2056
2057SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2058 const SDLoc &SL, SDValue Chain,
2059 const ISD::InputArg &Arg) const {
2061 MachineFrameInfo &MFI = MF.getFrameInfo();
2062
2063 if (Arg.Flags.isByVal()) {
2064 unsigned Size = Arg.Flags.getByValSize();
2065 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2066 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2067 }
2068
2069 unsigned ArgOffset = VA.getLocMemOffset();
2070 unsigned ArgSize = VA.getValVT().getStoreSize();
2071
2072 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2073
2074 // Create load nodes to retrieve arguments from the stack.
2075 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2076 SDValue ArgValue;
2077
2078 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2080 MVT MemVT = VA.getValVT();
2081
2082 switch (VA.getLocInfo()) {
2083 default:
2084 break;
2085 case CCValAssign::BCvt:
2086 MemVT = VA.getLocVT();
2087 break;
2088 case CCValAssign::SExt:
2089 ExtType = ISD::SEXTLOAD;
2090 break;
2091 case CCValAssign::ZExt:
2092 ExtType = ISD::ZEXTLOAD;
2093 break;
2094 case CCValAssign::AExt:
2095 ExtType = ISD::EXTLOAD;
2096 break;
2097 }
2098
2099 ArgValue = DAG.getExtLoad(
2100 ExtType, SL, VA.getLocVT(), Chain, FIN,
2102 MemVT);
2103 return ArgValue;
2104}
2105
2106SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2107 const SIMachineFunctionInfo &MFI,
2108 EVT VT,
2110 const ArgDescriptor *Reg = nullptr;
2111 const TargetRegisterClass *RC;
2112 LLT Ty;
2113
2115 const ArgDescriptor WorkGroupIDX =
2116 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2117 // If GridZ is not programmed in an entry function then the hardware will set
2118 // it to all zeros, so there is no need to mask the GridY value in the low
2119 // order bits.
2120 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2121 AMDGPU::TTMP7,
2122 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2123 const ArgDescriptor WorkGroupIDZ =
2124 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
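// The packed layout implied by the masks above: TTMP9 holds workgroup ID X,
// while TTMP7 holds Y in bits [15:0] (mask 0xFFFF) and Z in bits [31:16]
// (mask 0xFFFF0000). When GridZ is not programmed in an entry function, Y can
// take the whole register (the ~0u mask) because the hardware zeroes the
// upper half.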
2125 if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
2126 switch (PVID) {
2128 Reg = &WorkGroupIDX;
2129 RC = &AMDGPU::SReg_32RegClass;
2130 Ty = LLT::scalar(32);
2131 break;
2133 Reg = &WorkGroupIDY;
2134 RC = &AMDGPU::SReg_32RegClass;
2135 Ty = LLT::scalar(32);
2136 break;
2138 Reg = &WorkGroupIDZ;
2139 RC = &AMDGPU::SReg_32RegClass;
2140 Ty = LLT::scalar(32);
2141 break;
2142 default:
2143 break;
2144 }
2145 }
2146
2147 if (!Reg)
2148 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2149 if (!Reg) {
2151 // It's possible for a kernarg intrinsic call to appear in a kernel with
2152 // no allocated segment, in which case we do not add the user sgpr
2153 // argument, so just return null.
2154 return DAG.getConstant(0, SDLoc(), VT);
2155 }
2156
2157 // It's undefined behavior if a function marked with the amdgpu-no-*
2158 // attributes uses the corresponding intrinsic.
2159 return DAG.getUNDEF(VT);
2160 }
2161
2162 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2163}
2164
2166 CallingConv::ID CallConv,
2167 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2168 FunctionType *FType,
2169 SIMachineFunctionInfo *Info) {
2170 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2171 const ISD::InputArg *Arg = &Ins[I];
2172
2173 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2174 "vector type argument should have been split");
2175
2176 // First check if it's a PS input addr.
2177 if (CallConv == CallingConv::AMDGPU_PS &&
2178 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2179 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2180
2181 // Inconveniently only the first part of the split is marked as isSplit,
2182 // so skip to the end. We only want to increment PSInputNum once for the
2183 // entire split argument.
2184 if (Arg->Flags.isSplit()) {
2185 while (!Arg->Flags.isSplitEnd()) {
2186 assert((!Arg->VT.isVector() ||
2187 Arg->VT.getScalarSizeInBits() == 16) &&
2188 "unexpected vector split in ps argument type");
2189 if (!SkipArg)
2190 Splits.push_back(*Arg);
2191 Arg = &Ins[++I];
2192 }
2193 }
2194
2195 if (SkipArg) {
2196 // We can safely skip PS inputs.
2197 Skipped.set(Arg->getOrigArgIndex());
2198 ++PSInputNum;
2199 continue;
2200 }
2201
2202 Info->markPSInputAllocated(PSInputNum);
2203 if (Arg->Used)
2204 Info->markPSInputEnabled(PSInputNum);
2205
2206 ++PSInputNum;
2207 }
2208
2209 Splits.push_back(*Arg);
2210 }
2211}
2212
2213// Allocate special inputs passed in VGPRs.
2215 MachineFunction &MF,
2216 const SIRegisterInfo &TRI,
2217 SIMachineFunctionInfo &Info) const {
2218 const LLT S32 = LLT::scalar(32);
2220
2221 if (Info.hasWorkItemIDX()) {
2222 Register Reg = AMDGPU::VGPR0;
2223 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2224
2225 CCInfo.AllocateReg(Reg);
2226 unsigned Mask = (Subtarget->hasPackedTID() &&
2227 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2228 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2229 }
2230
2231 if (Info.hasWorkItemIDY()) {
2232 assert(Info.hasWorkItemIDX());
2233 if (Subtarget->hasPackedTID()) {
2234 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2235 0x3ff << 10));
2236 } else {
2237 unsigned Reg = AMDGPU::VGPR1;
2238 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2239
2240 CCInfo.AllocateReg(Reg);
2241 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2242 }
2243 }
2244
2245 if (Info.hasWorkItemIDZ()) {
2246 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2247 if (Subtarget->hasPackedTID()) {
2248 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2249 0x3ff << 20));
2250 } else {
2251 unsigned Reg = AMDGPU::VGPR2;
2252 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2253
2254 CCInfo.AllocateReg(Reg);
2255 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2256 }
2257 }
2258}
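// With packed TIDs the three workitem IDs share VGPR0 using the 0x3ff masks
// above: X occupies bits [9:0], Y bits [19:10], and Z bits [29:20]. Without
// packed TIDs each ID gets its own VGPR (VGPR0, VGPR1, VGPR2).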
2259
2260 // Try to allocate a VGPR at the end of the argument list, or if no argument
2261 // VGPRs are left, allocate a stack slot instead.
2262 // If \p Mask is given, it indicates the bitfield position in the register.
2263 // If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2264static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2265 ArgDescriptor Arg = ArgDescriptor()) {
2266 if (Arg.isSet())
2267 return ArgDescriptor::createArg(Arg, Mask);
2268
2269 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2270 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2271 if (RegIdx == ArgVGPRs.size()) {
2272 // Spill to stack required.
2273 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2274
2275 return ArgDescriptor::createStack(Offset, Mask);
2276 }
2277
2278 unsigned Reg = ArgVGPRs[RegIdx];
2279 Reg = CCInfo.AllocateReg(Reg);
2280 assert(Reg != AMDGPU::NoRegister);
2281
2282 MachineFunction &MF = CCInfo.getMachineFunction();
2283 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2284 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2285 return ArgDescriptor::createRegister(Reg, Mask);
2286}
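// Note on the fallback above: only the first 32 VGPRs (VGPR0..VGPR31) are used
// for arguments here, so once they are exhausted the value is instead passed
// through a 4-byte, 4-byte-aligned stack slot.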
2287
2289 const TargetRegisterClass *RC,
2290 unsigned NumArgRegs) {
2291 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2292 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2293 if (RegIdx == ArgSGPRs.size())
2294 report_fatal_error("ran out of SGPRs for arguments");
2295
2296 unsigned Reg = ArgSGPRs[RegIdx];
2297 Reg = CCInfo.AllocateReg(Reg);
2298 assert(Reg != AMDGPU::NoRegister);
2299
2300 MachineFunction &MF = CCInfo.getMachineFunction();
2301 MF.addLiveIn(Reg, RC);
2303}
2304
2305 // If this has a fixed position, we should still allocate the register in the
2306 // CCInfo state. Technically we could get away with not doing so for values
2307 // passed outside of the normal argument range.
2309 const TargetRegisterClass *RC,
2310 MCRegister Reg) {
2311 Reg = CCInfo.AllocateReg(Reg);
2312 assert(Reg != AMDGPU::NoRegister);
2313 MachineFunction &MF = CCInfo.getMachineFunction();
2314 MF.addLiveIn(Reg, RC);
2315}
2316
2317static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2318 if (Arg) {
2319 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2320 Arg.getRegister());
2321 } else
2322 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2323}
2324
2325static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2326 if (Arg) {
2327 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2328 Arg.getRegister());
2329 } else
2330 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2331}
2332
2333/// Allocate implicit function VGPR arguments at the end of allocated user
2334/// arguments.
2336 CCState &CCInfo, MachineFunction &MF,
2337 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2338 const unsigned Mask = 0x3ff;
2339 ArgDescriptor Arg;
2340
2341 if (Info.hasWorkItemIDX()) {
2342 Arg = allocateVGPR32Input(CCInfo, Mask);
2343 Info.setWorkItemIDX(Arg);
2344 }
2345
2346 if (Info.hasWorkItemIDY()) {
2347 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2348 Info.setWorkItemIDY(Arg);
2349 }
2350
2351 if (Info.hasWorkItemIDZ())
2352 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2353}
2354
2355/// Allocate implicit function VGPR arguments in fixed registers.
2357 CCState &CCInfo, MachineFunction &MF,
2358 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2359 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2360 if (!Reg)
2361 report_fatal_error("failed to allocate VGPR for implicit arguments");
2362
2363 const unsigned Mask = 0x3ff;
2364 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2365 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2366 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2367}
2368
2370 CCState &CCInfo,
2371 MachineFunction &MF,
2372 const SIRegisterInfo &TRI,
2373 SIMachineFunctionInfo &Info) const {
2374 auto &ArgInfo = Info.getArgInfo();
2375 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2376
2377 // TODO: Unify handling with private memory pointers.
2378 if (UserSGPRInfo.hasDispatchPtr())
2379 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2380
2381 const Module *M = MF.getFunction().getParent();
2382 if (UserSGPRInfo.hasQueuePtr() &&
2384 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2385
2386 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2387 // constant offset from the kernarg segment.
2388 if (Info.hasImplicitArgPtr())
2389 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2390
2391 if (UserSGPRInfo.hasDispatchID())
2392 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2393
2394 // flat_scratch_init is not applicable for non-kernel functions.
2395
2396 if (Info.hasWorkGroupIDX())
2397 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2398
2399 if (Info.hasWorkGroupIDY())
2400 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2401
2402 if (Info.hasWorkGroupIDZ())
2403 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2404
2405 if (Info.hasLDSKernelId())
2406 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2407}
2408
2409// Allocate special inputs passed in user SGPRs.
2411 MachineFunction &MF,
2412 const SIRegisterInfo &TRI,
2413 SIMachineFunctionInfo &Info) const {
2414 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2415 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2416 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2417 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2418 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2419 }
2420
2421 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2422 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2423 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2424 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2425 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2426 }
2427
2428 if (UserSGPRInfo.hasDispatchPtr()) {
2429 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2430 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2431 CCInfo.AllocateReg(DispatchPtrReg);
2432 }
2433
2434 const Module *M = MF.getFunction().getParent();
2435 if (UserSGPRInfo.hasQueuePtr() &&
2437 Register QueuePtrReg = Info.addQueuePtr(TRI);
2438 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2439 CCInfo.AllocateReg(QueuePtrReg);
2440 }
2441
2442 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2444 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2445 CCInfo.AllocateReg(InputPtrReg);
2446
2447 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2448 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2449 }
2450
2451 if (UserSGPRInfo.hasDispatchID()) {
2452 Register DispatchIDReg = Info.addDispatchID(TRI);
2453 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2454 CCInfo.AllocateReg(DispatchIDReg);
2455 }
2456
2457 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2458 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2459 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2460 CCInfo.AllocateReg(FlatScratchInitReg);
2461 }
2462
2463 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2464 // these from the dispatch pointer.
2465}
2466
2467 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2468 // sequential starting from the first argument.
2470 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2472 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2473 Function &F = MF.getFunction();
2474 unsigned LastExplicitArgOffset =
2475 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2476 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2477 bool InPreloadSequence = true;
2478 unsigned InIdx = 0;
2479 for (auto &Arg : F.args()) {
2480 if (!InPreloadSequence || !Arg.hasInRegAttr())
2481 break;
2482
2483 int ArgIdx = Arg.getArgNo();
2484 // Don't preload non-original args or parts not in the current preload
2485 // sequence.
2486 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2487 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2488 break;
2489
2490 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2491 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2492 InIdx++) {
2493 assert(ArgLocs[ArgIdx].isMemLoc());
2494 auto &ArgLoc = ArgLocs[InIdx];
2495 const Align KernelArgBaseAlign = Align(16);
2496 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2497 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2498 unsigned NumAllocSGPRs =
2499 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2500
2501 // Arg is preloaded into the previous SGPR.
2502 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2503 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2504 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2505 continue;
2506 }
2507
2508 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2509 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2510 // Check for free user SGPRs for preloading.
2511 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2512 SGPRInfo.getNumFreeUserSGPRs()) {
2513 InPreloadSequence = false;
2514 break;
2515 }
2516
2517 // Preload this argument.
2518 const TargetRegisterClass *RC =
2519 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2520 SmallVectorImpl<MCRegister> *PreloadRegs =
2521 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2522
2523 if (PreloadRegs->size() > 1)
2524 RC = &AMDGPU::SGPR_32RegClass;
2525 for (auto &Reg : *PreloadRegs) {
2526 assert(Reg);
2527 MF.addLiveIn(Reg, RC);
2528 CCInfo.AllocateReg(Reg);
2529 }
2530
2531 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2532 }
2533 }
2534}
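// Example of the SGPR accounting above: with LastExplicitArgOffset = 16, an
// i64 argument at ArgOffset = 24 has Padding = 8, so PaddingSGPRs = 2, and
// NumAllocSGPRs = 64 / 32 = 2. Preloading proceeds only if
// PaddingSGPRs + NumAllocSGPRs + 1 free user SGPRs are still available;
// otherwise the preload sequence stops and the remaining arguments are loaded
// from the kernarg segment as usual.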
2535
2537 const SIRegisterInfo &TRI,
2538 SIMachineFunctionInfo &Info) const {
2539 // Always allocate this last since it is a synthetic preload.
2540 if (Info.hasLDSKernelId()) {
2541 Register Reg = Info.addLDSKernelId();
2542 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2543 CCInfo.AllocateReg(Reg);
2544 }
2545}
2546
2547// Allocate special input registers that are initialized per-wave.
2549 MachineFunction &MF,
2551 CallingConv::ID CallConv,
2552 bool IsShader) const {
2553 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2554 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2555 // Note: user SGPRs are handled by the front-end for graphics shaders
2556 // Pad up the used user SGPRs with dead inputs.
2557
2558 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2559 // before enabling architected SGPRs for workgroup IDs.
2560 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2561
2562 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2563 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2564 // rely on it to reach 16 since if we end up having no stack usage, it will
2565 // not really be added.
2566 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2567 Info.hasWorkGroupIDY() +
2568 Info.hasWorkGroupIDZ() +
2569 Info.hasWorkGroupInfo();
2570 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2571 Register Reg = Info.addReservedUserSGPR();
2572 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2573 CCInfo.AllocateReg(Reg);
2574 }
2575 }
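// For example, with the workaround above, 6 user SGPRs plus 3 required system
// SGPRs (say workgroup IDs X and Y plus the workgroup info) leaves
// 16 - 9 = 7 dead SGPR inputs to be padded in by the loop.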
2576
2577 if (!HasArchitectedSGPRs) {
2578 if (Info.hasWorkGroupIDX()) {
2579 Register Reg = Info.addWorkGroupIDX();
2580 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2581 CCInfo.AllocateReg(Reg);
2582 }
2583
2584 if (Info.hasWorkGroupIDY()) {
2585 Register Reg = Info.addWorkGroupIDY();
2586 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2587 CCInfo.AllocateReg(Reg);
2588 }
2589
2590 if (Info.hasWorkGroupIDZ()) {
2591 Register Reg = Info.addWorkGroupIDZ();
2592 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2593 CCInfo.AllocateReg(Reg);
2594 }
2595 }
2596
2597 if (Info.hasWorkGroupInfo()) {
2598 Register Reg = Info.addWorkGroupInfo();
2599 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2600 CCInfo.AllocateReg(Reg);
2601 }
2602
2603 if (Info.hasPrivateSegmentWaveByteOffset()) {
2604 // Scratch wave offset passed in system SGPR.
2605 unsigned PrivateSegmentWaveByteOffsetReg;
2606
2607 if (IsShader) {
2608 PrivateSegmentWaveByteOffsetReg =
2609 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2610
2611 // This is true if the scratch wave byte offset doesn't have a fixed
2612 // location.
2613 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2614 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2615 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2616 }
2617 } else
2618 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2619
2620 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2621 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2622 }
2623
2624 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2625 Info.getNumPreloadedSGPRs() >= 16);
2626}
2627
2629 MachineFunction &MF,
2630 const SIRegisterInfo &TRI,
2631 SIMachineFunctionInfo &Info) {
2632 // Now that we've figured out where the scratch register inputs are, see if we
2633 // should reserve the arguments and use them directly.
2634 MachineFrameInfo &MFI = MF.getFrameInfo();
2635 bool HasStackObjects = MFI.hasStackObjects();
2636 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2637
2638 // Record that we know we have non-spill stack objects so we don't need to
2639 // check all stack objects later.
2640 if (HasStackObjects)
2641 Info.setHasNonSpillStackObjects(true);
2642
2643 // Everything live out of a block is spilled with fast regalloc, so it's
2644 // almost certain that spilling will be required.
2645 if (TM.getOptLevel() == CodeGenOptLevel::None)
2646 HasStackObjects = true;
2647
2648 // For now assume stack access is needed in any callee functions, so we need
2649 // to pass in the scratch registers.
2650 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2651
2652 if (!ST.enableFlatScratch()) {
2653 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2654 // If we have stack objects, we unquestionably need the private buffer
2655 // resource. For the Code Object V2 ABI, this will be the first 4 user
2656 // SGPR inputs. We can reserve those and use them directly.
2657
2658 Register PrivateSegmentBufferReg =
2660 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2661 } else {
2662 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2663 // We tentatively reserve the last registers (skipping the ones which may
2664 // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
2665 // replace these with the ones immediately after those which were really
2666 // allocated. In the prologue, copies will be inserted from the argument to
2667 // these reserved registers.
2668
2669 // Without HSA, relocations are used for the scratch pointer and the
2670 // buffer resource setup is always inserted in the prologue. Scratch wave
2671 // offset is still in an input SGPR.
2672 Info.setScratchRSrcReg(ReservedBufferReg);
2673 }
2674 }
2675
2677
2678 // For entry functions we have to set up the stack pointer if we use it,
2679 // whereas non-entry functions get this "for free". This means there is no
2680 // intrinsic advantage to using S32 over S34 in cases where we do not have
2681 // calls but do need a frame pointer (i.e. if we are requested to have one
2682 // because frame pointer elimination is disabled). To keep things simple we
2683 // only ever use S32 as the call ABI stack pointer, and so using it does not
2684 // imply we need a separate frame pointer.
2685 //
2686 // Try to use s32 as the SP, but move it if it would interfere with input
2687 // arguments. This won't work with calls though.
2688 //
2689 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2690 // registers.
2691 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2692 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2693 } else {
2695
2696 if (MFI.hasCalls())
2697 report_fatal_error("call in graphics shader with too many input SGPRs");
2698
2699 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2700 if (!MRI.isLiveIn(Reg)) {
2701 Info.setStackPtrOffsetReg(Reg);
2702 break;
2703 }
2704 }
2705
2706 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2707 report_fatal_error("failed to find register for SP");
2708 }
2709
2710 // hasFP should be accurate for entry functions even before the frame is
2711 // finalized, because it does not rely on the known stack size, only
2712 // properties like whether variable sized objects are present.
2713 if (ST.getFrameLowering()->hasFP(MF)) {
2714 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2715 }
2716}
2717
2720 return !Info->isEntryFunction();
2721}
2722
2724
2725}
2726
2728 MachineBasicBlock *Entry,
2729 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2731
2732 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2733 if (!IStart)
2734 return;
2735
2736 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2737 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2738 MachineBasicBlock::iterator MBBI = Entry->begin();
2739 for (const MCPhysReg *I = IStart; *I; ++I) {
2740 const TargetRegisterClass *RC = nullptr;
2741 if (AMDGPU::SReg_64RegClass.contains(*I))
2742 RC = &AMDGPU::SGPR_64RegClass;
2743 else if (AMDGPU::SReg_32RegClass.contains(*I))
2744 RC = &AMDGPU::SGPR_32RegClass;
2745 else
2746 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2747
2748 Register NewVR = MRI->createVirtualRegister(RC);
2749 // Create copy from CSR to a virtual register.
2750 Entry->addLiveIn(*I);
2751 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2752 .addReg(*I);
2753
2754 // Insert the copy-back instructions right before the terminator.
2755 for (auto *Exit : Exits)
2756 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2757 TII->get(TargetOpcode::COPY), *I)
2758 .addReg(NewVR);
2759 }
2760}
2761
2763 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2764 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2765 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2767
2769 const Function &Fn = MF.getFunction();
2772
2773 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2774 DiagnosticInfoUnsupported NoGraphicsHSA(
2775 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2776 DAG.getContext()->diagnose(NoGraphicsHSA);
2777 return DAG.getEntryNode();
2778 }
2779
2782 BitVector Skipped(Ins.size());
2783 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2784 *DAG.getContext());
2785
2786 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2787 bool IsKernel = AMDGPU::isKernel(CallConv);
2788 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2789
2790 if (IsGraphics) {
2791 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2792 assert(!UserSGPRInfo.hasDispatchPtr() &&
2793 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2794 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2795 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2796 (void)UserSGPRInfo;
2797 if (!Subtarget->enableFlatScratch())
2798 assert(!UserSGPRInfo.hasFlatScratchInit());
2799 if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
2800 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2801 !Info->hasWorkGroupIDZ());
2802 }
2803
2804 if (CallConv == CallingConv::AMDGPU_PS) {
2805 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2806
2807 // At least one interpolation mode must be enabled or else the GPU will
2808 // hang.
2809 //
2810 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2811 // set PSInputAddr, the user wants to enable some bits after the compilation
2812 // based on run-time states. Since we can't know what the final PSInputEna
2813 // will look like, we shouldn't do anything here and the user should take
2814 // responsibility for the correct programming.
2815 //
2816 // Otherwise, the following restrictions apply:
2817 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2818 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2819 // enabled too.
2820 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2821 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2822 CCInfo.AllocateReg(AMDGPU::VGPR0);
2823 CCInfo.AllocateReg(AMDGPU::VGPR1);
2824 Info->markPSInputAllocated(0);
2825 Info->markPSInputEnabled(0);
2826 }
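// For example, if a shader only requests POS_W_FLOAT (bit 11), then
// PSInputAddr & 0x7F is 0, so VGPR0/VGPR1 are reserved and PS input 0 (the
// first PERSP_* input) is force-enabled to satisfy the rule that at least one
// interpolation mode is on.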
2827 if (Subtarget->isAmdPalOS()) {
2828 // For isAmdPalOS, the user does not enable some bits after compilation
2829 // based on run-time states; the register values being generated here are
2830 // the final ones set in hardware. Therefore we need to apply the
2831 // workaround to PSInputAddr and PSInputEnable together. (The case where
2832 // a bit is set in PSInputAddr but not PSInputEnable is where the
2833 // frontend set up an input arg for a particular interpolation mode, but
2834 // nothing uses that input arg. Really we should have an earlier pass
2835 // that removes such an arg.)
2836 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2837 if ((PsInputBits & 0x7F) == 0 ||
2838 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2839 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2840 }
2841 } else if (IsKernel) {
2842 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2843 } else {
2844 Splits.append(Ins.begin(), Ins.end());
2845 }
2846
2847 if (IsKernel)
2848 analyzeFormalArgumentsCompute(CCInfo, Ins);
2849
2850 if (IsEntryFunc) {
2851 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2852 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2853 if (IsKernel && Subtarget->hasKernargPreload())
2854 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2855
2856 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2857 } else if (!IsGraphics) {
2858 // For the fixed ABI, pass workitem IDs in the last argument register.
2859 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2860
2861 // FIXME: Sink this into allocateSpecialInputSGPRs
2862 if (!Subtarget->enableFlatScratch())
2863 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2864
2865 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2866 }
2867
2868 if (!IsKernel) {
2869 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2870 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2871 }
2872
2874
2875 // FIXME: This is the minimum kernel argument alignment. We should improve
2876 // this to the maximum alignment of the arguments.
2877 //
2878 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2879 // kern arg offset.
2880 const Align KernelArgBaseAlign = Align(16);
2881
2882 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2883 const ISD::InputArg &Arg = Ins[i];
2884 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2885 InVals.push_back(DAG.getUNDEF(Arg.VT));
2886 continue;
2887 }
2888
2889 CCValAssign &VA = ArgLocs[ArgIdx++];
2890 MVT VT = VA.getLocVT();
2891
2892 if (IsEntryFunc && VA.isMemLoc()) {
2893 VT = Ins[i].VT;
2894 EVT MemVT = VA.getLocVT();
2895
2896 const uint64_t Offset = VA.getLocMemOffset();
2897 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2898
2899 if (Arg.Flags.isByRef()) {
2900 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2901
2902 const GCNTargetMachine &TM =
2903 static_cast<const GCNTargetMachine &>(getTargetMachine());
2904 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2905 Arg.Flags.getPointerAddrSpace())) {
2908 }
2909
2910 InVals.push_back(Ptr);
2911 continue;
2912 }
2913
2914 SDValue NewArg;
2915 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2916 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2917 // In this case the argument is packed into the previous preload SGPR.
2918 int64_t AlignDownOffset = alignDown(Offset, 4);
2919 int64_t OffsetDiff = Offset - AlignDownOffset;
2920 EVT IntVT = MemVT.changeTypeToInteger();
2921
2925 Register Reg =
2926 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2927
2928 assert(Reg);
2929 Register VReg = MRI.getLiveInVirtReg(Reg);
2930 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2931
2932 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2933 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2934
2935 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2936 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2937 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2938 Ins[i].Flags.isSExt(), &Ins[i]);
2939
2940 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2941 } else {
2945 const SmallVectorImpl<MCRegister> &PreloadRegs =
2946 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2947
2948 SDValue Copy;
2949 if (PreloadRegs.size() == 1) {
2950 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2951 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2952 NewArg = DAG.getCopyFromReg(
2953 Chain, DL, VReg,
2955 TRI->getRegSizeInBits(*RC)));
2956
2957 } else {
2958 // If the kernarg alignment does not match the alignment of the SGPR
2959 // tuple RC that can accommodate this argument, it will be built up
2960 // via copies from the individual SGPRs that the argument was
2961 // preloaded to.
2963 for (auto Reg : PreloadRegs) {
2964 Register VReg = MRI.getLiveInVirtReg(Reg);
2965 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2966 Elts.push_back(Copy);
2967 }
2968 NewArg =
2969 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2970 PreloadRegs.size()),
2971 DL, Elts);
2972 }
2973
2974 SDValue CMemVT;
2975 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2976 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2977 else
2978 CMemVT = DAG.getBitcast(MemVT, NewArg);
2979 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2980 Ins[i].Flags.isSExt(), &Ins[i]);
2981 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2982 }
2983 } else {
2984 NewArg =
2985 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2986 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2987 }
2988 Chains.push_back(NewArg.getValue(1));
2989
2990 auto *ParamTy =
2991 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2993 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2994 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2995 // On SI local pointers are just offsets into LDS, so they are always
2996 // less than 16 bits. On CI and newer they could potentially be
2997 // real pointers, so we can't guarantee their size.
2998 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
2999 DAG.getValueType(MVT::i16));
3000 }
3001
3002 InVals.push_back(NewArg);
3003 continue;
3004 } else if (!IsEntryFunc && VA.isMemLoc()) {
3005 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3006 InVals.push_back(Val);
3007 if (!Arg.Flags.isByVal())
3008 Chains.push_back(Val.getValue(1));
3009 continue;
3010 }
3011
3012 assert(VA.isRegLoc() && "Parameter must be in a register!");
3013
3014 Register Reg = VA.getLocReg();
3015 const TargetRegisterClass *RC = nullptr;
3016 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3017 RC = &AMDGPU::VGPR_32RegClass;
3018 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3019 RC = &AMDGPU::SGPR_32RegClass;
3020 else
3021 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3022 EVT ValVT = VA.getValVT();
3023
3024 Reg = MF.addLiveIn(Reg, RC);
3025 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3026
3027 if (Arg.Flags.isSRet()) {
3028 // The return object should be reasonably addressable.
3029
3030 // FIXME: This helps when the return is a real sret. If it is an
3031 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3032 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3033 unsigned NumBits
3035 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3036 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3037 }
3038
3039 // If this is an 8 or 16-bit value, it is really passed promoted
3040 // to 32 bits. Insert an assert[sz]ext to capture this, then
3041 // truncate to the right size.
3042 switch (VA.getLocInfo()) {
3043 case CCValAssign::Full:
3044 break;
3045 case CCValAssign::BCvt:
3046 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3047 break;
3048 case CCValAssign::SExt:
3049 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3050 DAG.getValueType(ValVT));
3051 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3052 break;
3053 case CCValAssign::ZExt:
3054 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3055 DAG.getValueType(ValVT));
3056 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3057 break;
3058 case CCValAssign::AExt:
3059 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3060 break;
3061 default:
3062 llvm_unreachable("Unknown loc info!");
3063 }
3064
3065 InVals.push_back(Val);
3066 }
3067
3068 // Start adding system SGPRs.
3069 if (IsEntryFunc)
3070 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3071
3072 auto &ArgUsageInfo =
3074 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3075
3076 unsigned StackArgSize = CCInfo.getStackSize();
3077 Info->setBytesInStackArgArea(StackArgSize);
3078
3079 return Chains.empty() ? Chain :
3080 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3081}
3082
3083 // TODO: If return values can't fit in registers, we should return as many as
3084 // possible in registers before passing the rest on the stack.
3086 CallingConv::ID CallConv,
3087 MachineFunction &MF, bool IsVarArg,
3089 LLVMContext &Context) const {
3090 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3091 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3092 // for shaders. Vector types should be explicitly handled by CC.
3093 if (AMDGPU::isEntryFunctionCC(CallConv))
3094 return true;
3095
3097 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3098 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3099 return false;
3100
3101 // We must use the stack if return would require unavailable registers.
3102 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3103 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3104 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3105 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3106 return false;
3107
3108 return true;
3109}
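// The loop above rejects lowering in registers when the calling convention
// would assign a return value to a VGPR beyond the subtarget's limit for this
// function (e.g. VGPR32 or higher when only 32 VGPRs are available); in that
// case the return is demoted to sret/stack usage instead.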
3110
3111SDValue
3113 bool isVarArg,
3115 const SmallVectorImpl<SDValue> &OutVals,
3116 const SDLoc &DL, SelectionDAG &DAG) const {
3119
3120 if (AMDGPU::isKernel(CallConv)) {
3121 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3122 OutVals, DL, DAG);
3123 }
3124
3125 bool IsShader = AMDGPU::isShader(CallConv);
3126
3127 Info->setIfReturnsVoid(Outs.empty());
3128 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3129
3130 // CCValAssign - represents the assignment of the return value to a location.
3133
3134 // CCState - Info about the registers and stack slots.
3135 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3136 *DAG.getContext());
3137
3138 // Analyze outgoing return values.
3139 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3140
3141 SDValue Glue;
3143 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3144
3145 // Copy the result values into the output registers.
3146 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3147 ++I, ++RealRVLocIdx) {
3148 CCValAssign &VA = RVLocs[I];
3149 assert(VA.isRegLoc() && "Can only return in registers!");
3150 // TODO: Partially return in registers if return values don't fit.
3151 SDValue Arg = OutVals[RealRVLocIdx];
3152
3153 // Copied from other backends.
3154 switch (VA.getLocInfo()) {
3155 case CCValAssign::Full:
3156 break;
3157 case CCValAssign::BCvt:
3158 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3159 break;
3160 case CCValAssign::SExt:
3161 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3162 break;
3163 case CCValAssign::ZExt:
3164 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3165 break;
3166 case CCValAssign::AExt:
3167 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3168 break;
3169 default:
3170 llvm_unreachable("Unknown loc info!");
3171 }
3172
3173 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3174 Glue = Chain.getValue(1);
3175 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3176 }
3177
3178 // FIXME: Does sret work properly?
3179 if (!Info->isEntryFunction()) {
3180 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3181 const MCPhysReg *I =
3182 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3183 if (I) {
3184 for (; *I; ++I) {
3185 if (AMDGPU::SReg_64RegClass.contains(*I))
3186 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3187 else if (AMDGPU::SReg_32RegClass.contains(*I))
3188 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3189 else
3190 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3191 }
3192 }
3193 }
3194
3195 // Update chain and glue.
3196 RetOps[0] = Chain;
3197 if (Glue.getNode())
3198 RetOps.push_back(Glue);
3199
3200 unsigned Opc = AMDGPUISD::ENDPGM;
3201 if (!IsWaveEnd)
3203 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3204}
3205
3207 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3208 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3209 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3210 SDValue ThisVal) const {
3211 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3212
3213 // Assign locations to each value returned by this call.
3215 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3216 *DAG.getContext());
3217 CCInfo.AnalyzeCallResult(Ins, RetCC);
3218
3219 // Copy all of the result registers out of their specified physreg.
3220 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3221 CCValAssign VA = RVLocs[i];
3222 SDValue Val;
3223
3224 if (VA.isRegLoc()) {
3225 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3226 Chain = Val.getValue(1);
3227 InGlue = Val.getValue(2);
3228 } else if (VA.isMemLoc()) {
3229 report_fatal_error("TODO: return values in memory");
3230 } else
3231 llvm_unreachable("unknown argument location type");
3232
3233 switch (VA.getLocInfo()) {
3234 case CCValAssign::Full:
3235 break;
3236 case CCValAssign::BCvt:
3237 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3238 break;
3239 case CCValAssign::ZExt:
3240 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3241 DAG.getValueType(VA.getValVT()));
3242 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3243 break;
3244 case CCValAssign::SExt:
3245 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3246 DAG.getValueType(VA.getValVT()));
3247 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3248 break;
3249 case CCValAssign::AExt:
3250 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3251 break;
3252 default:
3253 llvm_unreachable("Unknown loc info!");
3254 }
3255
3256 InVals.push_back(Val);
3257 }
3258
3259 return Chain;
3260}
3261
3262 // Add code to pass the special inputs required by the features in use,
3263 // separate from the explicit user arguments present in the IR.
3265 CallLoweringInfo &CLI,
3266 CCState &CCInfo,
3267 const SIMachineFunctionInfo &Info,
3268 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3269 SmallVectorImpl<SDValue> &MemOpChains,
3270 SDValue Chain) const {
3271 // If we don't have a call site, this was a call inserted by
3272 // legalization. These can never use special inputs.
3273 if (!CLI.CB)
3274 return;
3275
3276 SelectionDAG &DAG = CLI.DAG;
3277 const SDLoc &DL = CLI.DL;
3278 const Function &F = DAG.getMachineFunction().getFunction();
3279
3280 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3281 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3282
3283 const AMDGPUFunctionArgInfo *CalleeArgInfo
3285 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3286 auto &ArgUsageInfo =
3288 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3289 }
3290
3291 // TODO: Unify with private memory register handling. This is complicated by
3292 // the fact that at least in kernels, the input argument is not necessarily
3293 // in the same location as the input.
3294 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3296 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3297 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3298 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3299 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3300 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3301 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3302 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3303 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3304 };
3305
3306 for (auto Attr : ImplicitAttrs) {
3307 const ArgDescriptor *OutgoingArg;
3308 const TargetRegisterClass *ArgRC;
3309 LLT ArgTy;
3310
3311 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3312
3313 // If the callee does not use the attribute value, skip copying the value.
3314 if (CLI.CB->hasFnAttr(Attr.second))
3315 continue;
3316
3317 std::tie(OutgoingArg, ArgRC, ArgTy) =
3318 CalleeArgInfo->getPreloadedValue(InputID);
3319 if (!OutgoingArg)
3320 continue;
3321
3322 const ArgDescriptor *IncomingArg;
3323 const TargetRegisterClass *IncomingArgRC;
3324 LLT Ty;
3325 std::tie(IncomingArg, IncomingArgRC, Ty) =
3326 CallerArgInfo.getPreloadedValue(InputID);
3327 assert(IncomingArgRC == ArgRC);
3328
3329 // All special arguments are ints for now.
3330 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3331 SDValue InputReg;
3332
3333 if (IncomingArg) {
3334 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3335 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3336 // The implicit arg ptr is special because it doesn't have a corresponding
3337 // input for kernels, and is computed from the kernarg segment pointer.
3338 InputReg = getImplicitArgPtr(DAG, DL);
3339 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3340 std::optional<uint32_t> Id =
3342 if (Id.has_value()) {
3343 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3344 } else {
3345 InputReg = DAG.getUNDEF(ArgVT);
3346 }
3347 } else {
3348 // We may have proven the input wasn't needed, although the ABI still
3349 // requires it. We just need to allocate the register appropriately.
3350 InputReg = DAG.getUNDEF(ArgVT);
3351 }
3352
3353 if (OutgoingArg->isRegister()) {
3354 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3355 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3356 report_fatal_error("failed to allocate implicit input argument");
3357 } else {
3358 unsigned SpecialArgOffset =
3359 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3360 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3361 SpecialArgOffset);
3362 MemOpChains.push_back(ArgStore);
3363 }
3364 }
3365
3366 // Pack workitem IDs into a single register, or pass them as-is if they are
3367 // already packed.
3368 const ArgDescriptor *OutgoingArg;
3369 const TargetRegisterClass *ArgRC;
3370 LLT Ty;
3371
3372 std::tie(OutgoingArg, ArgRC, Ty) =
3374 if (!OutgoingArg)
3375 std::tie(OutgoingArg, ArgRC, Ty) =
3377 if (!OutgoingArg)
3378 std::tie(OutgoingArg, ArgRC, Ty) =
3380 if (!OutgoingArg)
3381 return;
3382
3383 const ArgDescriptor *IncomingArgX = std::get<0>(
3385 const ArgDescriptor *IncomingArgY = std::get<0>(
3387 const ArgDescriptor *IncomingArgZ = std::get<0>(
3389
3390 SDValue InputReg;
3391 SDLoc SL;
3392
3393 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3394 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3395 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3396
3397 // If the incoming IDs are not packed, we need to pack them.
3398 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3399 NeedWorkItemIDX) {
3400 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3401 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3402 } else {
3403 InputReg = DAG.getConstant(0, DL, MVT::i32);
3404 }
3405 }
3406
3407 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3408 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3409 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3410 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3411 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3412 InputReg = InputReg.getNode() ?
3413 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3414 }
3415
3416 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3417 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3418 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3419 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3420 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3421 InputReg = InputReg.getNode() ?
3422 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3423 }
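// After the three blocks above, InputReg (when it is needed) holds the packed
// workitem IDs in the callee's expected layout: X | (Y << 10) | (Z << 20).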
3424
3425 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3426 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3427 // We're in a situation where the outgoing function requires the workitem
3428 // ID, but the calling function does not have it (e.g. a graphics function
3429 // calling a C calling convention function). This is illegal, but we need
3430 // to produce something.
3431 InputReg = DAG.getUNDEF(MVT::i32);
3432 } else {
3433 // Workitem IDs are already packed, so any of the present incoming arguments
3434 // will carry all required fields.
3436 IncomingArgX ? *IncomingArgX :
3437 IncomingArgY ? *IncomingArgY :
3438 *IncomingArgZ, ~0u);
3439 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3440 }
3441 }
3442
3443 if (OutgoingArg->isRegister()) {
3444 if (InputReg)
3445 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3446
3447 CCInfo.AllocateReg(OutgoingArg->getRegister());
3448 } else {
3449 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3450 if (InputReg) {
3451 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3452 SpecialArgOffset);
3453 MemOpChains.push_back(ArgStore);
3454 }
3455 }
3456}
3457
3459 return CC == CallingConv::Fast;
3460}
3461
3462/// Return true if we might ever do TCO for calls with this calling convention.
3464 switch (CC) {
3465 case CallingConv::C:
3467 return true;
3468 default:
3469 return canGuaranteeTCO(CC);
3470 }
3471}
3472
3474 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3476 const SmallVectorImpl<SDValue> &OutVals,
3477 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3478 if (AMDGPU::isChainCC(CalleeCC))
3479 return true;
3480
3481 if (!mayTailCallThisCC(CalleeCC))
3482 return false;
3483
3484 // For a divergent call target, we need to do a waterfall loop over the
3485 // possible callees which precludes us from using a simple jump.
3486 if (Callee->isDivergent())
3487 return false;
3488
3490 const Function &CallerF = MF.getFunction();
3491 CallingConv::ID CallerCC = CallerF.getCallingConv();
3493 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3494
3495 // Kernels aren't callable, and don't have a live-in return address, so it
3496 // doesn't make sense to do a tail call with entry functions.
3497 if (!CallerPreserved)
3498 return false;
3499
3500 bool CCMatch = CallerCC == CalleeCC;
3501
3503 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3504 return true;
3505 return false;
3506 }
3507
3508 // TODO: Can we handle var args?
3509 if (IsVarArg)
3510 return false;
3511
3512 for (const Argument &Arg : CallerF.args()) {
3513 if (Arg.hasByValAttr())
3514 return false;
3515 }
3516
3517 LLVMContext &Ctx = *DAG.getContext();
3518
3519 // Check that the call results are passed in the same way.
3520 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3521 CCAssignFnForCall(CalleeCC, IsVarArg),
3522 CCAssignFnForCall(CallerCC, IsVarArg)))
3523 return false;
3524
3525 // The callee has to preserve all registers the caller needs to preserve.
3526 if (!CCMatch) {
3527 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3528 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3529 return false;
3530 }
3531
3532 // Nothing more to check if the callee is taking no arguments.
3533 if (Outs.empty())
3534 return true;
3535
3537 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3538
3539 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3540
3541 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3542 // If the stack arguments for this call do not fit into our own save area then
3543 // the call cannot be made a tail call.
3544 // TODO: Is this really necessary?
3545 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3546 return false;
3547
3548 const MachineRegisterInfo &MRI = MF.getRegInfo();
3549 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3550}
3551
3553 if (!CI->isTailCall())
3554 return false;
3555
3556 const Function *ParentFn = CI->getParent()->getParent();
3558 return false;
3559 return true;
3560}
3561
3562// The wave scratch offset register is used as the global base pointer.
3564 SmallVectorImpl<SDValue> &InVals) const {
3565 CallingConv::ID CallConv = CLI.CallConv;
3566 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3567
3568 SelectionDAG &DAG = CLI.DAG;
3569
3570 TargetLowering::ArgListEntry RequestedExec;
3571 if (IsChainCallConv) {
3572 // The last argument should be the value that we need to put in EXEC.
3573 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3574 // don't treat it like the rest of the arguments.
3575 RequestedExec = CLI.Args.back();
3576 assert(RequestedExec.Node && "No node for EXEC");
3577
3578 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3579 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3580
3581 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3582 CLI.Outs.pop_back();
3583 CLI.OutVals.pop_back();
3584
3585 if (RequestedExec.Ty->isIntegerTy(64)) {
3586 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3587 CLI.Outs.pop_back();
3588 CLI.OutVals.pop_back();
3589 }
3590
3591 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3592 "Haven't popped all the pieces of the EXEC mask");
3593 }
3594
3595 const SDLoc &DL = CLI.DL;
3597 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3599 SDValue Chain = CLI.Chain;
3600 SDValue Callee = CLI.Callee;
3601 bool &IsTailCall = CLI.IsTailCall;
3602 bool IsVarArg = CLI.IsVarArg;
3603 bool IsSibCall = false;
3605
3606 if (Callee.isUndef() || isNullConstant(Callee)) {
3607 if (!CLI.IsTailCall) {
3608 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3609 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3610 }
3611
3612 return Chain;
3613 }
3614
3615 if (IsVarArg) {
3616 return lowerUnhandledCall(CLI, InVals,
3617 "unsupported call to variadic function ");
3618 }
3619
3620 if (!CLI.CB)
3621 report_fatal_error("unsupported libcall legalization");
3622
3623 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3624 return lowerUnhandledCall(CLI, InVals,
3625 "unsupported required tail call to function ");
3626 }
3627
3628 if (IsTailCall) {
3630 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3631 if (!IsTailCall &&
3632 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3633 report_fatal_error("failed to perform tail call elimination on a call "
3634 "site marked musttail or on llvm.amdgcn.cs.chain");
3635 }
3636
3637 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3638
3639 // A sibling call is one where we're under the usual C ABI and not planning
3640 // to change that but can still do a tail call:
3641 if (!TailCallOpt && IsTailCall)
3642 IsSibCall = true;
3643
3644 if (IsTailCall)
3645 ++NumTailCalls;
3646 }
3647
3650 SmallVector<SDValue, 8> MemOpChains;
3651
3652 // Analyze operands of the call, assigning locations to each operand.
3654 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3655 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3656
3657 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3658 // With a fixed ABI, allocate fixed registers before user arguments.
3659 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3660 }
3661
3662 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3663
3664 // Get a count of how many bytes are to be pushed on the stack.
3665 unsigned NumBytes = CCInfo.getStackSize();
3666
3667 if (IsSibCall) {
3668 // Since we're not changing the ABI to make this a tail call, the memory
3669 // operands are already available in the caller's incoming argument space.
3670 NumBytes = 0;
3671 }
3672
3673 // FPDiff is the byte offset of the call's argument area from the callee's.
3674 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3675 // by this amount for a tail call. In a sibling call it must be 0 because the
3676 // caller will deallocate the entire stack and the callee still expects its
3677 // arguments to begin at SP+0. Completely unused for non-tail calls.
3678 int32_t FPDiff = 0;
3679 MachineFrameInfo &MFI = MF.getFrameInfo();
3680
3681 // Adjust the stack pointer for the new arguments...
3682 // These operations are automatically eliminated by the prolog/epilog pass
3683 if (!IsSibCall)
3684 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3685
3686 if (!IsSibCall || IsChainCallConv) {
3687 if (!Subtarget->enableFlatScratch()) {
3688 SmallVector<SDValue, 4> CopyFromChains;
3689
3690 // In the HSA case, this should be an identity copy.
3691 SDValue ScratchRSrcReg
3692 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3693 RegsToPass.emplace_back(IsChainCallConv
3694 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3695 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3696 ScratchRSrcReg);
3697 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3698 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3699 }
3700 }
3701
3702 MVT PtrVT = MVT::i32;
3703
3704 // Walk the register/memloc assignments, inserting copies/loads.
3705 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3706 CCValAssign &VA = ArgLocs[i];
3707 SDValue Arg = OutVals[i];
3708
3709 // Promote the value if needed.
3710 switch (VA.getLocInfo()) {
3711 case CCValAssign::Full:
3712 break;
3713 case CCValAssign::BCvt:
3714 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3715 break;
3716 case CCValAssign::ZExt:
3717 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3718 break;
3719 case CCValAssign::SExt:
3720 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3721 break;
3722 case CCValAssign::AExt:
3723 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3724 break;
3725 case CCValAssign::FPExt:
3726 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3727 break;
3728 default:
3729 llvm_unreachable("Unknown loc info!");
3730 }
3731
3732 if (VA.isRegLoc()) {
3733 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3734 } else {
3735 assert(VA.isMemLoc());
3736
3737 SDValue DstAddr;
3738 MachinePointerInfo DstInfo;
3739
3740 unsigned LocMemOffset = VA.getLocMemOffset();
3741 int32_t Offset = LocMemOffset;
3742
3743 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3744 MaybeAlign Alignment;
3745
3746 if (IsTailCall) {
3747 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3748 unsigned OpSize = Flags.isByVal() ?
3749 Flags.getByValSize() : VA.getValVT().getStoreSize();
3750
3751 // FIXME: We can have better than the minimum byval required alignment.
3752 Alignment =
3753 Flags.isByVal()
3754 ? Flags.getNonZeroByValAlign()
3755 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3756
3757 Offset = Offset + FPDiff;
3758 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3759
3760 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3761 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3762
3763 // Make sure any stack arguments overlapping with where we're storing
3764 // are loaded before this eventual operation. Otherwise they'll be
3765 // clobbered.
3766
3767 // FIXME: Why is this really necessary? This seems to just result in a
3768 // lot of code to copy the stack and write them back to the same
3769 // locations, which are supposed to be immutable?
3770 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3771 } else {
3772 // Stores to the argument stack area are relative to the stack pointer.
3773 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3774 MVT::i32);
3775 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3776 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3777 Alignment =
3778 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3779 }
3780
3781 if (Outs[i].Flags.isByVal()) {
3782 SDValue SizeNode =
3783 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3784 SDValue Cpy =
3785 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3786 Outs[i].Flags.getNonZeroByValAlign(),
3787 /*isVol = */ false, /*AlwaysInline = */ true,
3788 /*isTailCall = */ false, DstInfo,
3789 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3790
3791 MemOpChains.push_back(Cpy);
3792 } else {
3793 SDValue Store =
3794 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3795 MemOpChains.push_back(Store);
3796 }
3797 }
3798 }
3799
3800 if (!MemOpChains.empty())
3801 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3802
3803 // Build a sequence of copy-to-reg nodes chained together with token chain
3804 // and flag operands which copy the outgoing args into the appropriate regs.
3805 SDValue InGlue;
3806 for (auto &RegToPass : RegsToPass) {
3807 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3808 RegToPass.second, InGlue);
3809 InGlue = Chain.getValue(1);
3810 }
3811
3812
3813 // We don't usually want to end the call-sequence here because we would tidy
3814 // the frame up *after* the call. However, in the ABI-changing tail-call case
3815 // we've carefully laid out the parameters so that when sp is reset they'll be
3816 // in the correct location.
3817 if (IsTailCall && !IsSibCall) {
3818 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3819 InGlue = Chain.getValue(1);
3820 }
3821
3822 std::vector<SDValue> Ops;
3823 Ops.push_back(Chain);
3824 Ops.push_back(Callee);
3825 // Add a redundant copy of the callee global which will not be legalized, as
3826 // we need direct access to the callee later.
3827 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3828 const GlobalValue *GV = GSD->getGlobal();
3829 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3830 } else {
3831 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3832 }
3833
3834 if (!IsTailCall)
3835 Ops.push_back(CLI.ConvergenceControlToken);
3836
3837 if (IsTailCall) {
3838 // Each tail call may have to adjust the stack by a different amount, so
3839 // this information must travel along with the operation for eventual
3840 // consumption by emitEpilogue.
3841 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3842 }
3843
3844 if (IsChainCallConv)
3845 Ops.push_back(RequestedExec.Node);
3846
3847 // Add argument registers to the end of the list so that they are known live
3848 // into the call.
3849 for (auto &RegToPass : RegsToPass) {
3850 Ops.push_back(DAG.getRegister(RegToPass.first,
3851 RegToPass.second.getValueType()));
3852 }
3853
3854 // Add a register mask operand representing the call-preserved registers.
3855 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3856 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3857 assert(Mask && "Missing call preserved mask for calling convention");
3858 Ops.push_back(DAG.getRegisterMask(Mask));
3859
3860 if (InGlue.getNode())
3861 Ops.push_back(InGlue);
3862
3863 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3864
3865 // If we're doing a tail call, use a TC_RETURN here rather than an
3866 // actual call instruction.
3867 if (IsTailCall) {
3868 MFI.setHasTailCall();
3869 unsigned OPC = AMDGPUISD::TC_RETURN;
3870 switch (CallConv) {
3873 break;
3877 break;
3878 }
3879
3880 return DAG.getNode(OPC, DL, NodeTys, Ops);
3881 }
3882
3883 // Returns a chain and a flag for retval copy to use.
3884 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3885 Chain = Call.getValue(0);
3886 InGlue = Call.getValue(1);
3887
3888 uint64_t CalleePopBytes = NumBytes;
3889 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3890 if (!Ins.empty())
3891 InGlue = Chain.getValue(1);
3892
3893 // Handle result values, copying them out of physregs into vregs that we
3894 // return.
3895 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3896 InVals, /*IsThisReturn=*/false, SDValue());
3897}
3898
3899// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3900// except for applying the wave size scale to the increment amount.
3901SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3902 SDValue Op, SelectionDAG &DAG) const {
3903 const MachineFunction &MF = DAG.getMachineFunction();
3904 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3905
3906 SDLoc dl(Op);
3907 EVT VT = Op.getValueType();
3908 SDValue Tmp1 = Op;
3909 SDValue Tmp2 = Op.getValue(1);
3910 SDValue Tmp3 = Op.getOperand(2);
3911 SDValue Chain = Tmp1.getOperand(0);
3912
3913 Register SPReg = Info->getStackPtrOffsetReg();
3914
3915 // Chain the dynamic stack allocation so that it doesn't modify the stack
3916 // pointer when other instructions are using the stack.
3917 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3918
3919 SDValue Size = Tmp2.getOperand(1);
3920 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3921 Chain = SP.getValue(1);
3922 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3923 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3924 unsigned Opc =
3925 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
3926 ? ISD::ADD
3927 : ISD::SUB;
3928 SDValue ScaledSize = DAG.getNode(
3929 ISD::SHL, dl, VT, Size,
3930 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
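  // E.g. with a 64-wide wave (wavefront size log2 == 6), a 16 byte per-lane
  // allocation scales to a 16 << 6 = 1024 byte increment of the wave's scratch
  // stack pointer, one copy of the object per lane.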
3931
3932 Align StackAlign = TFL->getStackAlign();
3933 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3934 if (Alignment && *Alignment > StackAlign) {
3935 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3936 DAG.getConstant(-(uint64_t)Alignment->value()
3937 << Subtarget->getWavefrontSizeLog2(),
3938 dl, VT));
3939 }
3940
3941 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3942 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3943
3944 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3945}
3946
3947SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3948 SelectionDAG &DAG) const {
3949 // We only handle constant sizes here to allow non-entry block, static sized
3950 // allocas. A truly dynamic value is more difficult to support because we
3951 // don't know if the size value is uniform or not. If the size isn't uniform,
3952 // we would need to do a wave reduction to get the maximum size to know how
3953 // much to increment the uniform stack pointer.
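  // E.g. a divergent size would need something like a wave-wide umax reduction
  // of the per-lane sizes before the scaling in lowerDYNAMIC_STACKALLOCImpl,
  // which is not implemented here.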
3954 SDValue Size = Op.getOperand(1);
3955 if (isa<ConstantSDNode>(Size))
3956 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3957
3958 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3959}
3960
3961SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
3962 if (Op.getValueType() != MVT::i32)
3963 return Op; // Defer to cannot select error.
3964
3966 SDLoc SL(Op);
3967
3968 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3969
3970 // Convert from wave uniform to swizzled vector address. This should protect
3971 // from any edge cases where the stacksave result isn't directly used with
3972 // stackrestore.
3973 SDValue VectorAddress =
3974 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3975 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3976}
3977
3978SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
3979 SelectionDAG &DAG) const {
3980 SDLoc SL(Op);
3981 assert(Op.getValueType() == MVT::i32);
3982
3983 uint32_t BothRoundHwReg =
3984 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
3985 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3986
3987 SDValue IntrinID =
3988 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
3989 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
3990 Op.getOperand(0), IntrinID, GetRoundBothImm);
3991
3992 // There are two rounding modes, one for f32 and one for f64/f16. We only
3993 // report in the standard value range if both are the same.
3994 //
3995 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
3996 // ties away from zero is not supported, and the other values are rotated by
3997 // 1.
3998 //
3999 // If the two rounding modes are not the same, report a target defined value.
4000
4001 // Mode register rounding mode fields:
4002 //
4003 // [1:0] Single-precision round mode.
4004 // [3:2] Double/Half-precision round mode.
4005 //
4006 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4007 //
4008 //                    Hardware   Spec
4009 //  Toward-0              3       0
4010 //  Nearest Even          0       1
4011 //  +Inf                  1       2
4012 //  -Inf                  2       3
4013 //  NearestAway0         N/A      4
4014 //
4015 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4016 // table we can index by the raw hardware mode.
4017 //
4018 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
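  //
  // Worked example: if both fields are round-to-nearest-even, MODE.fp_round is
  // 0b0000, the shift amount below is 0 * 4 = 0, and the low nibble of the
  // table must hold 1, the FLT_ROUNDS value for nearest-even in the table
  // above.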
4019
4020 SDValue BitTable =
4021 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4022
4023 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4024 SDValue RoundModeTimesNumBits =
4025 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4026
4027 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4028 // knew only one mode was demanded.
4029 SDValue TableValue =
4030 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4031 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4032
4033 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4034 SDValue TableEntry =
4035 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4036
4037 // There's a gap between the 4-bit encoded table values and the actual enum
4038 // values, so offset the result if it's an extended value.
4039 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4040 SDValue IsStandardValue =
4041 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4042 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4043 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4044 TableEntry, EnumOffset);
4045
4046 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4047}
4048
4049SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4050 if (Op->isDivergent())
4051 return SDValue();
4052
4053 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4058 break;
4059 default:
4060 return SDValue();
4061 }
4062
4063 return Op;
4064}
4065
4066// Work around DAG legality rules only based on the result type.
4067SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4068 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4069 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4070 EVT SrcVT = Src.getValueType();
4071
4072 if (SrcVT.getScalarType() != MVT::bf16)
4073 return Op;
4074
4075 SDLoc SL(Op);
4076 SDValue BitCast =
4077 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4078
4079 EVT DstVT = Op.getValueType();
4080 if (IsStrict)
4081 llvm_unreachable("Need STRICT_BF16_TO_FP");
4082
4083 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4084}
4085
4086SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4087 SDLoc SL(Op);
4088 if (Op.getValueType() != MVT::i64)
4089 return Op;
4090
4091 uint32_t ModeHwReg =
4093 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4094 uint32_t TrapHwReg =
4096 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4097
4098 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4099 SDValue IntrinID =
4100 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4101 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4102 Op.getOperand(0), IntrinID, ModeHwRegImm);
4103 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4104 Op.getOperand(0), IntrinID, TrapHwRegImm);
4105 SDValue TokenReg =
4106 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4107 GetTrapReg.getValue(1));
4108
4109 SDValue CvtPtr =
4110 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4111 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4112
4113 return DAG.getMergeValues({Result, TokenReg}, SL);
4114}
4115
4116SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4117 SDLoc SL(Op);
4118 if (Op.getOperand(1).getValueType() != MVT::i64)
4119 return Op;
4120
4121 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4122 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4123 DAG.getConstant(0, SL, MVT::i32));
4124 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4125 DAG.getConstant(1, SL, MVT::i32));
4126
4127 SDValue ReadFirstLaneID =
4128 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4129 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4130 ReadFirstLaneID, NewModeReg);
4131 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4132 ReadFirstLaneID, NewTrapReg);
4133
4134 unsigned ModeHwReg =
4136 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4137 unsigned TrapHwReg =
4139 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4140
4141 SDValue IntrinID =
4142 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4143 SDValue SetModeReg =
4144 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4145 IntrinID, ModeHwRegImm, NewModeReg);
4146 SDValue SetTrapReg =
4147 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4148 IntrinID, TrapHwRegImm, NewTrapReg);
4149 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4150}
4151
4152Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4153 const MachineFunction &MF) const {
4154 Register Reg = StringSwitch<Register>(RegName)
4155 .Case("m0", AMDGPU::M0)
4156 .Case("exec", AMDGPU::EXEC)
4157 .Case("exec_lo", AMDGPU::EXEC_LO)
4158 .Case("exec_hi", AMDGPU::EXEC_HI)
4159 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4160 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4161 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4162 .Default(Register());
4163
4164 if (Reg == AMDGPU::NoRegister) {
4165 report_fatal_error(Twine("invalid register name \""
4166 + StringRef(RegName) + "\"."));
4167
4168 }
4169
4170 if (!Subtarget->hasFlatScrRegister() &&
4171 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4172 report_fatal_error(Twine("invalid register \""
4173 + StringRef(RegName) + "\" for subtarget."));
4174 }
4175
4176 switch (Reg) {
4177 case AMDGPU::M0:
4178 case AMDGPU::EXEC_LO:
4179 case AMDGPU::EXEC_HI:
4180 case AMDGPU::FLAT_SCR_LO:
4181 case AMDGPU::FLAT_SCR_HI:
4182 if (VT.getSizeInBits() == 32)
4183 return Reg;
4184 break;
4185 case AMDGPU::EXEC:
4186 case AMDGPU::FLAT_SCR:
4187 if (VT.getSizeInBits() == 64)
4188 return Reg;
4189 break;
4190 default:
4191 llvm_unreachable("missing register type checking");
4192 }
4193
4194 report_fatal_error(Twine("invalid type for register \""
4195 + StringRef(RegName) + "\"."));
4196}
4197
4198// If kill is not the last instruction, split the block so kill is always a
4199// proper terminator.
4200MachineBasicBlock *
4201SITargetLowering::splitKillBlock(MachineInstr &MI,
4202 MachineBasicBlock *BB) const {
4203 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4204 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4205 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4206 return SplitBB;
4207}
4208
4209// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4210// \p MI will be the only instruction in the loop body block. Otherwise, it will
4211// be the first instruction in the remainder block.
4212//
4213/// \returns { LoopBody, Remainder }
4214static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4215splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4216 MachineFunction *MF = MBB.getParent();
4217 MachineBasicBlock::iterator I(&MI);
4218
4219 // To insert the loop we need to split the block. Move everything after this
4220 // point to a new block, and insert a new empty block between the two.
4221 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4222 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4223 MachineFunction::iterator MBBI(MBB);
4224 ++MBBI;
4225
4226 MF->insert(MBBI, LoopBB);
4227 MF->insert(MBBI, RemainderBB);
4228
4229 LoopBB->addSuccessor(LoopBB);
4230 LoopBB->addSuccessor(RemainderBB);
4231
4232 // Move the rest of the block into a new block.
4233 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4234
4235 if (InstInLoop) {
4236 auto Next = std::next(I);
4237
4238 // Move instruction to loop body.
4239 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4240
4241 // Move the rest of the block.
4242 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4243 } else {
4244 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4245 }
4246
4247 MBB.addSuccessor(LoopBB);
4248
4249 return std::pair(LoopBB, RemainderBB);
4250}
4251
4252/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4253void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4254 MachineBasicBlock *MBB = MI.getParent();
4255 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4256 auto I = MI.getIterator();
4257 auto E = std::next(I);
4258
4259 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4260 .addImm(0);
4261
4262 MIBundleBuilder Bundler(*MBB, I, E);
4263 finalizeBundle(*MBB, Bundler.begin());
4264}
4265
4266MachineBasicBlock *
4267SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4268 MachineBasicBlock *BB) const {
4269 const DebugLoc &DL = MI.getDebugLoc();
4270
4271 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4272
4273 MachineBasicBlock *LoopBB;
4274 MachineBasicBlock *RemainderBB;
4276
4277 // Apparently kill flags are only valid if the def is in the same block?
4278 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4279 Src->setIsKill(false);
4280
4281 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4282
4283 MachineBasicBlock::iterator I = LoopBB->end();
4284
4285 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4287
4288 // Clear TRAP_STS.MEM_VIOL
4289 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4290 .addImm(0)
4291 .addImm(EncodedReg);
4292
4294
4295 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4296
4297 // Load and check TRAP_STS.MEM_VIOL
4298 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4299 .addImm(EncodedReg);
4300
4301 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4302 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4303 .addReg(Reg, RegState::Kill)
4304 .addImm(0);
4305 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4306 .addMBB(LoopBB);
4307
4308 return RemainderBB;
4309}
4310
4311// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4312// wavefront. If the value is uniform and just happens to be in a VGPR, this
4313// will only do one iteration. In the worst case, this will loop 64 times.
4314//
4315// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
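// For example, if IdxReg holds three distinct values across the wave, the loop
// runs three times: each iteration reads one value with V_READFIRSTLANE_B32,
// performs the indirect move for exactly the lanes that match it, and clears
// those lanes from EXEC.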
4316static MachineBasicBlock::iterator
4317emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4318 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4319 const DebugLoc &DL, const MachineOperand &Idx,
4320 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4321 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4322 Register &SGPRIdxReg) {
4323
4324 MachineFunction *MF = OrigBB.getParent();
4325 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4326 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4328
4329 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4330 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4331 Register NewExec = MRI.createVirtualRegister(BoolRC);
4332 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4333 Register CondReg = MRI.createVirtualRegister(BoolRC);
4334
4335 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4336 .addReg(InitReg)
4337 .addMBB(&OrigBB)
4338 .addReg(ResultReg)
4339 .addMBB(&LoopBB);
4340
4341 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4342 .addReg(InitSaveExecReg)
4343 .addMBB(&OrigBB)
4344 .addReg(NewExec)
4345 .addMBB(&LoopBB);
4346
4347 // Read the next variant <- also loop target.
4348 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4349 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4350
4351 // Compare the just read M0 value to all possible Idx values.
4352 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4353 .addReg(CurrentIdxReg)
4354 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4355
4356 // Update EXEC, save the original EXEC value to VCC.
4357 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4358 : AMDGPU::S_AND_SAVEEXEC_B64),
4359 NewExec)
4360 .addReg(CondReg, RegState::Kill);
4361
4362 MRI.setSimpleHint(NewExec, CondReg);
4363
4364 if (UseGPRIdxMode) {
4365 if (Offset == 0) {
4366 SGPRIdxReg = CurrentIdxReg;
4367 } else {
4368 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4369 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4370 .addReg(CurrentIdxReg, RegState::Kill)
4371 .addImm(Offset);
4372 }
4373 } else {
4374 // Move index from VCC into M0
4375 if (Offset == 0) {
4376 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4377 .addReg(CurrentIdxReg, RegState::Kill);
4378 } else {
4379 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4380 .addReg(CurrentIdxReg, RegState::Kill)
4381 .addImm(Offset);
4382 }
4383 }
4384
4385 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4386 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4387 MachineInstr *InsertPt =
4388 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4389 : AMDGPU::S_XOR_B64_term), Exec)
4390 .addReg(Exec)
4391 .addReg(NewExec);
4392
4393 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4394 // s_cbranch_scc0?
4395
4396 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4397 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4398 .addMBB(&LoopBB);
4399
4400 return InsertPt->getIterator();
4401}
4402
4403// This has slightly sub-optimal regalloc when the source vector is killed by
4404// the read. The register allocator does not understand that the kill is
4405// per-workitem, so the source is kept alive for the whole loop and we end up
4406// not re-using a subregister from it, using 1 more VGPR than necessary. This
4407// extra register was avoided when this was expanded after register allocation.
4408static MachineBasicBlock::iterator
4409loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4410 unsigned InitResultReg, unsigned PhiReg, int Offset,
4411 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4412 MachineFunction *MF = MBB.getParent();
4413 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4414 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4415 MachineRegisterInfo &MRI = MF->getRegInfo();
4416 const DebugLoc &DL = MI.getDebugLoc();
4417 MachineBasicBlock::iterator I(&MI);
4418
4419 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4420 Register DstReg = MI.getOperand(0).getReg();
4421 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4422 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4423 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4424 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4425
4426 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4427
4428 // Save the EXEC mask
4429 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4430 .addReg(Exec);
4431
4432 MachineBasicBlock *LoopBB;
4433 MachineBasicBlock *RemainderBB;
4434 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4435
4436 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4437
4438 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4439 InitResultReg, DstReg, PhiReg, TmpExec,
4440 Offset, UseGPRIdxMode, SGPRIdxReg);
4441
4442 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4444 ++MBBI;
4445 MF->insert(MBBI, LandingPad);
4446 LoopBB->removeSuccessor(RemainderBB);
4447 LandingPad->addSuccessor(RemainderBB);
4448 LoopBB->addSuccessor(LandingPad);
4449 MachineBasicBlock::iterator First = LandingPad->begin();
4450 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4451 .addReg(SaveExec);
4452
4453 return InsPt;
4454}
4455
4456// Returns subreg index, offset
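// E.g. for a 128-bit (4 x 32-bit) vector register, a constant offset of 2
// yields {sub2, 0}, while an out-of-bounds offset such as 5 is left as
// {sub0, 5}.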
4457static std::pair<unsigned, int>
4459 const TargetRegisterClass *SuperRC,
4460 unsigned VecReg,
4461 int Offset) {
4462 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4463
4464 // Skip out of bounds offsets, or else we would end up using an undefined
4465 // register.
4466 if (Offset >= NumElts || Offset < 0)
4467 return std::pair(AMDGPU::sub0, Offset);
4468
4469 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4470}
4471
4474 int Offset) {
4475 MachineBasicBlock *MBB = MI.getParent();
4476 const DebugLoc &DL = MI.getDebugLoc();
4478
4479 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4480
4481 assert(Idx->getReg() != AMDGPU::NoRegister);
4482
4483 if (Offset == 0) {
4484 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4485 } else {
4486 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4487 .add(*Idx)
4488 .addImm(Offset);
4489 }
4490}
4491
4494 int Offset) {
4495 MachineBasicBlock *MBB = MI.getParent();
4496 const DebugLoc &DL = MI.getDebugLoc();
4498
4499 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4500
4501 if (Offset == 0)
4502 return Idx->getReg();
4503
4504 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4505 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4506 .add(*Idx)
4507 .addImm(Offset);
4508 return Tmp;
4509}
4510
4511static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4512 MachineBasicBlock &MBB,
4513 const GCNSubtarget &ST) {
4514 const SIInstrInfo *TII = ST.getInstrInfo();
4515 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4516 MachineFunction *MF = MBB.getParent();
4517 MachineRegisterInfo &MRI = MF->getRegInfo();
4518
4519 Register Dst = MI.getOperand(0).getReg();
4520 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4521 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4522 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4523
4524 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4525 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4526
4527 unsigned SubReg;
4528 std::tie(SubReg, Offset)
4529 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4530
4531 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4532
4533 // Check for a SGPR index.
4534 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4536 const DebugLoc &DL = MI.getDebugLoc();
4537
4538 if (UseGPRIdxMode) {
4539 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4540 // to avoid interfering with other uses, so probably requires a new
4541 // optimization pass.
4543
4544 const MCInstrDesc &GPRIDXDesc =
4545 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4546 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4547 .addReg(SrcReg)
4548 .addReg(Idx)
4549 .addImm(SubReg);
4550 } else {
4552
4553 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4554 .addReg(SrcReg, 0, SubReg)
4555 .addReg(SrcReg, RegState::Implicit);
4556 }
4557
4558 MI.eraseFromParent();
4559
4560 return &MBB;
4561 }
4562
4563 // Control flow needs to be inserted if indexing with a VGPR.
4564 const DebugLoc &DL = MI.getDebugLoc();
4566
4567 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4568 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4569
4570 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4571
4572 Register SGPRIdxReg;
4573 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4574 UseGPRIdxMode, SGPRIdxReg);
4575
4576 MachineBasicBlock *LoopBB = InsPt->getParent();
4577
4578 if (UseGPRIdxMode) {
4579 const MCInstrDesc &GPRIDXDesc =
4580 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4581
4582 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4583 .addReg(SrcReg)
4584 .addReg(SGPRIdxReg)
4585 .addImm(SubReg);
4586 } else {
4587 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4588 .addReg(SrcReg, 0, SubReg)
4589 .addReg(SrcReg, RegState::Implicit);
4590 }
4591
4592 MI.eraseFromParent();
4593
4594 return LoopBB;
4595}
4596
4597static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4598 MachineBasicBlock &MBB,
4599 const GCNSubtarget &ST) {
4600 const SIInstrInfo *TII = ST.getInstrInfo();
4601 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4602 MachineFunction *MF = MBB.getParent();
4603 MachineRegisterInfo &MRI = MF->getRegInfo();
4604
4605 Register Dst = MI.getOperand(0).getReg();
4606 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4607 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4608 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4609 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4610 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4611 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4612
4613 // This can be an immediate, but will be folded later.
4614 assert(Val->getReg());
4615
4616 unsigned SubReg;
4617 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4618 SrcVec->getReg(),
4619 Offset);
4620 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4621
4622 if (Idx->getReg() == AMDGPU::NoRegister) {
4624 const DebugLoc &DL = MI.getDebugLoc();
4625
4626 assert(Offset == 0);
4627
4628 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4629 .add(*SrcVec)
4630 .add(*Val)
4631 .addImm(SubReg);
4632
4633 MI.eraseFromParent();
4634 return &MBB;
4635 }
4636
4637 // Check for a SGPR index.
4638 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4640 const DebugLoc &DL = MI.getDebugLoc();
4641
4642 if (UseGPRIdxMode) {
4644
4645 const MCInstrDesc &GPRIDXDesc =
4646 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4647 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4648 .addReg(SrcVec->getReg())
4649 .add(*Val)
4650 .addReg(Idx)
4651 .addImm(SubReg);
4652 } else {
4654
4655 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4656 TRI.getRegSizeInBits(*VecRC), 32, false);
4657 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4658 .addReg(SrcVec->getReg())
4659 .add(*Val)
4660 .addImm(SubReg);
4661 }
4662 MI.eraseFromParent();
4663 return &MBB;
4664 }
4665
4666 // Control flow needs to be inserted if indexing with a VGPR.
4667 if (Val->isReg())
4668 MRI.clearKillFlags(Val->getReg());
4669
4670 const DebugLoc &DL = MI.getDebugLoc();
4671
4672 Register PhiReg = MRI.createVirtualRegister(VecRC);
4673
4674 Register SGPRIdxReg;
4675 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4676 UseGPRIdxMode, SGPRIdxReg);
4677 MachineBasicBlock *LoopBB = InsPt->getParent();
4678
4679 if (UseGPRIdxMode) {
4680 const MCInstrDesc &GPRIDXDesc =
4681 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4682
4683 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4684 .addReg(PhiReg)
4685 .add(*Val)
4686 .addReg(SGPRIdxReg)
4687 .addImm(AMDGPU::sub0);
4688 } else {
4689 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4690 TRI.getRegSizeInBits(*VecRC), 32, false);
4691 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4692 .addReg(PhiReg)
4693 .add(*Val)
4694 .addImm(AMDGPU::sub0);
4695 }
4696
4697 MI.eraseFromParent();
4698 return LoopBB;
4699}
4700
4701static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4702 MachineBasicBlock &BB,
4703 const GCNSubtarget &ST,
4704 unsigned Opc) {
4705 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4706 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4707 const DebugLoc &DL = MI.getDebugLoc();
4708 const SIInstrInfo *TII = ST.getInstrInfo();
4709
4710 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4711 Register SrcReg = MI.getOperand(1).getReg();
4712 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4713 Register DstReg = MI.getOperand(0).getReg();
4714 MachineBasicBlock *RetBB = nullptr;
4715 if (isSGPR) {
4716 // These operations are idempotent for a uniform (SGPR) input: the reduced
4717 // value is the same as the given SGPR.
4718 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4719 RetBB = &BB;
4720 } else {
4721 // TODO: Implement the DPP strategy and switch based on the immediate
4722 // strategy operand. For now, for all the cases (default, Iterative and DPP)
4723 // we use the iterative approach by default.
4724
4725 // To reduce the VGPR using the iterative approach, we need to iterate over
4726 // all the active lanes. Lowering consists of a ComputeLoop block, which
4727 // iterates over only the active lanes. We use a copy of the EXEC register
4728 // as the induction variable, and every active lane clears its bit with
4729 // bitset0 so that we get the next active lane for the next iteration.
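 // For example, with EXEC = 0b1011 on wave32 the loop runs three times:
 // s_ff1 picks lanes 0, 1 and 3 in turn, v_readlane_b32 feeds each lane's
 // value into the accumulator, and s_bitset0 clears that lane from the
 // induction variable until it reaches zero.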
4730 MachineBasicBlock::iterator I = BB.end();
4731 Register SrcReg = MI.getOperand(1).getReg();
4732
4733 // Create Control flow for loop
4734 // Split MI's Machine Basic block into For loop
4735 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4736
4737 // Create virtual registers required for lowering.
4738 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4739 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4740 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4741 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4742
4743 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4744 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4745 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4746
4747 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4748 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4749
4750 bool IsWave32 = ST.isWave32();
4751 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4752 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4753
4754 // Create initial values of the induction variable from Exec and the
4755 // Accumulator, and insert a branch to the newly created ComputeLoop block.
4756 uint32_t InitalValue =
4757 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4758 auto TmpSReg =
4759 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4760 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4761 .addImm(InitalValue);
4762 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4763
4764 // Start constructing ComputeLoop
4765 I = ComputeLoop->end();
4766 auto Accumulator =
4767 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4768 .addReg(InitalValReg)
4769 .addMBB(&BB);
4770 auto ActiveBits =
4771 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4772 .addReg(TmpSReg->getOperand(0).getReg())
4773 .addMBB(&BB);
4774
4775 // Perform the computations
4776 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4777 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4778 .addReg(ActiveBits->getOperand(0).getReg());
4779 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4780 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4781 .addReg(SrcReg)
4782 .addReg(FF1->getOperand(0).getReg());
4783 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4784 .addReg(Accumulator->getOperand(0).getReg())
4785 .addReg(LaneValue->getOperand(0).getReg());
4786
4787 // Manipulate the iterator to get the next active lane
4788 unsigned BITSETOpc =
4789 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4790 auto NewActiveBits =
4791 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4792 .addReg(FF1->getOperand(0).getReg())
4793 .addReg(ActiveBits->getOperand(0).getReg());
4794
4795 // Add phi nodes
4796 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4797 .addMBB(ComputeLoop);
4798 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4799 .addMBB(ComputeLoop);
4800
4801 // Creating branching
4802 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4803 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4804 .addReg(NewActiveBits->getOperand(0).getReg())
4805 .addImm(0);
4806 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4807 .addMBB(ComputeLoop);
4808
4809 RetBB = ComputeEnd;
4810 }
4811 MI.eraseFromParent();
4812 return RetBB;
4813}
4814
4815MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
4816 MachineInstr &MI, MachineBasicBlock *BB) const {
4817
4818 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4819 MachineFunction *MF = BB->getParent();
4820 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4821
4822 switch (MI.getOpcode()) {
4823 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4824 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4825 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4826 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4827 case AMDGPU::S_UADDO_PSEUDO:
4828 case AMDGPU::S_USUBO_PSEUDO: {
4829 const DebugLoc &DL = MI.getDebugLoc();
4830 MachineOperand &Dest0 = MI.getOperand(0);
4831 MachineOperand &Dest1 = MI.getOperand(1);
4832 MachineOperand &Src0 = MI.getOperand(2);
4833 MachineOperand &Src1 = MI.getOperand(3);
4834
4835 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4836 ? AMDGPU::S_ADD_I32
4837 : AMDGPU::S_SUB_I32;
4838 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4839
4840 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4841 .addImm(1)
4842 .addImm(0);
4843
4844 MI.eraseFromParent();
4845 return BB;
4846 }
4847 case AMDGPU::S_ADD_U64_PSEUDO:
4848 case AMDGPU::S_SUB_U64_PSEUDO: {
4849 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4850 // For GFX12, we emit s_add_u64 and s_sub_u64.
4851 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4853 const DebugLoc &DL = MI.getDebugLoc();
4854 MachineOperand &Dest = MI.getOperand(0);
4855 MachineOperand &Src0 = MI.getOperand(1);
4856 MachineOperand &Src1 = MI.getOperand(2);
4857 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4858 if (Subtarget->hasScalarAddSub64()) {
4859 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4860 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4861 .addReg(Src0.getReg())
4862 .addReg(Src1.getReg());
4863 } else {
4864 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4865 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4866
4867 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4868 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4869
4870 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4871 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4872 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4873 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4874
4875 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4876 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4877 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4878 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4879
4880 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4881 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4882 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4883 .add(Src0Sub0)
4884 .add(Src1Sub0);
4885 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4886 .add(Src0Sub1)
4887 .add(Src1Sub1);
4888 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4889 .addReg(DestSub0)
4890 .addImm(AMDGPU::sub0)
4891 .addReg(DestSub1)
4892 .addImm(AMDGPU::sub1);
4893 }
4894 MI.eraseFromParent();
4895 return BB;
4896 }
4897 case AMDGPU::V_ADD_U64_PSEUDO:
4898 case AMDGPU::V_SUB_U64_PSEUDO: {
4900 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4901 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4902 const DebugLoc &DL = MI.getDebugLoc();
4903
4904 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4905
4906 MachineOperand &Dest = MI.getOperand(0);
4907 MachineOperand &Src0 = MI.getOperand(1);
4908 MachineOperand &Src1 = MI.getOperand(2);
4909
4910 if (IsAdd && ST.hasLshlAddB64()) {
4911 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4912 Dest.getReg())
4913 .add(Src0)
4914 .addImm(0)
4915 .add(Src1);
4916 TII->legalizeOperands(*Add);
4917 MI.eraseFromParent();
4918 return BB;
4919 }
4920
4921 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4922
4923 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4924 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4925
4926 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4927 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4928
4929 const TargetRegisterClass *Src0RC = Src0.isReg()
4930 ? MRI.getRegClass(Src0.getReg())
4931 : &AMDGPU::VReg_64RegClass;
4932 const TargetRegisterClass *Src1RC = Src1.isReg()
4933 ? MRI.getRegClass(Src1.getReg())
4934 : &AMDGPU::VReg_64RegClass;
4935
4936 const TargetRegisterClass *Src0SubRC =
4937 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4938 const TargetRegisterClass *Src1SubRC =
4939 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4940
4941 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4942 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4943 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4944 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4945
4946 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4947 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4948 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4949 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4950
4951 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4952 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4953 .addReg(CarryReg, RegState::Define)
4954 .add(SrcReg0Sub0)
4955 .add(SrcReg1Sub0)
4956 .addImm(0); // clamp bit
4957
4958 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4959 MachineInstr *HiHalf =
4960 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4961 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4962 .add(SrcReg0Sub1)
4963 .add(SrcReg1Sub1)
4964 .addReg(CarryReg, RegState::Kill)
4965 .addImm(0); // clamp bit
4966
4967 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4968 .addReg(DestSub0)
4969 .addImm(AMDGPU::sub0)
4970 .addReg(DestSub1)
4971 .addImm(AMDGPU::sub1);
4972 TII->legalizeOperands(*LoHalf);
4973 TII->legalizeOperands(*HiHalf);
4974 MI.eraseFromParent();
4975 return BB;
4976 }
4977 case AMDGPU::S_ADD_CO_PSEUDO:
4978 case AMDGPU::S_SUB_CO_PSEUDO: {
4979 // This pseudo has a chance to be selected
4980 // only from a uniform add/subcarry node. All the VGPR operands
4981 // are therefore assumed to be splat vectors.
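 // A wave-uniform value that happens to live in a VGPR is therefore moved to
 // an SGPR with V_READFIRSTLANE_B32 below before the scalar carry operations.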
4983 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4984 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4986 const DebugLoc &DL = MI.getDebugLoc();
4987 MachineOperand &Dest = MI.getOperand(0);
4988 MachineOperand &CarryDest = MI.getOperand(1);
4989 MachineOperand &Src0 = MI.getOperand(2);
4990 MachineOperand &Src1 = MI.getOperand(3);
4991 MachineOperand &Src2 = MI.getOperand(4);
4992 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4993 ? AMDGPU::S_ADDC_U32
4994 : AMDGPU::S_SUBB_U32;
4995 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4996 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4997 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4998 .addReg(Src0.getReg());
4999 Src0.setReg(RegOp0);
5000 }
5001 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5002 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5003 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5004 .addReg(Src1.getReg());
5005 Src1.setReg(RegOp1);
5006 }
5007 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5008 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5009 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5010 .addReg(Src2.getReg());
5011 Src2.setReg(RegOp2);
5012 }
5013
5014 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5015 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5016 assert(WaveSize == 64 || WaveSize == 32);
5017
5018 if (WaveSize == 64) {
5019 if (ST.hasScalarCompareEq64()) {
5020 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5021 .addReg(Src2.getReg())
5022 .addImm(0);
5023 } else {
5024 const TargetRegisterClass *SubRC =
5025 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5026 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5027 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5028 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5029 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5030 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5031
5032 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5033 .add(Src2Sub0)
5034 .add(Src2Sub1);
5035
5036 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5037 .addReg(Src2_32, RegState::Kill)
5038 .addImm(0);
5039 }
5040 } else {
5041 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5042 .addReg(Src2.getReg())
5043 .addImm(0);
5044 }
5045
5046 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5047
5048 unsigned SelOpc =
5049 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5050
5051 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5052 .addImm(-1)
5053 .addImm(0);
5054
5055 MI.eraseFromParent();
5056 return BB;
5057 }
5058 case AMDGPU::SI_INIT_M0: {
5059 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5060 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5061 .add(MI.getOperand(0));
5062 MI.eraseFromParent();
5063 return BB;
5064 }
5065 case AMDGPU::GET_GROUPSTATICSIZE: {
5066 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5067 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5068 DebugLoc DL = MI.getDebugLoc();
5069 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5070 .add(MI.getOperand(0))
5071 .addImm(MFI->getLDSSize());
5072 MI.eraseFromParent();
5073 return BB;
5074 }
5075 case AMDGPU::GET_SHADERCYCLESHILO: {
5078 const DebugLoc &DL = MI.getDebugLoc();
5079 // The algorithm is:
5080 //
5081 // hi1 = getreg(SHADER_CYCLES_HI)
5082 // lo1 = getreg(SHADER_CYCLES_LO)
5083 // hi2 = getreg(SHADER_CYCLES_HI)
5084 //
5085 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5086 // Otherwise there was overflow and the result is hi2:0. In both cases the
5087 // result should represent the actual time at some point during the sequence
5088 // of three getregs.
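 // E.g. if the low counter wraps between the reads (hi1 = 5, hi2 = 6), the
 // compare below fails and hi2:0 (6:0) is returned, a value that did occur
 // somewhere between the first and last getreg.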
5089 using namespace AMDGPU::Hwreg;
5090 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5091 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5092 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5093 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5094 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5095 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5096 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5097 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5098 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5099 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5100 .addReg(RegHi1)
5101 .addReg(RegHi2);
5102 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5103 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5104 .addReg(RegLo1)
5105 .addImm(0);
5106 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5107 .add(MI.getOperand(0))
5108 .addReg(RegLo)
5109 .addImm(AMDGPU::sub0)
5110 .addReg(RegHi2)
5111 .addImm(AMDGPU::sub1);
5112 MI.eraseFromParent();
5113 return BB;
5114 }
5115 case AMDGPU::SI_INDIRECT_SRC_V1:
5116 case AMDGPU::SI_INDIRECT_SRC_V2:
5117 case AMDGPU::SI_INDIRECT_SRC_V4:
5118 case AMDGPU::SI_INDIRECT_SRC_V8:
5119 case AMDGPU::SI_INDIRECT_SRC_V9:
5120 case AMDGPU::SI_INDIRECT_SRC_V10:
5121 case AMDGPU::SI_INDIRECT_SRC_V11:
5122 case AMDGPU::SI_INDIRECT_SRC_V12:
5123 case AMDGPU::SI_INDIRECT_SRC_V16:
5124 case AMDGPU::SI_INDIRECT_SRC_V32:
5125 return emitIndirectSrc(MI, *BB, *getSubtarget());
5126 case AMDGPU::SI_INDIRECT_DST_V1:
5127 case AMDGPU::SI_INDIRECT_DST_V2:
5128 case AMDGPU::SI_INDIRECT_DST_V4:
5129 case AMDGPU::SI_INDIRECT_DST_V8:
5130 case AMDGPU::SI_INDIRECT_DST_V9:
5131 case AMDGPU::SI_INDIRECT_DST_V10:
5132 case AMDGPU::SI_INDIRECT_DST_V11:
5133 case AMDGPU::SI_INDIRECT_DST_V12:
5134 case AMDGPU::SI_INDIRECT_DST_V16:
5135 case AMDGPU::SI_INDIRECT_DST_V32:
5136 return emitIndirectDst(MI, *BB, *getSubtarget());
5137 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5138 case AMDGPU::SI_KILL_I1_PSEUDO:
5139 return splitKillBlock(MI, BB);
5140 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5142 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5143 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5144
5145 Register Dst = MI.getOperand(0).getReg();
5146 const MachineOperand &Src0 = MI.getOperand(1);
5147 const MachineOperand &Src1 = MI.getOperand(2);
5148 const DebugLoc &DL = MI.getDebugLoc();
5149 Register SrcCond = MI.getOperand(3).getReg();
5150
5151 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5152 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5153 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5154 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5155
5156 const TargetRegisterClass *Src0RC = Src0.isReg()
5157 ? MRI.getRegClass(Src0.getReg())
5158 : &AMDGPU::VReg_64RegClass;
5159 const TargetRegisterClass *Src1RC = Src1.isReg()
5160 ? MRI.getRegClass(Src1.getReg())
5161 : &AMDGPU::VReg_64RegClass;
5162
5163 const TargetRegisterClass *Src0SubRC =
5164 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5165 const TargetRegisterClass *Src1SubRC =
5166 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5167
5168 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5169 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5170 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5171 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5172
5173 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5174 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5175 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5176 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5177
5178 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5179 .addReg(SrcCond);
5180 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5181 .addImm(0)
5182 .add(Src0Sub0)
5183 .addImm(0)
5184 .add(Src1Sub0)
5185 .addReg(SrcCondCopy);
5186 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5187 .addImm(0)
5188 .add(Src0Sub1)
5189 .addImm(0)
5190 .add(Src1Sub1)
5191 .addReg(SrcCondCopy);
5192
5193 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5194 .addReg(DstLo)
5195 .addImm(AMDGPU::sub0)
5196 .addReg(DstHi)
5197 .addImm(AMDGPU::sub1);
5198 MI.eraseFromParent();
5199 return BB;
5200 }
5201 case AMDGPU::SI_BR_UNDEF: {
5203 const DebugLoc &DL = MI.getDebugLoc();
5204 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5205 .add(MI.getOperand(0));
5206 Br->getOperand(1).setIsUndef(); // read undef SCC
5207 MI.eraseFromParent();
5208 return BB;
5209 }
5210 case AMDGPU::ADJCALLSTACKUP:
5211 case AMDGPU::ADJCALLSTACKDOWN: {
5213 MachineInstrBuilder MIB(*MF, &MI);
5214 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5215 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5216 return BB;
5217 }
5218 case AMDGPU::SI_CALL_ISEL: {
5220 const DebugLoc &DL = MI.getDebugLoc();
5221
5222 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5223
5225 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5226
5227 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
5228 MachineOperand &MO = MI.getOperand(I);
5229 if (I != 2) {
5230 MIB.add(MO);
5231 continue;
5232 }
5233 }
5234
5235 MachineOperand &MO = MI.getOperand(2);
5237 // The token operand is always a register, whose definition is IMPLICIT_DEF
5238 // iff there was no token on the call.
5239 if (MachineInstr *Def = MRI.getVRegDef(MO.getReg())) {
5240 if (Def->getOpcode() != TargetOpcode::IMPLICIT_DEF) {
5241 MO.setImplicit();
5242 MIB.add(MO);
5243 }
5244 }
5245
5246 MIB.cloneMemRefs(MI);
5247 MI.eraseFromParent();
5248 return BB;
5249 }
5250 case AMDGPU::V_ADD_CO_U32_e32:
5251 case AMDGPU::V_SUB_CO_U32_e32:
5252 case AMDGPU::V_SUBREV_CO_U32_e32: {
5253 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5254 const DebugLoc &DL = MI.getDebugLoc();
5255 unsigned Opc = MI.getOpcode();
5256
5257 bool NeedClampOperand = false;
5258 if (TII->pseudoToMCOpcode(Opc) == -1) {
5259 Opc = AMDGPU::getVOPe64(Opc);
5260 NeedClampOperand = true;
5261 }
5262
5263 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5264 if (TII->isVOP3(*I)) {
5265 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5266 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5267 I.addReg(TRI->getVCC(), RegState::Define);
5268 }
5269 I.add(MI.getOperand(1))
5270 .add(MI.getOperand(2));
5271 if (NeedClampOperand)
5272 I.addImm(0); // clamp bit for e64 encoding
5273
5274 TII->legalizeOperands(*I);
5275
5276 MI.eraseFromParent();
5277 return BB;
5278 }
5279 case AMDGPU::V_ADDC_U32_e32:
5280 case AMDGPU::V_SUBB_U32_e32:
5281 case AMDGPU::V_SUBBREV_U32_e32:
5282 // These instructions have an implicit use of vcc which counts towards the
5283 // constant bus limit.
5284 TII->legalizeOperands(MI);
5285 return BB;
5286 case AMDGPU::DS_GWS_INIT:
5287 case AMDGPU::DS_GWS_SEMA_BR:
5288 case AMDGPU::DS_GWS_BARRIER:
5289 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5290 [[fallthrough]];
5291 case AMDGPU::DS_GWS_SEMA_V:
5292 case AMDGPU::DS_GWS_SEMA_P:
5293 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5294 // A s_waitcnt 0 is required to be the instruction immediately following.
5295 if (getSubtarget()->hasGWSAutoReplay()) {
5296 bundleInstWithWaitcnt(MI);
5297 return BB;
5298 }
5299
5300 return emitGWSMemViolTestLoop(MI, BB);
5301 case AMDGPU::S_SETREG_B32: {
5302 // Try to optimize cases that only set the denormal mode or rounding mode.
5303 //
5304 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5305 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5306 // instead.
5307 //
5308 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5309 // allow you to have a no side effect instruction in the output of a
5310 // sideeffecting pattern.
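 // For example, an S_SETREG_B32 whose source is a materialized immediate and
 // whose hwreg operand selects exactly MODE[3:0] (the FP rounding field) is
 // rewritten below into a single side-effect-free s_round_mode; a write that
 // exactly covers both the rounding and denormal fields becomes an
 // s_round_mode plus s_denorm_mode pair.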
5311 auto [ID, Offset, Width] =
5312 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5313 if (ID != AMDGPU::Hwreg::ID_MODE)
5314 return BB;
5315
5316 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5317 const unsigned SetMask = WidthMask << Offset;
5318
5319 if (getSubtarget()->hasDenormModeInst()) {
5320 unsigned SetDenormOp = 0;
5321 unsigned SetRoundOp = 0;
5322
5323 // The dedicated instructions can only set the whole denorm or round mode
5324 // at once, not a subset of bits in either.
5325 if (SetMask ==
5326 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5327 // If this fully sets both the round and denorm mode, emit the two
5328 // dedicated instructions for these.
5329 SetRoundOp = AMDGPU::S_ROUND_MODE;
5330 SetDenormOp = AMDGPU::S_DENORM_MODE;
5331 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5332 SetRoundOp = AMDGPU::S_ROUND_MODE;
5333 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5334 SetDenormOp = AMDGPU::S_DENORM_MODE;
5335 }
5336
5337 if (SetRoundOp || SetDenormOp) {
5338 MachineRegisterInfo &MRI = MF->getRegInfo();
5339 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5340 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5341 unsigned ImmVal = Def->getOperand(1).getImm();
5342 if (SetRoundOp) {
5343 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5344 .addImm(ImmVal & 0xf);
5345
5346 // If we also have the denorm mode, get just the denorm mode bits.
5347 ImmVal >>= 4;
5348 }
5349
5350 if (SetDenormOp) {
5351 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5352 .addImm(ImmVal & 0xf);
5353 }
5354
5355 MI.eraseFromParent();
5356 return BB;
5357 }
5358 }
5359 }
5360
5361 // If only FP bits are touched, use the no side effects pseudo.
5362 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5363 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5364 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5365
5366 return BB;
5367 }
5368 case AMDGPU::S_INVERSE_BALLOT_U32:
5369 case AMDGPU::S_INVERSE_BALLOT_U64: {
5370 MachineRegisterInfo &MRI = MF->getRegInfo();
5371 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5372 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5373 const DebugLoc &DL = MI.getDebugLoc();
5374 const Register DstReg = MI.getOperand(0).getReg();
5375 Register MaskReg = MI.getOperand(1).getReg();
5376
5377 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5378
5379 if (IsVALU) {
5380 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5381 }
5382
5383 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5384 MI.eraseFromParent();
5385 return BB;
5386 }
5387 case AMDGPU::ENDPGM_TRAP: {
5388 const DebugLoc &DL = MI.getDebugLoc();
5389 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5390 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5391 MI.addOperand(MachineOperand::CreateImm(0));
5392 return BB;
5393 }
5394
5395 // We need a block split to make the real endpgm a terminator. We also don't
5396 // want to break phis in successor blocks, so we can't just delete to the
5397 // end of the block.
5398
5399 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5400 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5401 MF->push_back(TrapBB);
5402 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5403 .addImm(0);
5404 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5405 .addMBB(TrapBB);
5406
5407 BB->addSuccessor(TrapBB);
5408 MI.eraseFromParent();
5409 return SplitBB;
5410 }
5411 default:
5412 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5413 }
5414}
5415
5416 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5417 // This currently forces unfolding various combinations of fsub into fma with
5418 // free fneg'd operands. As long as we have fast FMA (controlled by
5419 // isFMAFasterThanFMulAndFAdd), we should perform these.
5420
5421 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5422 // most of these combines appear to be cycle neutral but save on instruction
5423 // count / code size.
5424 return true;
5425}
5426
5427 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5428 
5429 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5430 EVT VT) const {
5431 if (!VT.isVector()) {
5432 return MVT::i1;
5433 }
5434 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5435}
5436
5437 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5438 // TODO: Should i16 be used always if legal? For now it would force VALU
5439 // shifts.
5440 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5441}
5442
5443 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5444 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5445 ? Ty.changeElementSize(16)
5446 : Ty.changeElementSize(32);
5447}
5448
5449 // Answering this is somewhat tricky and depends on the specific device, since
5450 // different devices have different rates for fma and for all f64 operations.
5451//
5452// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5453// regardless of which device (although the number of cycles differs between
5454// devices), so it is always profitable for f64.
5455//
5456// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5457// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5458// which we can always do even without fused FP ops since it returns the same
5459// result as the separate operations and since it is always full
5460// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5461// however does not support denormals, so we do report fma as faster if we have
5462// a fast fma device and require denormals.
5463//
5464 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5465 EVT VT) const {
5466 VT = VT.getScalarType();
5467
5468 switch (VT.getSimpleVT().SimpleTy) {
5469 case MVT::f32: {
5470 // If mad is not available this depends only on if f32 fma is full rate.
5471 if (!Subtarget->hasMadMacF32Insts())
5472 return Subtarget->hasFastFMAF32();
5473
5474 // Otherwise f32 mad is always full rate and returns the same result as
5475 // the separate operations so should be preferred over fma.
5476 // However does not support denormals.
5477 if (!denormalModeIsFlushAllF32(MF))
5478 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5479
5480 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5481 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5482 }
5483 case MVT::f64:
5484 return true;
5485 case MVT::f16:
5486 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5487 default:
5488 break;
5489 }
5490
5491 return false;
5492}
5493
5494 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5495 LLT Ty) const {
5496 switch (Ty.getScalarSizeInBits()) {
5497 case 16:
5498 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5499 case 32:
5500 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5501 case 64:
5502 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5503 default:
5504 break;
5505 }
5506
5507 return false;
5508}
5509
5510 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5511 if (!Ty.isScalar())
5512 return false;
5513
5514 if (Ty.getScalarSizeInBits() == 16)
5515 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5516 if (Ty.getScalarSizeInBits() == 32)
5517 return Subtarget->hasMadMacF32Insts() &&
5518 denormalModeIsFlushAllF32(*MI.getMF());
5519
5520 return false;
5521}
5522
5523 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5524 const SDNode *N) const {
5525 // TODO: Check future ftz flag
5526 // v_mad_f32/v_mac_f32 do not support denormals.
5527 EVT VT = N->getValueType(0);
5528 if (VT == MVT::f32)
5529 return Subtarget->hasMadMacF32Insts() &&
5530 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5531 if (VT == MVT::f16) {
5532 return Subtarget->hasMadF16() &&
5533 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5534 }
5535
5536 return false;
5537}
5538
5539//===----------------------------------------------------------------------===//
5540// Custom DAG Lowering Operations
5541//===----------------------------------------------------------------------===//
5542
5543// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5544// wider vector type is legal.
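// For instance, an FNEG of v4f16 reaching this hook is split into two v2f16
// FNEGs whose results are concatenated, rather than being scalarized into
// four f16 operations by the generic legalizer.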
5545 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5546 SelectionDAG &DAG) const {
5547 unsigned Opc = Op.getOpcode();
5548 EVT VT = Op.getValueType();
5549 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5550 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5551 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5552 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5553
5554 SDValue Lo, Hi;
5555 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5556
5557 SDLoc SL(Op);
5558 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5559 Op->getFlags());
5560 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5561 Op->getFlags());
5562
5563 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5564}
5565
5566// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5567// wider vector type is legal.
5568 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5569 SelectionDAG &DAG) const {
5570 unsigned Opc = Op.getOpcode();
5571 EVT VT = Op.getValueType();
5572 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5573 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5574 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5575 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5576
5577 SDValue Lo0, Hi0;
5578 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5579 SDValue Lo1, Hi1;
5580 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5581
5582 SDLoc SL(Op);
5583
5584 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5585 Op->getFlags());
5586 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5587 Op->getFlags());
5588
5589 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5590}
5591
5592 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5593 SelectionDAG &DAG) const {
5594 unsigned Opc = Op.getOpcode();
5595 EVT VT = Op.getValueType();
5596 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5597 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5598 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5599 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5600 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5601 VT == MVT::v32bf16);
5602
5603 SDValue Lo0, Hi0;
5604 SDValue Op0 = Op.getOperand(0);
5605 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5606 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5607 : std::pair(Op0, Op0);
5608 SDValue Lo1, Hi1;
5609 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5610 SDValue Lo2, Hi2;
5611 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5612
5613 SDLoc SL(Op);
5614 auto ResVT = DAG.GetSplitDestVTs(VT);
5615
5616 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5617 Op->getFlags());
5618 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5619 Op->getFlags());
5620
5621 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5622}
5623
5624
5625 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5626 switch (Op.getOpcode()) {
5627 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5628 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5629 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5630 case ISD::LOAD: {
5631 SDValue Result = LowerLOAD(Op, DAG);
5632 assert((!Result.getNode() ||
5633 Result.getNode()->getNumValues() == 2) &&
5634 "Load should return a value and a chain");
5635 return Result;
5636 }
5637 case ISD::FSQRT: {
5638 EVT VT = Op.getValueType();
5639 if (VT == MVT::f32)
5640 return lowerFSQRTF32(Op, DAG);
5641 if (VT == MVT::f64)
5642 return lowerFSQRTF64(Op, DAG);
5643 return SDValue();
5644 }
5645 case ISD::FSIN:
5646 case ISD::FCOS:
5647 return LowerTrig(Op, DAG);
5648 case ISD::SELECT: return LowerSELECT(Op, DAG);
5649 case ISD::FDIV: return LowerFDIV(Op, DAG);
5650 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5651 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5652 case ISD::STORE: return LowerSTORE(Op, DAG);
5653 case ISD::GlobalAddress: {
5654 MachineFunction &MF = DAG.getMachineFunction();
5655 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5656 return LowerGlobalAddress(MFI, Op, DAG);
5657 }
5658 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5659 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5660 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5661 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5662 case ISD::INSERT_SUBVECTOR:
5663 return lowerINSERT_SUBVECTOR(Op, DAG);
5664 case ISD::INSERT_VECTOR_ELT:
5665 return lowerINSERT_VECTOR_ELT(Op, DAG);
5666 case ISD::EXTRACT_VECTOR_ELT:
5667 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5668 case ISD::VECTOR_SHUFFLE:
5669 return lowerVECTOR_SHUFFLE(Op, DAG);
5670 case ISD::SCALAR_TO_VECTOR:
5671 return lowerSCALAR_TO_VECTOR(Op, DAG);
5672 case ISD::BUILD_VECTOR:
5673 return lowerBUILD_VECTOR(Op, DAG);
5674 case ISD::FP_ROUND:
5675 case ISD::STRICT_FP_ROUND:
5676 return lowerFP_ROUND(Op, DAG);
5677 case ISD::FPTRUNC_ROUND: {
5678 unsigned Opc;
5679 SDLoc DL(Op);
5680
5681 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5682 return SDValue();
5683
5684 // Get the rounding mode from the last operand
5685 int RoundMode = Op.getConstantOperandVal(1);
5686 if (RoundMode == (int)RoundingMode::TowardPositive)
5687 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5688 else if (RoundMode == (int)RoundingMode::TowardNegative)
5689 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5690 else
5691 return SDValue();
5692
5693 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5694 }
5695 case ISD::TRAP:
5696 return lowerTRAP(Op, DAG);
5697 case ISD::DEBUGTRAP:
5698 return lowerDEBUGTRAP(Op, DAG);
5699 case ISD::FABS:
5700 case ISD::FNEG:
5701 case ISD::FCANONICALIZE:
5702 case ISD::BSWAP:
5703 return splitUnaryVectorOp(Op, DAG);
5704 case ISD::FMINNUM:
5705 case ISD::FMAXNUM:
5706 return lowerFMINNUM_FMAXNUM(Op, DAG);
5707 case ISD::FLDEXP:
5708 case ISD::STRICT_FLDEXP:
5709 return lowerFLDEXP(Op, DAG);
5710 case ISD::FMA:
5711 return splitTernaryVectorOp(Op, DAG);
5712 case ISD::FP_TO_SINT:
5713 case ISD::FP_TO_UINT:
5714 return LowerFP_TO_INT(Op, DAG);
5715 case ISD::SHL:
5716 case ISD::SRA:
5717 case ISD::SRL:
5718 case ISD::ADD:
5719 case ISD::SUB:
5720 case ISD::SMIN:
5721 case ISD::SMAX:
5722 case ISD::UMIN:
5723 case ISD::UMAX:
5724 case ISD::FADD:
5725 case ISD::FMUL:
5726 case ISD::FMINNUM_IEEE:
5727 case ISD::FMAXNUM_IEEE:
5728 case ISD::UADDSAT:
5729 case ISD::USUBSAT:
5730 case ISD::SADDSAT:
5731 case ISD::SSUBSAT:
5732 return splitBinaryVectorOp(Op, DAG);
5733 case ISD::MUL:
5734 return lowerMUL(Op, DAG);
5735 case ISD::SMULO:
5736 case ISD::UMULO:
5737 return lowerXMULO(Op, DAG);
5738 case ISD::SMUL_LOHI:
5739 case ISD::UMUL_LOHI:
5740 return lowerXMUL_LOHI(Op, DAG);
5741 case ISD::DYNAMIC_STACKALLOC:
5742 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5743 case ISD::STACKSAVE:
5744 return LowerSTACKSAVE(Op, DAG);
5745 case ISD::GET_ROUNDING:
5746 return lowerGET_ROUNDING(Op, DAG);
5747 case ISD::PREFETCH:
5748 return lowerPREFETCH(Op, DAG);
5749 case ISD::FP_EXTEND:
5750 case ISD::STRICT_FP_EXTEND:
5751 return lowerFP_EXTEND(Op, DAG);
5752 case ISD::GET_FPENV:
5753 return lowerGET_FPENV(Op, DAG);
5754 case ISD::SET_FPENV:
5755 return lowerSET_FPENV(Op, DAG);
5756 }
5757 return SDValue();
5758}
5759
5760// Used for D16: Casts the result of an instruction into the right vector,
5761// packs values if loads return unpacked values.
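// For example, on a subtarget with unpacked D16 memory instructions, a d16
// load requested as v3f16 comes back as v3i32; each element is truncated to
// i16, an undef lane is appended to reach the legal v4i16, and the result is
// bitcast to v4f16 (the widened form of the requested type).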
5762 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5763 const SDLoc &DL,
5764 SelectionDAG &DAG, bool Unpacked) {
5765 if (!LoadVT.isVector())
5766 return Result;
5767
5768 // Cast back to the original packed type or to a larger type that is a
5769 // multiple of 32 bit for D16. Widening the return type is required for
5770 // legalization.
5771 EVT FittingLoadVT = LoadVT;
5772 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5773 FittingLoadVT =
5774 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5775 LoadVT.getVectorNumElements() + 1);
5776 }
5777
5778 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5779 // Truncate to v2i16/v4i16.
5780 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5781
5782 // Workaround legalizer not scalarizing truncate after vector op
5783 // legalization but not creating intermediate vector trunc.
5784 SmallVector<SDValue, 4> Elts;
5785 DAG.ExtractVectorElements(Result, Elts);
5786 for (SDValue &Elt : Elts)
5787 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5788
5789 // Pad illegal v1i16/v3f16 to v4i16
5790 if ((LoadVT.getVectorNumElements() % 2) == 1)
5791 Elts.push_back(DAG.getUNDEF(MVT::i16));
5792
5793 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5794
5795 // Bitcast to original type (v2f16/v4f16).
5796 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5797 }
5798
5799 // Cast back to the original packed type.
5800 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5801}
5802
5803SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5804 MemSDNode *M,
5805 SelectionDAG &DAG,
5806 ArrayRef<SDValue> Ops,
5807 bool IsIntrinsic) const {
5808 SDLoc DL(M);
5809
5810 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5811 EVT LoadVT = M->getValueType(0);
5812
5813 EVT EquivLoadVT = LoadVT;
5814 if (LoadVT.isVector()) {
5815 if (Unpacked) {
5816 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5817 LoadVT.getVectorNumElements());
5818 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5819 // Widen v3f16 to legal type
5820 EquivLoadVT =
5821 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5822 LoadVT.getVectorNumElements() + 1);
5823 }
5824 }
5825
5826 // Change from v4f16/v2f16 to EquivLoadVT.
5827 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5828
5829 SDValue Load
5830 = DAG.getMemIntrinsicNode(
5831 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5832 VTList, Ops, M->getMemoryVT(),
5833 M->getMemOperand());
5834
5835 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5836
5837 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5838}
5839
5840SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5841 SelectionDAG &DAG,
5842 ArrayRef<SDValue> Ops) const {
5843 SDLoc DL(M);
5844 EVT LoadVT = M->getValueType(0);
5845 EVT EltType = LoadVT.getScalarType();
5846 EVT IntVT = LoadVT.changeTypeToInteger();
5847
5848 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5849
5850 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5851 bool IsTFE = M->getNumValues() == 3;
5852
5853 unsigned Opc;
5854 if (IsFormat) {
5855 Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5856 : AMDGPUISD::BUFFER_LOAD_FORMAT;
5857 } else {
5858 // TODO: Support non-format TFE loads.
5859 if (IsTFE)
5860 return SDValue();
5861 Opc = AMDGPUISD::BUFFER_LOAD;
5862 }
5863
5864 if (IsD16) {
5865 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5866 }
5867
5868 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5869 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5870 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
5871
5872 if (isTypeLegal(LoadVT)) {
5873 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5874 M->getMemOperand(), DAG);
5875 }
5876
5877 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5878 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5879 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5880 M->getMemOperand(), DAG);
5881 return DAG.getMergeValues(
5882 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5883 DL);
5884}
5885
5886 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
5887 SDNode *N, SelectionDAG &DAG) {
5888 EVT VT = N->getValueType(0);
5889 unsigned CondCode = N->getConstantOperandVal(3);
5890 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5891 return DAG.getUNDEF(VT);
5892
5893 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5894
5895 SDValue LHS = N->getOperand(1);
5896 SDValue RHS = N->getOperand(2);
5897
5898 SDLoc DL(N);
5899
5900 EVT CmpVT = LHS.getValueType();
5901 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
5902 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
5903 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5904 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
5905 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
5906 }
5907
5908 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
5909
5910 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5911 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5912
5913 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
5914 DAG.getCondCode(CCOpcode));
5915 if (VT.bitsEq(CCVT))
5916 return SetCC;
5917 return DAG.getZExtOrTrunc(SetCC, DL, VT);
5918}
5919
5920 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
5921 SDNode *N, SelectionDAG &DAG) {
5922 EVT VT = N->getValueType(0);
5923
5924 unsigned CondCode = N->getConstantOperandVal(3);
5925 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
5926 return DAG.getUNDEF(VT);
5927
5928 SDValue Src0 = N->getOperand(1);
5929 SDValue Src1 = N->getOperand(2);
5930 EVT CmpVT = Src0.getValueType();
5931 SDLoc SL(N);
5932
5933 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
5934 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5935 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5936 }
5937
5938 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
5939 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
5940 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5941 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5942 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
5943 Src1, DAG.getCondCode(CCOpcode));
5944 if (VT.bitsEq(CCVT))
5945 return SetCC;
5946 return DAG.getZExtOrTrunc(SetCC, SL, VT);
5947}
5948
5949 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
5950 SelectionDAG &DAG) {
5951 EVT VT = N->getValueType(0);
5952 SDValue Src = N->getOperand(1);
5953 SDLoc SL(N);
5954
5955 if (Src.getOpcode() == ISD::SETCC) {
5956 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
5957 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
5958 Src.getOperand(1), Src.getOperand(2));
5959 }
5960 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
5961 // (ballot 0) -> 0
5962 if (Arg->isZero())
5963 return DAG.getConstant(0, SL, VT);
5964
5965 // (ballot 1) -> EXEC/EXEC_LO
5966 if (Arg->isOne()) {
5967 Register Exec;
5968 if (VT.getScalarSizeInBits() == 32)
5969 Exec = AMDGPU::EXEC_LO;
5970 else if (VT.getScalarSizeInBits() == 64)
5971 Exec = AMDGPU::EXEC;
5972 else
5973 return SDValue();
5974
5975 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
5976 }
5977 }
5978
5979 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
5980 // ISD::SETNE)
5981 return DAG.getNode(
5982 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5983 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5984}
5985
5986 void SITargetLowering::ReplaceNodeResults(SDNode *N,
5987 SmallVectorImpl<SDValue> &Results,
5988 SelectionDAG &DAG) const {
5989 switch (N->getOpcode()) {
5990 case ISD::INSERT_VECTOR_ELT: {
5991 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
5992 Results.push_back(Res);
5993 return;
5994 }
5995 case ISD::EXTRACT_VECTOR_ELT: {
5996 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
5997 Results.push_back(Res);
5998 return;
5999 }
6000 case ISD::INTRINSIC_WO_CHAIN: {
6001 unsigned IID = N->getConstantOperandVal(0);
6002 switch (IID) {
6003 case Intrinsic::amdgcn_make_buffer_rsrc:
6004 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6005 return;
6006 case Intrinsic::amdgcn_cvt_pkrtz: {
6007 SDValue Src0 = N->getOperand(1);
6008 SDValue Src1 = N->getOperand(2);
6009 SDLoc SL(N);
6010 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6011 Src0, Src1);
6012 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6013 return;
6014 }
6015 case Intrinsic::amdgcn_cvt_pknorm_i16:
6016 case Intrinsic::amdgcn_cvt_pknorm_u16:
6017 case Intrinsic::amdgcn_cvt_pk_i16:
6018 case Intrinsic::amdgcn_cvt_pk_u16: {
6019 SDValue Src0 = N->getOperand(1);
6020 SDValue Src1 = N->getOperand(2);
6021 SDLoc SL(N);
6022 unsigned Opcode;
6023
6024 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6025 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6026 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6027 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6028 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6029 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6030 else
6031 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6032 
6033 EVT VT = N->getValueType(0);
6034 if (isTypeLegal(VT))
6035 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6036 else {
6037 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6038 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6039 }
6040 return;
6041 }
6042 case Intrinsic::amdgcn_s_buffer_load: {
6043 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6044 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6045 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6046 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6047 // s_buffer_load_i8.
6048 if (!Subtarget->hasScalarSubwordLoads())
6049 return;
6050 SDValue Op = SDValue(N, 0);
6051 SDValue Rsrc = Op.getOperand(1);
6052 SDValue Offset = Op.getOperand(2);
6053 SDValue CachePolicy = Op.getOperand(3);
6054 EVT VT = Op.getValueType();
6055 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6056 SDLoc DL(Op);
6057 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6058 const DataLayout &DataLayout = DAG.getDataLayout();
6059 Align Alignment =
6060 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6061 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
6062 PtrInfo,
6063 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6064 MachineMemOperand::MOInvariant,
6065 VT.getStoreSize(), Alignment);
6066 SDValue LoadVal;
6067 if (!Offset->isDivergent()) {
6068 SDValue Ops[] = {Rsrc, // source register
6069 Offset, CachePolicy};
6070 SDValue BufferLoad =
6071 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6072 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6073 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6074 } else {
6075 SDValue Ops[] = {
6076 DAG.getEntryNode(), // Chain
6077 Rsrc, // rsrc
6078 DAG.getConstant(0, DL, MVT::i32), // vindex
6079 {}, // voffset
6080 {}, // soffset
6081 {}, // offset
6082 CachePolicy, // cachepolicy
6083 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6084 };
6085 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6086 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6087 }
6088 Results.push_back(LoadVal);
6089 return;
6090 }
6091 }
6092 break;
6093 }
6094 case ISD::INTRINSIC_W_CHAIN: {
6095 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6096 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6097 // FIXME: Hacky
6098 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6099 Results.push_back(Res.getOperand(I));
6100 }
6101 } else {
6102 Results.push_back(Res);
6103 Results.push_back(Res.getValue(1));
6104 }
6105 return;
6106 }
6107
6108 break;
6109 }
6110 case ISD::SELECT: {
6111 SDLoc SL(N);
6112 EVT VT = N->getValueType(0);
6113 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6114 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6115 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6116
6117 EVT SelectVT = NewVT;
6118 if (NewVT.bitsLT(MVT::i32)) {
6119 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6120 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6121 SelectVT = MVT::i32;
6122 }
6123
6124 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6125 N->getOperand(0), LHS, RHS);
6126
6127 if (NewVT != SelectVT)
6128 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6129 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6130 return;
6131 }
6132 case ISD::FNEG: {
6133 if (N->getValueType(0) != MVT::v2f16)
6134 break;
6135
6136 SDLoc SL(N);
6137 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6138
6139 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6140 BC,
6141 DAG.getConstant(0x80008000, SL, MVT::i32));
6142 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6143 return;
6144 }
6145 case ISD::FABS: {
6146 if (N->getValueType(0) != MVT::v2f16)
6147 break;
6148
6149 SDLoc SL(N);
6150 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6151
6152 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6153 BC,
6154 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6155 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6156 return;
6157 }
6158 case ISD::FSQRT: {
6159 if (N->getValueType(0) != MVT::f16)
6160 break;
6161 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6162 break;
6163 }
6164 default:
6165 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6166 break;
6167 }
6168}
6169
6170/// Helper function for LowerBRCOND
6171static SDNode *findUser(SDValue Value, unsigned Opcode) {
6172
6173 SDNode *Parent = Value.getNode();
6174 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6175 I != E; ++I) {
6176
6177 if (I.getUse().get() != Value)
6178 continue;
6179
6180 if (I->getOpcode() == Opcode)
6181 return *I;
6182 }
6183 return nullptr;
6184}
6185
6186unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6187 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6188 switch (Intr->getConstantOperandVal(1)) {
6189 case Intrinsic::amdgcn_if:
6190 return AMDGPUISD::IF;
6191 case Intrinsic::amdgcn_else:
6192 return AMDGPUISD::ELSE;
6193 case Intrinsic::amdgcn_loop:
6194 return AMDGPUISD::LOOP;
6195 case Intrinsic::amdgcn_end_cf:
6196 llvm_unreachable("should not occur");
6197 default:
6198 return 0;
6199 }
6200 }
6201
6202 // break, if_break, else_break are all only used as inputs to loop, not
6203 // directly as branch conditions.
6204 return 0;
6205}
6206
6207 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6208 const Triple &TT = getTargetMachine().getTargetTriple();
6209 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6210 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6211 AMDGPU::shouldEmitConstantsToTextSection(TT);
6212 }
6213
6214 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6215 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6216 return false;
6217
6218 // FIXME: Either avoid relying on address space here or change the default
6219 // address space for functions to avoid the explicit check.
6220 return (GV->getValueType()->isFunctionTy() ||
6223}
6224
6225 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6226 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6227}
6228
6229 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6230 if (!GV->hasExternalLinkage())
6231 return true;
6232
6233 const auto OS = getTargetMachine().getTargetTriple().getOS();
6234 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6235}
6236
6237 /// This transforms the control flow intrinsics to get the branch destination as
6238 /// the last parameter, and also switches the branch target with BR if needed.
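/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is rebuilt
/// here as an AMDGPUISD::IF node carrying the branch destination as its final
/// operand; when the condition is negated through a SETCC the destination is
/// the BRCOND's own target, otherwise it is taken from the following BR,
/// which is then retargeted to the BRCOND's target.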
6239SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6240 SelectionDAG &DAG) const {
6241 SDLoc DL(BRCOND);
6242
6243 SDNode *Intr = BRCOND.getOperand(1).getNode();
6244 SDValue Target = BRCOND.getOperand(2);
6245 SDNode *BR = nullptr;
6246 SDNode *SetCC = nullptr;
6247
6248 if (Intr->getOpcode() == ISD::SETCC) {
6249 // As long as we negate the condition everything is fine
6250 SetCC = Intr;
6251 Intr = SetCC->getOperand(0).getNode();
6252
6253 } else {
6254 // Get the target from BR if we don't negate the condition
6255 BR = findUser(BRCOND, ISD::BR);
6256 assert(BR && "brcond missing unconditional branch user");
6257 Target = BR->getOperand(1);
6258 }
6259
6260 unsigned CFNode = isCFIntrinsic(Intr);
6261 if (CFNode == 0) {
6262 // This is a uniform branch so we don't need to legalize.
6263 return BRCOND;
6264 }
6265
6266 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6267 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6268
6269 assert(!SetCC ||
6270 (SetCC->getConstantOperandVal(1) == 1 &&
6271 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6272 ISD::SETNE));
6273
6274 // operands of the new intrinsic call
6275 SmallVector<SDValue, 8> Ops;
6276 if (HaveChain)
6277 Ops.push_back(BRCOND.getOperand(0));
6278
6279 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6280 Ops.push_back(Target);
6281
6282 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6283
6284 // build the new intrinsic call
6285 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6286
6287 if (!HaveChain) {
6288 SDValue Ops[] = {
6289 SDValue(Result, 0),
6290 BRCOND.getOperand(0)
6291 };
6292
6293 Result = DAG.getMergeValues(Ops, DL).getNode();
6294 }
6295
6296 if (BR) {
6297 // Give the branch instruction our target
6298 SDValue Ops[] = {
6299 BR->getOperand(0),
6300 BRCOND.getOperand(2)
6301 };
6302 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6303 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6304 }
6305
6306 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6307
6308 // Copy the intrinsic results to registers
6309 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6310 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6311 if (!CopyToReg)
6312 continue;
6313
6314 Chain = DAG.getCopyToReg(
6315 Chain, DL,
6316 CopyToReg->getOperand(1),
6317 SDValue(Result, i - 1),
6318 SDValue());
6319
6320 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6321 }
6322
6323 // Remove the old intrinsic from the chain
6324 DAG.ReplaceAllUsesOfValueWith(
6325 SDValue(Intr, Intr->getNumValues() - 1),
6326 Intr->getOperand(0));
6327
6328 return Chain;
6329}
6330
6331SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6332 SelectionDAG &DAG) const {
6333 MVT VT = Op.getSimpleValueType();
6334 SDLoc DL(Op);
6335 // Checking the depth
6336 if (Op.getConstantOperandVal(0) != 0)
6337 return DAG.getConstant(0, DL, VT);
6338
6339 MachineFunction &MF = DAG.getMachineFunction();
6340 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6341 // Check for kernel and shader functions
6342 if (Info->isEntryFunction())
6343 return DAG.getConstant(0, DL, VT);
6344
6345 MachineFrameInfo &MFI = MF.getFrameInfo();
6346 // There is a call to @llvm.returnaddress in this function
6347 MFI.setReturnAddressIsTaken(true);
6348
6349 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6350 // Get the return address reg and mark it as an implicit live-in
6351 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6352
6353 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6354}
6355
6356SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6357 SDValue Op,
6358 const SDLoc &DL,
6359 EVT VT) const {
6360 return Op.getValueType().bitsLE(VT) ?
6361 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6362 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6363 DAG.getTargetConstant(0, DL, MVT::i32));
6364}
6365
6366SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6367 assert(Op.getValueType() == MVT::f16 &&
6368 "Do not know how to custom lower FP_ROUND for non-f16 type");
6369
6370 SDValue Src = Op.getOperand(0);
6371 EVT SrcVT = Src.getValueType();
6372 if (SrcVT != MVT::f64)
6373 return Op;
6374
6375 // TODO: Handle strictfp
6376 if (Op.getOpcode() != ISD::FP_ROUND)
6377 return Op;
6378
6379 SDLoc DL(Op);
6380
6381 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6382 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6383 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6384}
6385
6386SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6387 SelectionDAG &DAG) const {
6388 EVT VT = Op.getValueType();
6389 const MachineFunction &MF = DAG.getMachineFunction();
6390 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6391 bool IsIEEEMode = Info->getMode().IEEE;
6392
6393 // FIXME: Assert during selection that this is only selected for
6394 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6395 // mode functions, but this happens to be OK since it's only done in cases
6396 // where there is known no sNaN.
6397 if (IsIEEEMode)
6398 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6399
6400 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6401 VT == MVT::v16bf16)
6402 return splitBinaryVectorOp(Op, DAG);
6403 return Op;
6404}
6405
6406SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6407 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6408 EVT VT = Op.getValueType();
6409 assert(VT == MVT::f16);
6410
6411 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6412 EVT ExpVT = Exp.getValueType();
6413 if (ExpVT == MVT::i16)
6414 return Op;
6415
6416 SDLoc DL(Op);
6417
6418 // Correct the exponent type for f16 to i16.
6419 // Clamp the range of the exponent to the instruction's range.
6420
6421 // TODO: This should be a generic narrowing legalization, and can easily be
6422 // for GlobalISel.
6423
6424 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6425 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6426
6427 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6428 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6429
6430 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6431
6432 if (IsStrict) {
6433 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6434 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6435 }
6436
6437 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6438}
6439
6440// Custom lowering for vector multiplications and s_mul_u64.
6441SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6442 EVT VT = Op.getValueType();
6443
6444 // Split vector operands.
6445 if (VT.isVector())
6446 return splitBinaryVectorOp(Op, DAG);
6447
6448 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6449
6450 // There are four ways to lower s_mul_u64:
6451 //
6452 // 1. If all the operands are uniform, then we lower it as it is.
6453 //
6454 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6455 // multiplications because there is not a vector equivalent of s_mul_u64.
6456 //
6457 // 3. If the cost model decides that it is more efficient to use vector
6458 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6459 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6460 //
6461 // 4. If the cost model decides to use vector registers and both of the
6462 // operands are zero-extended/sign-extended from 32-bits, then we split the
6463 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6464 // possible to check if the operands are zero-extended or sign-extended in
6465 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6466 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6467 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6468 // If the cost model decides that we have to use vector registers, then
6469 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6470 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6471 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6472 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6473 // SIInstrInfo.cpp .
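 // Concretely, for case 4 below: if computeKnownBits proves the top 32 bits of
 // both operands are zero, the multiply is rewritten to S_MUL_U64_U32_PSEUDO;
 // if ComputeNumSignBits shows at least 33 sign bits on both sides,
 // S_MUL_I64_I32_PSEUDO is used instead. Otherwise the uniform s_mul_u64 is
 // left as-is.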
6474
6475 if (Op->isDivergent())
6476 return SDValue();
6477
6478 SDValue Op0 = Op.getOperand(0);
6479 SDValue Op1 = Op.getOperand(1);
6480 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6481 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6482 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6483 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6484 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6485 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6486 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6487 SDLoc SL(Op);
6488 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6489 return SDValue(
6490 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6491 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6492 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6493 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6494 return SDValue(
6495 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6496 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6497 return Op;
6498}
6499
6500SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6501 EVT VT = Op.getValueType();
6502 SDLoc SL(Op);
6503 SDValue LHS = Op.getOperand(0);
6504 SDValue RHS = Op.getOperand(1);
6505 bool isSigned = Op.getOpcode() == ISD::SMULO;
6506
6507 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6508 const APInt &C = RHSC->getAPIntValue();
6509 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
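 // e.g. umulo(x, 8): Result = x << 3 and Overflow = ((x << 3) >> 3) != x,
 // using a logical shift right (an arithmetic shift for the signed case).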
6510 if (C.isPowerOf2()) {
6511 // smulo(x, signed_min) is same as umulo(x, signed_min).
6512 bool UseArithShift = isSigned && !C.isMinSignedValue();
6513 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6514 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6515 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6516 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6517 SL, VT, Result, ShiftAmt),
6518 LHS, ISD::SETNE);
6519 return DAG.getMergeValues({ Result, Overflow }, SL);
6520 }
6521 }
6522
6523 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6524 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6525 SL, VT, LHS, RHS);
6526
6527 SDValue Sign = isSigned
6528 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6529 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6530 : DAG.getConstant(0, SL, VT);
6531 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6532
6533 return DAG.getMergeValues({ Result, Overflow }, SL);
6534}
6535
6536SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6537 if (Op->isDivergent()) {
6538 // Select to V_MAD_[IU]64_[IU]32.
6539 return Op;
6540 }
6541 if (Subtarget->hasSMulHi()) {
6542 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6543 return SDValue();
6544 }
6545 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6546 // calculate the high part, so we might as well do the whole thing with
6547 // V_MAD_[IU]64_[IU]32.
6548 return Op;
6549}
6550
6551SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6552 if (!Subtarget->isTrapHandlerEnabled() ||
6553 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6554 return lowerTrapEndpgm(Op, DAG);
6555
6556 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6557 lowerTrapHsaQueuePtr(Op, DAG);
6558}
6559
6560SDValue SITargetLowering::lowerTrapEndpgm(
6561 SDValue Op, SelectionDAG &DAG) const {
6562 SDLoc SL(Op);
6563 SDValue Chain = Op.getOperand(0);
6564 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6565}
6566
6567SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6568 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6569 MachineFunction &MF = DAG.getMachineFunction();
6570 uint64_t Offset = getImplicitParameterOffset(MF, Param);
6571 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6572 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6573 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6574 MachineMemOperand::MODereferenceable |
6575 MachineMemOperand::MOInvariant);
6576 }
6577
6578SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6579 SDValue Op, SelectionDAG &DAG) const {
6580 SDLoc SL(Op);
6581 SDValue Chain = Op.getOperand(0);
6582
6583 SDValue QueuePtr;
6584 // For code object version 5, QueuePtr is passed through implicit kernarg.
6585 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6586 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6587 QueuePtr =
6588 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6589 } else {
6590 MachineFunction &MF = DAG.getMachineFunction();
6591 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6592 Register UserSGPR = Info->getQueuePtrUserSGPR();
6593
6594 if (UserSGPR == AMDGPU::NoRegister) {
6595 // We probably are in a function incorrectly marked with
6596 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6597 // trap, so just use a null pointer.
6598 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6599 } else {
6600 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6601 MVT::i64);
6602 }
6603 }
6604
6605 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6606 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6607 QueuePtr, SDValue());
6608
6609 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6610 SDValue Ops[] = {
6611 ToReg,
6612 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6613 SGPR01,
6614 ToReg.getValue(1)
6615 };
6616 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6617}
6618
6619SDValue SITargetLowering::lowerTrapHsa(
6620 SDValue Op, SelectionDAG &DAG) const {
6621 SDLoc SL(Op);
6622 SDValue Chain = Op.getOperand(0);
6623
6624 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6625 SDValue Ops[] = {
6626 Chain,
6627 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6628 };
6629 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6630}
6631
6632SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6633 SDLoc SL(Op);
6634 SDValue Chain = Op.getOperand(0);
6635 MachineFunction &MF = DAG.getMachineFunction();
6636 
6637 if (!Subtarget->isTrapHandlerEnabled() ||
6640 "debugtrap handler not supported",
6641 Op.getDebugLoc(),
6642 DS_Warning);
6643 LLVMContext &Ctx = MF.getFunction().getContext();
6644 Ctx.diagnose(NoTrap);
6645 return Chain;
6646 }
6647
6648 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMDebugTrap);
6649 SDValue Ops[] = {
6650 Chain,
6651 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6652 };
6653 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6654}
6655
6656SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6657 SelectionDAG &DAG) const {
6658 if (Subtarget->hasApertureRegs()) {
6659 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6660 ? AMDGPU::SRC_SHARED_BASE
6661 : AMDGPU::SRC_PRIVATE_BASE;
6662 // Note: this feature (register) is broken. When used as a 32-bit operand,
6663 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6664 // bits.
6665 //
6666 // To work around the issue, directly emit a 64 bit mov from this register
6667 // then extract the high bits. Note that this shouldn't even result in a
6668 // shift being emitted and simply become a pair of registers (e.g.):
6669 // s_mov_b64 s[6:7], src_shared_base
6670 // v_mov_b32_e32 v1, s7
6671 //
6672 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6673 // coalescing would kick in and it would think it's okay to use the "HI"
6674 // subregister directly (instead of extracting the HI 32 bits) which is an
6675 // artificial (unusable) register.
6676 // Register TableGen definitions would need an overhaul to get rid of the
6677 // artificial "HI" aperture registers and prevent this kind of issue from
6678 // happening.
6679 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6680 DAG.getRegister(ApertureRegNo, MVT::i64));
6681 return DAG.getNode(
6682 ISD::TRUNCATE, DL, MVT::i32,
6683 DAG.getNode(ISD::SRL, DL, MVT::i64,
6684 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6685 }
6686
6687 // For code object version 5, private_base and shared_base are passed through
6688 // implicit kernargs.
6689 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6690 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6691 ImplicitParameter Param =
6692 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
6693 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6694 }
6695
6696 MachineFunction &MF = DAG.getMachineFunction();
6697 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6698 Register UserSGPR = Info->getQueuePtrUserSGPR();
6699 if (UserSGPR == AMDGPU::NoRegister) {
6700 // We probably are in a function incorrectly marked with
6701 // amdgpu-no-queue-ptr. This is undefined.
6702 return DAG.getUNDEF(MVT::i32);
6703 }
6704
6705 SDValue QueuePtr = CreateLiveInRegister(
6706 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6707
6708 // Offset into amd_queue_t for group_segment_aperture_base_hi /
6709 // private_segment_aperture_base_hi.
6710 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6711
6712 SDValue Ptr =
6713 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
6714
6715 // TODO: Use custom target PseudoSourceValue.
6716 // TODO: We should use the value from the IR intrinsic call, but it might not
6717 // be available and how do we get it?
6718 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6719 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6720 commonAlignment(Align(64), StructOffset),
6721 MachineMemOperand::MODereferenceable |
6722 MachineMemOperand::MOInvariant);
6723 }
6724
6725/// Return true if the value is a known valid address, such that a null check is
6726/// not necessary.
6727 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
6728 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6729 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6730 isa<BasicBlockSDNode>(Val))
6731 return true;
6732
6733 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6734 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6735
6736 // TODO: Search through arithmetic, handle arguments and loads
6737 // marked nonnull.
6738 return false;
6739}
6740
6741SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6742 SelectionDAG &DAG) const {
6743 SDLoc SL(Op);
6744
6745 const AMDGPUTargetMachine &TM =
6746 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6747
6748 unsigned DestAS, SrcAS;
6749 SDValue Src;
6750 bool IsNonNull = false;
6751 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
6752 SrcAS = ASC->getSrcAddressSpace();
6753 Src = ASC->getOperand(0);
6754 DestAS = ASC->getDestAddressSpace();
6755 } else {
6756 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6757 Op.getConstantOperandVal(0) ==
6758 Intrinsic::amdgcn_addrspacecast_nonnull);
6759 Src = Op->getOperand(1);
6760 SrcAS = Op->getConstantOperandVal(2);
6761 DestAS = Op->getConstantOperandVal(3);
6762 IsNonNull = true;
6763 }
6764
6765 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6766
6767 // flat -> local/private
6768 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6769 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6770 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6771 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6772
6773 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6774 return Ptr;
6775
6776 unsigned NullVal = TM.getNullPointerValue(DestAS);
6777 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6778 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6779
6780 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6781 SegmentNullPtr);
6782 }
6783 }
6784
6785 // local/private -> flat
6786 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
6787 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6788 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6789
6790 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
6791 SDValue CvtPtr =
6792 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6793 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6794
6795 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6796 return CvtPtr;
6797
6798 unsigned NullVal = TM.getNullPointerValue(SrcAS);
6799 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6800
6801 SDValue NonNull
6802 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6803
6804 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6805 FlatNullPtr);
6806 }
6807 }
6808
6809 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6810 Op.getValueType() == MVT::i64) {
6811 const SIMachineFunctionInfo *Info =
6812 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
6813 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6814 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6815 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6816 }
6817
6818 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6819 Src.getValueType() == MVT::i64)
6820 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6821
6822 // global <-> flat are no-ops and never emitted.
6823
6824 const MachineFunction &MF = DAG.getMachineFunction();
6825 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6826 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6827 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
6828
6829 return DAG.getUNDEF(Op->getValueType(0));
6830}
6831
6832// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
6833// the small vector and inserting them into the big vector. That is better than
6834// the default expansion of doing it via a stack slot. Even though the use of
6835// the stack slot would be optimized away afterwards, the stack slot itself
6836// remains.
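// For example, inserting a v2i16 subvector into v8i16 at index 4 becomes a
// single 32-bit INSERT_VECTOR_ELT into the vector bitcast to v4i32 (the
// 16-bit fast path below); in the general case each element is extracted and
// inserted individually.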
6837SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6838 SelectionDAG &DAG) const {
6839 SDValue Vec = Op.getOperand(0);
6840 SDValue Ins = Op.getOperand(1);
6841 SDValue Idx = Op.getOperand(2);
6842 EVT VecVT = Vec.getValueType();
6843 EVT InsVT = Ins.getValueType();
6844 EVT EltVT = VecVT.getVectorElementType();
6845 unsigned InsNumElts = InsVT.getVectorNumElements();
6846 unsigned IdxVal = Idx->getAsZExtVal();
6847 SDLoc SL(Op);
6848
6849 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6850 // Insert 32-bit registers at a time.
6851 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6852
6853 unsigned VecNumElts = VecVT.getVectorNumElements();
6854 EVT NewVecVT =
6855 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6856 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6857 : EVT::getVectorVT(*DAG.getContext(),
6858 MVT::i32, InsNumElts / 2);
6859
6860 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6861 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6862
6863 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6864 SDValue Elt;
6865 if (InsNumElts == 2) {
6866 Elt = Ins;
6867 } else {
6868 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6869 DAG.getConstant(I, SL, MVT::i32));
6870 }
6871 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6872 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6873 }
6874
6875 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6876 }
6877
6878 for (unsigned I = 0; I != InsNumElts; ++I) {
6879 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6880 DAG.getConstant(I, SL, MVT::i32));
6881 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6882 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6883 }
6884 return Vec;
6885}
6886
6887SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6888 SelectionDAG &DAG) const {
6889 SDValue Vec = Op.getOperand(0);
6890 SDValue InsVal = Op.getOperand(1);
6891 SDValue Idx = Op.getOperand(2);
6892 EVT VecVT = Vec.getValueType();
6893 EVT EltVT = VecVT.getVectorElementType();
6894 unsigned VecSize = VecVT.getSizeInBits();
6895 unsigned EltSize = EltVT.getSizeInBits();
6896 SDLoc SL(Op);
6897
6898 // Specially handle the case of v4i16 with static indexing.
6899 unsigned NumElts = VecVT.getVectorNumElements();
6900 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
6901 if (NumElts == 4 && EltSize == 16 && KIdx) {
6902 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
6903
6904 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6905 DAG.getConstant(0, SL, MVT::i32));
6906 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6907 DAG.getConstant(1, SL, MVT::i32));
6908
6909 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
6910 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
6911
6912 unsigned Idx = KIdx->getZExtValue();
6913 bool InsertLo = Idx < 2;
6914 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
6915 InsertLo ? LoVec : HiVec,
6916 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
6917 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
6918
6919 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
6920
6921 SDValue Concat = InsertLo ?
6922 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
6923 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
6924
6925 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
6926 }
6927
6928 // Static indexing does not lower to stack access, and hence there is no need
6929 // for special custom lowering to avoid stack access.
6930 if (isa<ConstantSDNode>(Idx))
6931 return SDValue();
6932
6933 // Avoid stack access for dynamic indexing by custom lowering to
6934 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
6935
6936 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
6937
6938 MVT IntVT = MVT::getIntegerVT(VecSize);
6939
6940 // Convert vector index to bit-index and get the required bit mask.
6941 assert(isPowerOf2_32(EltSize));
6942 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
6943 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6944 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6945 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
6946 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
6947
6948 // 1. Create a congruent vector with the target value in each element.
6949 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
6950 DAG.getSplatBuildVector(VecVT, SL, InsVal));
6951
6952 // 2. Mask off all other indices except the required index within (1).
6953 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
6954
6955 // 3. Mask off the required index within the target vector.
6956 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6957 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
6958 DAG.getNOT(SL, BFM, IntVT), BCVec);
6959
6960 // 4. Get (2) and (3) ORed into the target vector.
6961 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
6962
6963 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
6964}
6965
6966SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
6967 SelectionDAG &DAG) const {
6968 SDLoc SL(Op);
6969
6970 EVT ResultVT = Op.getValueType();
6971 SDValue Vec = Op.getOperand(0);
6972 SDValue Idx = Op.getOperand(1);
6973 EVT VecVT = Vec.getValueType();
6974 unsigned VecSize = VecVT.getSizeInBits();
6975 EVT EltVT = VecVT.getVectorElementType();
6976
6977 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
6978
6979 // Make sure we do any optimizations that will make it easier to fold
6980 // source modifiers before obscuring it with bit operations.
6981
6982 // XXX - Why doesn't this get called when vector_shuffle is expanded?
6983 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
6984 return Combined;
6985
6986 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
6987 SDValue Lo, Hi;
6988 EVT LoVT, HiVT;
6989 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
6990
6991 if (VecSize == 128) {
6992 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
6993 Lo = DAG.getBitcast(LoVT,
6994 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6995 DAG.getConstant(0, SL, MVT::i32)));
6996 Hi = DAG.getBitcast(HiVT,
6997 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6998 DAG.getConstant(1, SL, MVT::i32)));
6999 } else if (VecSize == 256) {
7000 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7001 SDValue Parts[4];
7002 for (unsigned P = 0; P < 4; ++P) {
7003 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7004 DAG.getConstant(P, SL, MVT::i32));
7005 }
7006
7007 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7008 Parts[0], Parts[1]));
7009 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7010 Parts[2], Parts[3]));
7011 } else {
7012 assert(VecSize == 512);
7013
7014 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7015 SDValue Parts[8];
7016 for (unsigned P = 0; P < 8; ++P) {
7017 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7018 DAG.getConstant(P, SL, MVT::i32));
7019 }
7020
7021 Lo = DAG.getBitcast(LoVT,
7022 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7023 Parts[0], Parts[1], Parts[2], Parts[3]));
7024 Hi = DAG.getBitcast(HiVT,
7025 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
 7026 Parts[4], Parts[5], Parts[6], Parts[7]));
7027 }
7028
7029 EVT IdxVT = Idx.getValueType();
7030 unsigned NElem = VecVT.getVectorNumElements();
7031 assert(isPowerOf2_32(NElem));
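  // Select the half that holds the element: indices in [0, NElem/2) read from
  // Lo and the rest from Hi, and the index is re-based into the chosen half by
  // masking with NElem/2 - 1.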
7032 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7033 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7034 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7035 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7036 }
7037
7038 assert(VecSize <= 64);
7039
7040 MVT IntVT = MVT::getIntegerVT(VecSize);
7041
7042 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7043 SDValue VecBC = peekThroughBitcasts(Vec);
7044 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7045 SDValue Src = VecBC.getOperand(0);
7046 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7047 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7048 }
7049
7050 unsigned EltSize = EltVT.getSizeInBits();
7051 assert(isPowerOf2_32(EltSize));
7052
7053 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7054
7055 // Convert vector index to bit-index (* EltSize)
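  // e.g. element 3 of a v4i16 lives in bits [63:48] of the 64-bit integer, so
  // ScaledIdx = 3 << 4 = 48 and the SRL below shifts those bits down to [15:0]
  // before the final truncate/bitcast.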
7056 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7057
7058 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7059 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7060
7061 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7062 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7063 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7064 }
7065
7066 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7067}
7068
7069static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7070 assert(Elt % 2 == 0);
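  // A pair is contiguous when it reads two consecutive source elements
  // starting at an even position, e.g. mask values <2,3> qualify but <1,2>
  // and <3,2> do not; such a pair can be taken as a whole packed subvector.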
7071 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7072}
7073
7074SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7075 SelectionDAG &DAG) const {
7076 SDLoc SL(Op);
7077 EVT ResultVT = Op.getValueType();
7078 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7079
7080 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7081 EVT EltVT = PackVT.getVectorElementType();
7082 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7083
7084 // vector_shuffle <0,1,6,7> lhs, rhs
7085 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7086 //
7087 // vector_shuffle <6,7,2,3> lhs, rhs
7088 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7089 //
7090 // vector_shuffle <6,7,0,1> lhs, rhs
7091 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7092
7093 // Avoid scalarizing when both halves are reading from consecutive elements.
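  // Each output pair is either two consecutive elements of one input, handled
  // with a single EXTRACT_SUBVECTOR, or is assembled from two independent
  // scalar extracts packed back into a two-element build_vector.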
7095 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7096 if (elementPairIsContiguous(SVN->getMask(), I)) {
7097 const int Idx = SVN->getMaskElt(I);
7098 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7099 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7100 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7101 PackVT, SVN->getOperand(VecIdx),
7102 DAG.getConstant(EltIdx, SL, MVT::i32));
7103 Pieces.push_back(SubVec);
7104 } else {
7105 const int Idx0 = SVN->getMaskElt(I);
7106 const int Idx1 = SVN->getMaskElt(I + 1);
7107 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7108 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7109 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7110 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7111
7112 SDValue Vec0 = SVN->getOperand(VecIdx0);
7113 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7114 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7115
7116 SDValue Vec1 = SVN->getOperand(VecIdx1);
7117 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7118 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7119 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7120 }
7121 }
7122
7123 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7124}
7125
7126SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7127 SelectionDAG &DAG) const {
7128 SDValue SVal = Op.getOperand(0);
7129 EVT ResultVT = Op.getValueType();
7130 EVT SValVT = SVal.getValueType();
7131 SDValue UndefVal = DAG.getUNDEF(SValVT);
7132 SDLoc SL(Op);
7133
7135 VElts.push_back(SVal);
7136 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7137 VElts.push_back(UndefVal);
7138
7139 return DAG.getBuildVector(ResultVT, SL, VElts);
7140}
7141
7142SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7143 SelectionDAG &DAG) const {
7144 SDLoc SL(Op);
7145 EVT VT = Op.getValueType();
7146
7147 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7148 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7150 VT.getVectorNumElements() / 2);
7151 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7152
7153 // Turn into pair of packed build_vectors.
7154 // TODO: Special case for constants that can be materialized with s_mov_b64.
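    // e.g. a v4f16 build_vector <a, b, c, d> becomes
    //   (v4f16 (bitcast (v2i32 (build_vector (i32 (bitcast (v2f16 <a, b>))),
    //                                        (i32 (bitcast (v2f16 <c, d>)))))))
    // so each half can later be handled as a single packed dword.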
7155 SmallVector<SDValue, 4> LoOps, HiOps;
7156 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7157 LoOps.push_back(Op.getOperand(I));
7158 HiOps.push_back(Op.getOperand(I + E));
7159 }
7160 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7161 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7162
7163 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7164 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7165
7166 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7167 { CastLo, CastHi });
7168 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7169 }
7170
7171 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7173 VT.getVectorNumElements() / 4);
7174 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7175
7176 SmallVector<SDValue, 4> Parts[4];
7177 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7178 for (unsigned P = 0; P < 4; ++P)
7179 Parts[P].push_back(Op.getOperand(I + P * E));
7180 }
7181 SDValue Casts[4];
7182 for (unsigned P = 0; P < 4; ++P) {
7183 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7184 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7185 }
7186
7187 SDValue Blend =
7188 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7189 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7190 }
7191
7192 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7194 VT.getVectorNumElements() / 8);
7195 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7196
7197 SmallVector<SDValue, 8> Parts[8];
7198 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7199 for (unsigned P = 0; P < 8; ++P)
7200 Parts[P].push_back(Op.getOperand(I + P * E));
7201 }
7202 SDValue Casts[8];
7203 for (unsigned P = 0; P < 8; ++P) {
7204 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7205 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7206 }
7207
7208 SDValue Blend =
7209 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7210 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7211 }
7212
7213 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7214 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7215
7216 SDValue Lo = Op.getOperand(0);
7217 SDValue Hi = Op.getOperand(1);
7218
7219 // Avoid adding defined bits with the zero_extend.
7220 if (Hi.isUndef()) {
7221 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7222 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7223 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7224 }
7225
7226 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7227 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7228
7229 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7230 DAG.getConstant(16, SL, MVT::i32));
7231 if (Lo.isUndef())
7232 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7233
7234 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7235 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7236
7237 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7238 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7239}
7240
 7241 bool
 7242 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7243 // OSes that use ELF REL relocations (instead of RELA) can only store a
7244 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7245 // which can create arbitrary 64-bit addends. (This is only a problem for
7246 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7247 // the high 32 bits of the addend.)
7248 //
7249 // This should be kept in sync with how HasRelocationAddend is initialized in
7250 // the constructor of ELFAMDGPUAsmBackend.
7251 if (!Subtarget->isAmdHsaOS())
7252 return false;
7253
7254 // We can fold offsets for anything that doesn't require a GOT relocation.
7255 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
 7256 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
 7257 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
 7258 !shouldEmitGOTReloc(GA->getGlobal());
 7259}
7260
 7261 static SDValue
 7262 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7263 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7264 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7265 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7266 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7267 // lowered to the following code sequence:
7268 //
7269 // For constant address space:
7270 // s_getpc_b64 s[0:1]
7271 // s_add_u32 s0, s0, $symbol
7272 // s_addc_u32 s1, s1, 0
7273 //
7274 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7275 // a fixup or relocation is emitted to replace $symbol with a literal
7276 // constant, which is a pc-relative offset from the encoding of the $symbol
7277 // operand to the global variable.
7278 //
7279 // For global address space:
7280 // s_getpc_b64 s[0:1]
7281 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7282 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7283 //
7284 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7285 // fixups or relocations are emitted to replace $symbol@*@lo and
7286 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7287 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7288 // operand to the global variable.
7289 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7290 SDValue PtrHi;
7291 if (GAFlags == SIInstrInfo::MO_NONE)
7292 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7293 else
7294 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7295 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7296}
7297
7298SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7299 SDValue Op,
7300 SelectionDAG &DAG) const {
7301 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7302 SDLoc DL(GSD);
7303 EVT PtrVT = Op.getValueType();
7304
7305 const GlobalValue *GV = GSD->getGlobal();
7311 GV->hasExternalLinkage()) {
7312 Type *Ty = GV->getValueType();
 7313 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
 7314 // zero-sized type in other languages to declare dynamic shared
 7315 // memory whose size is not known at compile time. It will be
 7316 // allocated by the runtime and placed directly after the statically
 7317 // allocated ones. All such arrays share the same offset.
7318 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7319 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7320 // Adjust alignment for that dynamic shared memory array.
7322 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7323 MFI->setUsesDynamicLDS(true);
7324 return SDValue(
7325 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7326 }
7327 }
7329 }
7330
7332 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7334 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7335 }
7336
7337 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7338 SDValue AddrLo = DAG.getTargetGlobalAddress(
7339 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7340 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7341
7342 SDValue AddrHi = DAG.getTargetGlobalAddress(
7343 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7344 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7345
7346 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7347 }
7348
7349 if (shouldEmitFixup(GV))
7350 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7351
7352 if (shouldEmitPCReloc(GV))
7353 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7355
7356 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7358
7359 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7361 const DataLayout &DataLayout = DAG.getDataLayout();
7362 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7363 MachinePointerInfo PtrInfo
7365
7366 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7369}
7370
7372 const SDLoc &DL, SDValue V) const {
7373 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7374 // the destination register.
7375 //
7376 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7377 // so we will end up with redundant moves to m0.
7378 //
7379 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7380
7381 // A Null SDValue creates a glue result.
7382 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7383 V, Chain);
7384 return SDValue(M0, 0);
7385}
7386
7387SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7388 SDValue Op,
7389 MVT VT,
7390 unsigned Offset) const {
7391 SDLoc SL(Op);
7392 SDValue Param = lowerKernargMemParameter(
7393 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
 7394 // The local size values will have the high 16 bits as zero.
7395 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7396 DAG.getValueType(VT));
7397}
7398
7400 EVT VT) {
7402 "non-hsa intrinsic with hsa target",
7403 DL.getDebugLoc());
7404 DAG.getContext()->diagnose(BadIntrin);
7405 return DAG.getUNDEF(VT);
7406}
7407
7409 EVT VT) {
7411 "intrinsic not supported on subtarget",
7412 DL.getDebugLoc());
7413 DAG.getContext()->diagnose(BadIntrin);
7414 return DAG.getUNDEF(VT);
7415}
7416
7418 ArrayRef<SDValue> Elts) {
7419 assert(!Elts.empty());
7420 MVT Type;
7421 unsigned NumElts = Elts.size();
7422
7423 if (NumElts <= 12) {
7424 Type = MVT::getVectorVT(MVT::f32, NumElts);
7425 } else {
7426 assert(Elts.size() <= 16);
7427 Type = MVT::v16f32;
7428 NumElts = 16;
7429 }
7430
7431 SmallVector<SDValue, 16> VecElts(NumElts);
7432 for (unsigned i = 0; i < Elts.size(); ++i) {
7433 SDValue Elt = Elts[i];
7434 if (Elt.getValueType() != MVT::f32)
7435 Elt = DAG.getBitcast(MVT::f32, Elt);
7436 VecElts[i] = Elt;
7437 }
7438 for (unsigned i = Elts.size(); i < NumElts; ++i)
7439 VecElts[i] = DAG.getUNDEF(MVT::f32);
7440
7441 if (NumElts == 1)
7442 return VecElts[0];
7443 return DAG.getBuildVector(Type, DL, VecElts);
7444}
7445
7446static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7447 SDValue Src, int ExtraElts) {
7448 EVT SrcVT = Src.getValueType();
7449
7451
7452 if (SrcVT.isVector())
7453 DAG.ExtractVectorElements(Src, Elts);
7454 else
7455 Elts.push_back(Src);
7456
7457 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7458 while (ExtraElts--)
7459 Elts.push_back(Undef);
7460
7461 return DAG.getBuildVector(CastVT, DL, Elts);
7462}
7463
 7464 // Reconstruct the required return value for an image load intrinsic.
 7465 // This is more complicated due to the optional use of TexFailCtrl, which means
 7466 // the required return type is an aggregate.
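// The machine node produces a flat vector of dwords: the data dwords selected
// by the dmask come first, and when TexFailCtrl is in use one extra dword of
// TFE/LWE status follows them, which is split back out here into the second
// member of the aggregate.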
 7467 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
 7468 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7469 bool Unpacked, bool IsD16, int DMaskPop,
7470 int NumVDataDwords, bool IsAtomicPacked16Bit,
7471 const SDLoc &DL) {
 7472 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7473 EVT ReqRetVT = ResultTypes[0];
7474 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7475 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7476 ? (ReqRetNumElts + 1) / 2
7477 : ReqRetNumElts;
7478
7479 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7480 DMaskPop : (DMaskPop + 1) / 2;
7481
7482 MVT DataDwordVT = NumDataDwords == 1 ?
7483 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7484
7485 MVT MaskPopVT = MaskPopDwords == 1 ?
7486 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7487
7488 SDValue Data(Result, 0);
7489 SDValue TexFail;
7490
7491 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7492 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7493 if (MaskPopVT.isVector()) {
7494 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7495 SDValue(Result, 0), ZeroIdx);
7496 } else {
7497 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7498 SDValue(Result, 0), ZeroIdx);
7499 }
7500 }
7501
7502 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7503 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7504 NumDataDwords - MaskPopDwords);
7505
7506 if (IsD16)
7507 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7508
7509 EVT LegalReqRetVT = ReqRetVT;
7510 if (!ReqRetVT.isVector()) {
7511 if (!Data.getValueType().isInteger())
7512 Data = DAG.getNode(ISD::BITCAST, DL,
7513 Data.getValueType().changeTypeToInteger(), Data);
7514 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7515 } else {
7516 // We need to widen the return vector to a legal type
7517 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7518 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7519 LegalReqRetVT =
7521 ReqRetVT.getVectorNumElements() + 1);
7522 }
7523 }
7524 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7525
7526 if (IsTexFail) {
7527 TexFail =
7528 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7529 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7530
7531 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7532 }
7533
7534 if (Result->getNumValues() == 1)
7535 return Data;
7536
7537 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7538}
7539
7540static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7541 SDValue *LWE, bool &IsTexFail) {
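  // TexFailCtrl is an immediate; bit 0 requests TFE and bit 1 requests LWE.
  // Any other set bit is invalid, and returning false here makes the caller
  // leave the intrinsic unlowered.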
7542 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7543
7544 uint64_t Value = TexFailCtrlConst->getZExtValue();
7545 if (Value) {
7546 IsTexFail = true;
7547 }
7548
7549 SDLoc DL(TexFailCtrlConst);
7550 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7551 Value &= ~(uint64_t)0x1;
7552 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7553 Value &= ~(uint64_t)0x2;
7554
7555 return Value == 0;
7556}
7557
7559 MVT PackVectorVT,
7560 SmallVectorImpl<SDValue> &PackedAddrs,
7561 unsigned DimIdx, unsigned EndIdx,
7562 unsigned NumGradients) {
7563 SDLoc DL(Op);
7564 for (unsigned I = DimIdx; I < EndIdx; I++) {
7565 SDValue Addr = Op.getOperand(I);
7566
7567 // Gradients are packed with undef for each coordinate.
7568 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7569 // 1D: undef,dx/dh; undef,dx/dv
7570 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7571 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7572 if (((I + 1) >= EndIdx) ||
7573 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7574 I == DimIdx + NumGradients - 1))) {
7575 if (Addr.getValueType() != MVT::i16)
7576 Addr = DAG.getBitcast(MVT::i16, Addr);
7577 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7578 } else {
7579 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7580 I++;
7581 }
7582 Addr = DAG.getBitcast(MVT::f32, Addr);
7583 PackedAddrs.push_back(Addr);
7584 }
7585}
7586
7587SDValue SITargetLowering::lowerImage(SDValue Op,
7589 SelectionDAG &DAG, bool WithChain) const {
7590 SDLoc DL(Op);
7592 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7593 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7595 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7596 unsigned IntrOpcode = Intr->BaseOpcode;
7597 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7598 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7599 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7600
7601 SmallVector<EVT, 3> ResultTypes(Op->values());
7602 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7603 bool IsD16 = false;
7604 bool IsG16 = false;
7605 bool IsA16 = false;
7606 SDValue VData;
7607 int NumVDataDwords;
7608 bool AdjustRetType = false;
7609 bool IsAtomicPacked16Bit = false;
7610
7611 // Offset of intrinsic arguments
7612 const unsigned ArgOffset = WithChain ? 2 : 1;
7613
7614 unsigned DMask;
7615 unsigned DMaskLanes = 0;
7616
7617 if (BaseOpcode->Atomic) {
7618 VData = Op.getOperand(2);
7619
7620 IsAtomicPacked16Bit =
7621 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7622 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7623
7624 bool Is64Bit = VData.getValueSizeInBits() == 64;
7625 if (BaseOpcode->AtomicX2) {
7626 SDValue VData2 = Op.getOperand(3);
7627 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7628 {VData, VData2});
7629 if (Is64Bit)
7630 VData = DAG.getBitcast(MVT::v4i32, VData);
7631
7632 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7633 DMask = Is64Bit ? 0xf : 0x3;
7634 NumVDataDwords = Is64Bit ? 4 : 2;
7635 } else {
7636 DMask = Is64Bit ? 0x3 : 0x1;
7637 NumVDataDwords = Is64Bit ? 2 : 1;
7638 }
7639 } else {
7640 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7641 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7642
7643 if (BaseOpcode->Store) {
7644 VData = Op.getOperand(2);
7645
7646 MVT StoreVT = VData.getSimpleValueType();
7647 if (StoreVT.getScalarType() == MVT::f16) {
7648 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7649 return Op; // D16 is unsupported for this instruction
7650
7651 IsD16 = true;
7652 VData = handleD16VData(VData, DAG, true);
7653 }
7654
7655 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7656 } else {
 7657 // Work out the number of dwords based on the dmask popcount and underlying type
7658 // and whether packing is supported.
7659 MVT LoadVT = ResultTypes[0].getSimpleVT();
7660 if (LoadVT.getScalarType() == MVT::f16) {
7661 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7662 return Op; // D16 is unsupported for this instruction
7663
7664 IsD16 = true;
7665 }
7666
7667 // Confirm that the return type is large enough for the dmask specified
7668 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7669 (!LoadVT.isVector() && DMaskLanes > 1))
7670 return Op;
7671
 7672 // The sq block of gfx8 and gfx9 does not estimate register use correctly
7673 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7674 // instructions.
7675 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7676 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7677 NumVDataDwords = (DMaskLanes + 1) / 2;
7678 else
7679 NumVDataDwords = DMaskLanes;
7680
7681 AdjustRetType = true;
7682 }
7683 }
7684
7685 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7687
7688 // Check for 16 bit addresses or derivatives and pack if true.
7689 MVT VAddrVT =
7690 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7691 MVT VAddrScalarVT = VAddrVT.getScalarType();
7692 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7693 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7694
7695 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7696 VAddrScalarVT = VAddrVT.getScalarType();
7697 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7698 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7699
7700 // Push back extra arguments.
7701 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7702 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7703 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
 7704 // Special handling of bias when A16 is on: bias is of type half but
 7705 // occupies a full 32-bit dword.
7706 SDValue Bias = DAG.getBuildVector(
7707 MVT::v2f16, DL,
7708 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7709 VAddrs.push_back(Bias);
7710 } else {
7711 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7712 "Bias needs to be converted to 16 bit in A16 mode");
7713 VAddrs.push_back(Op.getOperand(ArgOffset + I));
7714 }
7715 }
7716
7717 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
 7718 // 16 bit gradients are supported, but are tied to the A16 control,
 7719 // so both gradients and addresses must be 16 bit.
7720 LLVM_DEBUG(
7721 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7722 "require 16 bit args for both gradients and addresses");
7723 return Op;
7724 }
7725
7726 if (IsA16) {
7727 if (!ST->hasA16()) {
7728 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7729 "support 16 bit addresses\n");
7730 return Op;
7731 }
7732 }
7733
 7734 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
 7735 // is set then we have to compress/pack operands (address, gradient,
 7736 // or both).
 7737 // In the case where A16 and gradients are tied (no G16 support), we
 7738 // have already verified that both IsA16 and IsG16 are true.
7739 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7740 // Activate g16
7741 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
7743 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7744 }
7745
7746 // Add gradients (packed or unpacked)
7747 if (IsG16) {
7748 // Pack the gradients
7749 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7750 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
7751 ArgOffset + Intr->GradientStart,
7752 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7753 } else {
7754 for (unsigned I = ArgOffset + Intr->GradientStart;
7755 I < ArgOffset + Intr->CoordStart; I++)
7756 VAddrs.push_back(Op.getOperand(I));
7757 }
7758
7759 // Add addresses (packed or unpacked)
7760 if (IsA16) {
7761 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
7762 ArgOffset + Intr->CoordStart, VAddrEnd,
7763 0 /* No gradients */);
7764 } else {
7765 // Add uncompressed address
7766 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7767 VAddrs.push_back(Op.getOperand(I));
7768 }
7769
7770 // If the register allocator cannot place the address registers contiguously
7771 // without introducing moves, then using the non-sequential address encoding
7772 // is always preferable, since it saves VALU instructions and is usually a
7773 // wash in terms of code size or even better.
7774 //
7775 // However, we currently have no way of hinting to the register allocator that
7776 // MIMG addresses should be placed contiguously when it is possible to do so,
7777 // so force non-NSA for the common 2-address case as a heuristic.
7778 //
7779 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7780 // allocation when possible.
7781 //
7782 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7783 // set of the remaining addresses.
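 // With partial NSA the first NSAMaxSize - 1 addresses stay as individual
 // registers and everything from that point on is packed into one contiguous
 // vector that becomes the final address operand.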
7784 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7785 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7786 const bool UseNSA = ST->hasNSAEncoding() &&
7787 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7788 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7789 const bool UsePartialNSA =
7790 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
7791
7792 SDValue VAddr;
7793 if (UsePartialNSA) {
7794 VAddr = getBuildDwordsVector(DAG, DL,
7795 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7796 }
7797 else if (!UseNSA) {
7798 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
7799 }
7800
7801 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7802 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7803 SDValue Unorm;
7804 if (!BaseOpcode->Sampler) {
7805 Unorm = True;
7806 } else {
7807 uint64_t UnormConst =
7808 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
7809
7810 Unorm = UnormConst ? True : False;
7811 }
7812
7813 SDValue TFE;
7814 SDValue LWE;
7815 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7816 bool IsTexFail = false;
7817 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7818 return Op;
7819
7820 if (IsTexFail) {
7821 if (!DMaskLanes) {
 7822 // Expecting to get an error flag since TFC is on and dmask is 0.
 7823 // Force dmask to be at least 1, otherwise the instruction will fail.
7824 DMask = 0x1;
7825 DMaskLanes = 1;
7826 NumVDataDwords = 1;
7827 }
7828 NumVDataDwords += 1;
7829 AdjustRetType = true;
7830 }
7831
 7832 // Something earlier has tagged the return type as needing adjustment.
 7833 // This happens if the instruction is a load or has set TexFailCtrl flags.
7834 if (AdjustRetType) {
7835 // NumVDataDwords reflects the true number of dwords required in the return type
7836 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7837 // This is a no-op load. This can be eliminated
7838 SDValue Undef = DAG.getUNDEF(Op.getValueType());
7839 if (isa<MemSDNode>(Op))
7840 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
7841 return Undef;
7842 }
7843
7844 EVT NewVT = NumVDataDwords > 1 ?
7845 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7846 : MVT::i32;
7847
7848 ResultTypes[0] = NewVT;
7849 if (ResultTypes.size() == 3) {
 7850 // The original result was an aggregate type used for TexFailCtrl results.
 7851 // The actual instruction returns as a vector type, which has now been
 7852 // created. Remove the aggregate result.
7853 ResultTypes.erase(&ResultTypes[1]);
7854 }
7855 }
7856
7857 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7858 if (BaseOpcode->Atomic)
7859 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7860 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
7862 return Op;
7863
7865 if (BaseOpcode->Store || BaseOpcode->Atomic)
7866 Ops.push_back(VData); // vdata
7867 if (UsePartialNSA) {
7868 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
7869 Ops.push_back(VAddr);
7870 }
7871 else if (UseNSA)
7872 append_range(Ops, VAddrs);
7873 else
7874 Ops.push_back(VAddr);
7875 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
7876 if (BaseOpcode->Sampler)
7877 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
7878 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7879 if (IsGFX10Plus)
7880 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7881 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7882 Ops.push_back(Unorm);
7883 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7884 Ops.push_back(IsA16 && // r128, a16 for gfx9
7885 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7886 if (IsGFX10Plus)
7887 Ops.push_back(IsA16 ? True : False);
7888 if (!Subtarget->hasGFX90AInsts()) {
 7889 Ops.push_back(TFE); // tfe
7890 } else if (TFE->getAsZExtVal()) {
7891 report_fatal_error("TFE is not supported on this GPU");
7892 }
7893 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7894 Ops.push_back(LWE); // lwe
7895 if (!IsGFX10Plus)
7896 Ops.push_back(DimInfo->DA ? True : False);
7897 if (BaseOpcode->HasD16)
7898 Ops.push_back(IsD16 ? True : False);
7899 if (isa<MemSDNode>(Op))
7900 Ops.push_back(Op.getOperand(0)); // chain
7901
7902 int NumVAddrDwords =
7903 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
7904 int Opcode = -1;
7905
7906 if (IsGFX12Plus) {
7907 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
7908 NumVDataDwords, NumVAddrDwords);
7909 } else if (IsGFX11Plus) {
7910 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7911 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7912 : AMDGPU::MIMGEncGfx11Default,
7913 NumVDataDwords, NumVAddrDwords);
7914 } else if (IsGFX10Plus) {
7915 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7916 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7917 : AMDGPU::MIMGEncGfx10Default,
7918 NumVDataDwords, NumVAddrDwords);
7919 } else {
7920 if (Subtarget->hasGFX90AInsts()) {
7921 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
7922 NumVDataDwords, NumVAddrDwords);
7923 if (Opcode == -1)
7925 "requested image instruction is not supported on this GPU");
7926 }
7927 if (Opcode == -1 &&
7929 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
7930 NumVDataDwords, NumVAddrDwords);
7931 if (Opcode == -1)
7932 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
7933 NumVDataDwords, NumVAddrDwords);
7934 }
7935 if (Opcode == -1)
7936 return Op;
7937
7938 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
7939 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
7940 MachineMemOperand *MemRef = MemOp->getMemOperand();
7941 DAG.setNodeMemRefs(NewNode, {MemRef});
7942 }
7943
7944 if (BaseOpcode->AtomicX2) {
7946 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
7947 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
7948 }
7949 if (BaseOpcode->Store)
7950 return SDValue(NewNode, 0);
7951 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
7952 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
7953 NumVDataDwords, IsAtomicPacked16Bit, DL);
7954}
7955
7956SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
7957 SDValue Offset, SDValue CachePolicy,
7958 SelectionDAG &DAG) const {
7960
7961 const DataLayout &DataLayout = DAG.getDataLayout();
7962 Align Alignment =
7964
7969 VT.getStoreSize(), Alignment);
7970
7971 if (!Offset->isDivergent()) {
7972 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
7973
7974 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
7975 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
7976 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
7977 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
7978 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
7979 SDValue BufferLoad =
7981 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7982 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7983 }
7984
7985 // Widen vec3 load to vec4.
7986 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
7987 !Subtarget->hasScalarDwordx3Loads()) {
7988 EVT WidenedVT =
7990 auto WidenedOp = DAG.getMemIntrinsicNode(
7991 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
7992 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
7993 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
7994 DAG.getVectorIdxConstant(0, DL));
7995 return Subvector;
7996 }
7997
7999 DAG.getVTList(VT), Ops, VT, MMO);
8000 }
8001
8002 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8003 // assume that the buffer is unswizzled.
8004 SDValue Ops[] = {
8005 DAG.getEntryNode(), // Chain
8006 Rsrc, // rsrc
8007 DAG.getConstant(0, DL, MVT::i32), // vindex
8008 {}, // voffset
8009 {}, // soffset
8010 {}, // offset
8011 CachePolicy, // cachepolicy
8012 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8013 };
8014 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8015 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8016 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8017 }
8018
8020 unsigned NumLoads = 1;
8021 MVT LoadVT = VT.getSimpleVT();
8022 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8023 assert((LoadVT.getScalarType() == MVT::i32 ||
8024 LoadVT.getScalarType() == MVT::f32));
8025
8026 if (NumElts == 8 || NumElts == 16) {
8027 NumLoads = NumElts / 4;
8028 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8029 }
8030
8031 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8032
8033 // Use the alignment to ensure that the required offsets will fit into the
8034 // immediate offsets.
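  // e.g. a 64-byte v16i32/v16f32 result is fetched as four dwordx4 loads at
  // immediate offsets +0, +16, +32 and +48 and reassembled with the
  // CONCAT_VECTORS below.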
8035 setBufferOffsets(Offset, DAG, &Ops[3],
8036 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8037
8038 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8039 for (unsigned i = 0; i < NumLoads; ++i) {
8040 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8041 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8042 LoadVT, MMO, DAG));
8043 }
8044
8045 if (NumElts == 8 || NumElts == 16)
8046 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8047
8048 return Loads[0];
8049}
8050
8051SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8052 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
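  // i.e. the result is (TTMP8 >> 25) & 0x1f, extracted below with a 5-bit
  // BFE starting at bit 25.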
8053 if (!Subtarget->hasArchitectedSGPRs())
8054 return {};
8055 SDLoc SL(Op);
8056 MVT VT = MVT::i32;
8057 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8058 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8059 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8060}
8061
8062SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8063 unsigned Dim,
8064 const ArgDescriptor &Arg) const {
8065 SDLoc SL(Op);
8067 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8068 if (MaxID == 0)
8069 return DAG.getConstant(0, SL, MVT::i32);
8070
8071 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8072 SDLoc(DAG.getEntryNode()), Arg);
8073
8074 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8075 // masking operations anyway.
8076 //
8077 // TODO: We could assert the top bit is 0 for the source copy.
8078 if (Arg.isMasked())
8079 return Val;
8080
8081 // Preserve the known bits after expansion to a copy.
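  // The AssertZext advertises that only the low bits needed to represent
  // MaxID can be nonzero, so later combines can drop redundant masking of
  // this value.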
8083 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8084 DAG.getValueType(SmallVT));
8085}
8086
8087SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8088 SelectionDAG &DAG) const {
8090 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8091
8092 EVT VT = Op.getValueType();
8093 SDLoc DL(Op);
8094 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8095
8096 // TODO: Should this propagate fast-math-flags?
8097
8098 switch (IntrinsicID) {
8099 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8100 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8101 return emitNonHSAIntrinsicError(DAG, DL, VT);
8102 return getPreloadedValue(DAG, *MFI, VT,
8104 }
8105 case Intrinsic::amdgcn_dispatch_ptr:
8106 case Intrinsic::amdgcn_queue_ptr: {
8107 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8108 DiagnosticInfoUnsupported BadIntrin(
8109 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8110 DL.getDebugLoc());
8111 DAG.getContext()->diagnose(BadIntrin);
8112 return DAG.getUNDEF(VT);
8113 }
8114
8115 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8117 return getPreloadedValue(DAG, *MFI, VT, RegID);
8118 }
8119 case Intrinsic::amdgcn_implicitarg_ptr: {
8120 if (MFI->isEntryFunction())
8121 return getImplicitArgPtr(DAG, DL);
8122 return getPreloadedValue(DAG, *MFI, VT,
8124 }
8125 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8127 // This only makes sense to call in a kernel, so just lower to null.
8128 return DAG.getConstant(0, DL, VT);
8129 }
8130
8131 return getPreloadedValue(DAG, *MFI, VT,
8133 }
8134 case Intrinsic::amdgcn_dispatch_id: {
8135 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8136 }
8137 case Intrinsic::amdgcn_rcp:
8138 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8139 case Intrinsic::amdgcn_rsq:
8140 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8141 case Intrinsic::amdgcn_rsq_legacy:
8143 return emitRemovedIntrinsicError(DAG, DL, VT);
8144 return SDValue();
8145 case Intrinsic::amdgcn_rcp_legacy:
8147 return emitRemovedIntrinsicError(DAG, DL, VT);
8148 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8149 case Intrinsic::amdgcn_rsq_clamp: {
8151 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8152
8153 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8156
8157 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8158 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8159 DAG.getConstantFP(Max, DL, VT));
8160 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8161 DAG.getConstantFP(Min, DL, VT));
8162 }
8163 case Intrinsic::r600_read_ngroups_x:
8164 if (Subtarget->isAmdHsaOS())
8165 return emitNonHSAIntrinsicError(DAG, DL, VT);
8166
8167 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8169 false);
8170 case Intrinsic::r600_read_ngroups_y:
8171 if (Subtarget->isAmdHsaOS())
8172 return emitNonHSAIntrinsicError(DAG, DL, VT);
8173
8174 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8176 false);
8177 case Intrinsic::r600_read_ngroups_z:
8178 if (Subtarget->isAmdHsaOS())
8179 return emitNonHSAIntrinsicError(DAG, DL, VT);
8180
8181 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8183 false);
8184 case Intrinsic::r600_read_global_size_x:
8185 if (Subtarget->isAmdHsaOS())
8186 return emitNonHSAIntrinsicError(DAG, DL, VT);
8187
8188 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8190 Align(4), false);
8191 case Intrinsic::r600_read_global_size_y:
8192 if (Subtarget->isAmdHsaOS())
8193 return emitNonHSAIntrinsicError(DAG, DL, VT);
8194
8195 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8197 Align(4), false);
8198 case Intrinsic::r600_read_global_size_z:
8199 if (Subtarget->isAmdHsaOS())
8200 return emitNonHSAIntrinsicError(DAG, DL, VT);
8201
8202 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8204 Align(4), false);
8205 case Intrinsic::r600_read_local_size_x:
8206 if (Subtarget->isAmdHsaOS())
8207 return emitNonHSAIntrinsicError(DAG, DL, VT);
8208
8209 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8211 case Intrinsic::r600_read_local_size_y:
8212 if (Subtarget->isAmdHsaOS())
8213 return emitNonHSAIntrinsicError(DAG, DL, VT);
8214
8215 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8217 case Intrinsic::r600_read_local_size_z:
8218 if (Subtarget->isAmdHsaOS())
8219 return emitNonHSAIntrinsicError(DAG, DL, VT);
8220
8221 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8223 case Intrinsic::amdgcn_workgroup_id_x:
8224 return getPreloadedValue(DAG, *MFI, VT,
8226 case Intrinsic::amdgcn_workgroup_id_y:
8227 return getPreloadedValue(DAG, *MFI, VT,
8229 case Intrinsic::amdgcn_workgroup_id_z:
8230 return getPreloadedValue(DAG, *MFI, VT,
8232 case Intrinsic::amdgcn_wave_id:
8233 return lowerWaveID(DAG, Op);
8234 case Intrinsic::amdgcn_lds_kernel_id: {
8235 if (MFI->isEntryFunction())
8236 return getLDSKernelId(DAG, DL);
8237 return getPreloadedValue(DAG, *MFI, VT,
8239 }
8240 case Intrinsic::amdgcn_workitem_id_x:
8241 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8242 case Intrinsic::amdgcn_workitem_id_y:
8243 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8244 case Intrinsic::amdgcn_workitem_id_z:
8245 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8246 case Intrinsic::amdgcn_wavefrontsize:
8248 SDLoc(Op), MVT::i32);
8249 case Intrinsic::amdgcn_s_buffer_load: {
8250 unsigned CPol = Op.getConstantOperandVal(3);
 8251 // s_buffer_load, because of how it's optimized, can't be volatile,
 8252 // so reject ones with the volatile bit set.
8253 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8256 return Op;
8257 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8258 DAG);
8259 }
8260 case Intrinsic::amdgcn_fdiv_fast:
8261 return lowerFDIV_FAST(Op, DAG);
8262 case Intrinsic::amdgcn_sin:
8263 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8264
8265 case Intrinsic::amdgcn_cos:
8266 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8267
8268 case Intrinsic::amdgcn_mul_u24:
8269 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8270 case Intrinsic::amdgcn_mul_i24:
8271 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8272
8273 case Intrinsic::amdgcn_log_clamp: {
8275 return SDValue();
8276
8277 return emitRemovedIntrinsicError(DAG, DL, VT);
8278 }
8279 case Intrinsic::amdgcn_fract:
8280 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8281
8282 case Intrinsic::amdgcn_class:
8283 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8284 Op.getOperand(1), Op.getOperand(2));
8285 case Intrinsic::amdgcn_div_fmas:
8286 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8287 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8288 Op.getOperand(4));
8289
8290 case Intrinsic::amdgcn_div_fixup:
8291 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8292 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8293
8294 case Intrinsic::amdgcn_div_scale: {
8295 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8296
8297 // Translate to the operands expected by the machine instruction. The
8298 // first parameter must be the same as the first instruction.
8299 SDValue Numerator = Op.getOperand(1);
8300 SDValue Denominator = Op.getOperand(2);
8301
8302 // Note this order is opposite of the machine instruction's operations,
8303 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8304 // intrinsic has the numerator as the first operand to match a normal
8305 // division operation.
8306
8307 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8308
8309 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8310 Denominator, Numerator);
8311 }
8312 case Intrinsic::amdgcn_icmp: {
8313 // There is a Pat that handles this variant, so return it as-is.
8314 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8315 Op.getConstantOperandVal(2) == 0 &&
8316 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8317 return Op;
8318 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8319 }
8320 case Intrinsic::amdgcn_fcmp: {
8321 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8322 }
8323 case Intrinsic::amdgcn_ballot:
8324 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8325 case Intrinsic::amdgcn_fmed3:
8326 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8327 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8328 case Intrinsic::amdgcn_fdot2:
8329 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8330 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8331 Op.getOperand(4));
8332 case Intrinsic::amdgcn_fmul_legacy:
8333 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8334 Op.getOperand(1), Op.getOperand(2));
8335 case Intrinsic::amdgcn_sffbh:
8336 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8337 case Intrinsic::amdgcn_sbfe:
8338 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8339 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8340 case Intrinsic::amdgcn_ubfe:
8341 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8342 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8343 case Intrinsic::amdgcn_cvt_pkrtz:
8344 case Intrinsic::amdgcn_cvt_pknorm_i16:
8345 case Intrinsic::amdgcn_cvt_pknorm_u16:
8346 case Intrinsic::amdgcn_cvt_pk_i16:
8347 case Intrinsic::amdgcn_cvt_pk_u16: {
8348 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8349 EVT VT = Op.getValueType();
8350 unsigned Opcode;
8351
8352 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8354 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8356 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8358 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8360 else
8362
8363 if (isTypeLegal(VT))
8364 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8365
8366 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8367 Op.getOperand(1), Op.getOperand(2));
8368 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8369 }
8370 case Intrinsic::amdgcn_fmad_ftz:
8371 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8372 Op.getOperand(2), Op.getOperand(3));
8373
8374 case Intrinsic::amdgcn_if_break:
8375 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8376 Op->getOperand(1), Op->getOperand(2)), 0);
8377
8378 case Intrinsic::amdgcn_groupstaticsize: {
8380 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8381 return Op;
8382
8383 const Module *M = MF.getFunction().getParent();
8384 const GlobalValue *GV =
8385 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8386 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8388 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8389 }
8390 case Intrinsic::amdgcn_is_shared:
8391 case Intrinsic::amdgcn_is_private: {
8392 SDLoc SL(Op);
8393 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8395 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8396 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8397 Op.getOperand(1));
8398
8399 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8400 DAG.getConstant(1, SL, MVT::i32));
8401 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8402 }
8403 case Intrinsic::amdgcn_perm:
8404 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8405 Op.getOperand(2), Op.getOperand(3));
8406 case Intrinsic::amdgcn_reloc_constant: {
8407 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8408 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8409 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8410 auto RelocSymbol = cast<GlobalVariable>(
8411 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8412 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8414 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8415 }
8416 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8417 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8418 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8419 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8420 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8421 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8422 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8423 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8424 if (Op.getOperand(4).getValueType() == MVT::i32)
8425 return SDValue();
8426
8427 SDLoc SL(Op);
8428 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8429 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8430 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8431 Op.getOperand(3), IndexKeyi32);
8432 }
8433 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8434 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8435 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8436 if (Op.getOperand(6).getValueType() == MVT::i32)
8437 return SDValue();
8438
8439 SDLoc SL(Op);
8440 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8441 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8442 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8443 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8444 IndexKeyi32, Op.getOperand(7)});
8445 }
8446 case Intrinsic::amdgcn_addrspacecast_nonnull:
8447 return lowerADDRSPACECAST(Op, DAG);
8448 default:
8449 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8451 return lowerImage(Op, ImageDimIntr, DAG, false);
8452
8453 return Op;
8454 }
8455}
8456
 8457 // On targets that do not support a constant in the soffset field, turn zero
 8458 // into SGPR_NULL to avoid generating an extra s_mov with zero.
8460 const GCNSubtarget *Subtarget) {
8461 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8462 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8463 return SOffset;
8464}
8465
8466SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8467 SelectionDAG &DAG,
8468 unsigned NewOpcode) const {
8469 SDLoc DL(Op);
8470
8471 SDValue VData = Op.getOperand(2);
8472 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8473 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8474 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8475 SDValue Ops[] = {
8476 Op.getOperand(0), // Chain
8477 VData, // vdata
8478 Rsrc, // rsrc
8479 DAG.getConstant(0, DL, MVT::i32), // vindex
8480 Offsets.first, // voffset
8481 SOffset, // soffset
8482 Offsets.second, // offset
8483 Op.getOperand(6), // cachepolicy
8484 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8485 };
8486
8487 auto *M = cast<MemSDNode>(Op);
8488
8489 EVT MemVT = VData.getValueType();
8490 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8491 M->getMemOperand());
8492}
8493
8494// Return a value to use for the idxen operand by examining the vindex operand.
8495static unsigned getIdxEn(SDValue VIndex) {
8496 // No need to set idxen if vindex is known to be zero.
8497 return isNullConstant(VIndex) ? 0 : 1;
8498}
8499
8500SDValue
8501SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8502 unsigned NewOpcode) const {
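  // Same lowering as the raw variant above, except that the struct form
  // carries a real vindex operand and therefore sets idxen to 1, with the
  // offset, soffset and cachepolicy operands following one position later.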
8503 SDLoc DL(Op);
8504
8505 SDValue VData = Op.getOperand(2);
8506 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8507 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8508 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8509 SDValue Ops[] = {
8510 Op.getOperand(0), // Chain
8511 VData, // vdata
8512 Rsrc, // rsrc
8513 Op.getOperand(4), // vindex
8514 Offsets.first, // voffset
8515 SOffset, // soffset
8516 Offsets.second, // offset
8517 Op.getOperand(7), // cachepolicy
8518 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8519 };
8520
8521 auto *M = cast<MemSDNode>(Op);
8522
8523 EVT MemVT = VData.getValueType();
8524 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8525 M->getMemOperand());
8526}
8527
8528SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8529 SelectionDAG &DAG) const {
8530 unsigned IntrID = Op.getConstantOperandVal(1);
8531 SDLoc DL(Op);
8532
8533 switch (IntrID) {
8534 case Intrinsic::amdgcn_ds_ordered_add:
8535 case Intrinsic::amdgcn_ds_ordered_swap: {
8536 MemSDNode *M = cast<MemSDNode>(Op);
8537 SDValue Chain = M->getOperand(0);
8538 SDValue M0 = M->getOperand(2);
8539 SDValue Value = M->getOperand(3);
8540 unsigned IndexOperand = M->getConstantOperandVal(7);
8541 unsigned WaveRelease = M->getConstantOperandVal(8);
8542 unsigned WaveDone = M->getConstantOperandVal(9);
8543
8544 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8545 IndexOperand &= ~0x3f;
8546 unsigned CountDw = 0;
8547
8548 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8549 CountDw = (IndexOperand >> 24) & 0xf;
8550 IndexOperand &= ~(0xf << 24);
8551
8552 if (CountDw < 1 || CountDw > 4) {
8554 "ds_ordered_count: dword count must be between 1 and 4");
8555 }
8556 }
8557
8558 if (IndexOperand)
8559 report_fatal_error("ds_ordered_count: bad index operand");
8560
8561 if (WaveDone && !WaveRelease)
8562 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8563
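  // The ds_ordered_count offset immediate is assembled as:
  //   offset0      = ordered-count index * 4
  //   offset1[0]   = wave_release
  //   offset1[1]   = wave_done
  //   offset1[3:2] = shader type (pre-GFX11 only)
  //   offset1[4]   = 0 for ordered_add, 1 for ordered_swap
  //   offset1[7:6] = dword count - 1 (GFX10+ only)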
8564 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8565 unsigned ShaderType =
8567 unsigned Offset0 = OrderedCountIndex << 2;
8568 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8569
8570 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8571 Offset1 |= (CountDw - 1) << 6;
8572
8573 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8574 Offset1 |= ShaderType << 2;
8575
8576 unsigned Offset = Offset0 | (Offset1 << 8);
8577
8578 SDValue Ops[] = {
8579 Chain,
8580 Value,
8581 DAG.getTargetConstant(Offset, DL, MVT::i16),
8582 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8583 };
8585 M->getVTList(), Ops, M->getMemoryVT(),
8586 M->getMemOperand());
8587 }
8588 case Intrinsic::amdgcn_ds_fadd: {
8589 MemSDNode *M = cast<MemSDNode>(Op);
8590 unsigned Opc;
8591 switch (IntrID) {
8592 case Intrinsic::amdgcn_ds_fadd:
8594 break;
8595 }
8596
8597 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
8598 M->getOperand(0), M->getOperand(2), M->getOperand(3),
8599 M->getMemOperand());
8600 }
8601 case Intrinsic::amdgcn_ds_fmin:
8602 case Intrinsic::amdgcn_ds_fmax: {
8603 MemSDNode *M = cast<MemSDNode>(Op);
8604 unsigned Opc;
8605 switch (IntrID) {
8606 case Intrinsic::amdgcn_ds_fmin:
8607 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8608 break;
8609 case Intrinsic::amdgcn_ds_fmax:
8610 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8611 break;
8612 default:
8613 llvm_unreachable("Unknown intrinsic!");
8614 }
8615 SDValue Ops[] = {
8616 M->getOperand(0), // Chain
8617 M->getOperand(2), // Ptr
8618 M->getOperand(3) // Value
8619 };
8620
8621 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8622 M->getMemoryVT(), M->getMemOperand());
8623 }
8624 case Intrinsic::amdgcn_buffer_load:
8625 case Intrinsic::amdgcn_buffer_load_format: {
8626 unsigned Glc = Op.getConstantOperandVal(5);
8627 unsigned Slc = Op.getConstantOperandVal(6);
8628 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8629 SDValue Ops[] = {
8630 Op.getOperand(0), // Chain
8631 Op.getOperand(2), // rsrc
8632 Op.getOperand(3), // vindex
8633 SDValue(), // voffset -- will be set by setBufferOffsets
8634 SDValue(), // soffset -- will be set by setBufferOffsets
8635 SDValue(), // offset -- will be set by setBufferOffsets
8636 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8637 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8638 };
8639 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
8640
8641 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8642 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8643
8644 EVT VT = Op.getValueType();
8645 EVT IntVT = VT.changeTypeToInteger();
8646 auto *M = cast<MemSDNode>(Op);
8647 EVT LoadVT = Op.getValueType();
8648
8649 if (LoadVT.getScalarType() == MVT::f16)
8650 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8651 M, DAG, Ops);
8652
8653 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8654 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8655 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8656 M->getMemOperand());
8657
8658 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8659 M->getMemOperand(), DAG);
8660 }
8661 case Intrinsic::amdgcn_raw_buffer_load:
8662 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8663 case Intrinsic::amdgcn_raw_buffer_load_format:
8664 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8665 const bool IsFormat =
8666 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8667 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8668
8669 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8670 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8671 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8672 SDValue Ops[] = {
8673 Op.getOperand(0), // Chain
8674 Rsrc, // rsrc
8675 DAG.getConstant(0, DL, MVT::i32), // vindex
8676 Offsets.first, // voffset
8677 SOffset, // soffset
8678 Offsets.second, // offset
8679 Op.getOperand(5), // cachepolicy, swizzled buffer
8680 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8681 };
8682
8683 auto *M = cast<MemSDNode>(Op);
8684 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8685 }
8686 case Intrinsic::amdgcn_struct_buffer_load:
8687 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8688 case Intrinsic::amdgcn_struct_buffer_load_format:
8689 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8690 const bool IsFormat =
8691 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8692 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8693
8694 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8695 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8696 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8697 SDValue Ops[] = {
8698 Op.getOperand(0), // Chain
8699 Rsrc, // rsrc
8700 Op.getOperand(3), // vindex
8701 Offsets.first, // voffset
8702 SOffset, // soffset
8703 Offsets.second, // offset
8704 Op.getOperand(6), // cachepolicy, swizzled buffer
8705 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8706 };
8707
8708 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8709 }
8710 case Intrinsic::amdgcn_tbuffer_load: {
8711 MemSDNode *M = cast<MemSDNode>(Op);
8712 EVT LoadVT = Op.getValueType();
8713
8714 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8715 unsigned Dfmt = Op.getConstantOperandVal(7);
8716 unsigned Nfmt = Op.getConstantOperandVal(8);
8717 unsigned Glc = Op.getConstantOperandVal(9);
8718 unsigned Slc = Op.getConstantOperandVal(10);
8719 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8720 SDValue Ops[] = {
8721 Op.getOperand(0), // Chain
8722 Op.getOperand(2), // rsrc
8723 Op.getOperand(3), // vindex
8724 Op.getOperand(4), // voffset
8725 SOffset, // soffset
8726 Op.getOperand(6), // offset
8727 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8728 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8729 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8730 };
8731
8732 if (LoadVT.getScalarType() == MVT::f16)
8733 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8734 M, DAG, Ops);
8735 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8736 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8737 DAG);
8738 }
8739 case Intrinsic::amdgcn_raw_tbuffer_load:
8740 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8741 MemSDNode *M = cast<MemSDNode>(Op);
8742 EVT LoadVT = Op.getValueType();
8743 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8744 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8745 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8746
8747 SDValue Ops[] = {
8748 Op.getOperand(0), // Chain
8749 Rsrc, // rsrc
8750 DAG.getConstant(0, DL, MVT::i32), // vindex
8751 Offsets.first, // voffset
8752 SOffset, // soffset
8753 Offsets.second, // offset
8754 Op.getOperand(5), // format
8755 Op.getOperand(6), // cachepolicy, swizzled buffer
8756 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8757 };
8758
8759 if (LoadVT.getScalarType() == MVT::f16)
8760 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8761 M, DAG, Ops);
8762 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8763 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8764 DAG);
8765 }
8766 case Intrinsic::amdgcn_struct_tbuffer_load:
8767 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8768 MemSDNode *M = cast<MemSDNode>(Op);
8769 EVT LoadVT = Op.getValueType();
8770 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8771 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8772 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8773
8774 SDValue Ops[] = {
8775 Op.getOperand(0), // Chain
8776 Rsrc, // rsrc
8777 Op.getOperand(3), // vindex
8778 Offsets.first, // voffset
8779 SOffset, // soffset
8780 Offsets.second, // offset
8781 Op.getOperand(6), // format
8782 Op.getOperand(7), // cachepolicy, swizzled buffer
8783 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8784 };
8785
8786 if (LoadVT.getScalarType() == MVT::f16)
8787 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8788 M, DAG, Ops);
8789 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8790 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8791 DAG);
8792 }
8793 case Intrinsic::amdgcn_buffer_atomic_swap:
8794 case Intrinsic::amdgcn_buffer_atomic_add:
8795 case Intrinsic::amdgcn_buffer_atomic_sub:
8796 case Intrinsic::amdgcn_buffer_atomic_csub:
8797 case Intrinsic::amdgcn_buffer_atomic_smin:
8798 case Intrinsic::amdgcn_buffer_atomic_umin:
8799 case Intrinsic::amdgcn_buffer_atomic_smax:
8800 case Intrinsic::amdgcn_buffer_atomic_umax:
8801 case Intrinsic::amdgcn_buffer_atomic_and:
8802 case Intrinsic::amdgcn_buffer_atomic_or:
8803 case Intrinsic::amdgcn_buffer_atomic_xor:
8804 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8805 unsigned Slc = Op.getConstantOperandVal(6);
8806 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8807 SDValue Ops[] = {
8808 Op.getOperand(0), // Chain
8809 Op.getOperand(2), // vdata
8810 Op.getOperand(3), // rsrc
8811 Op.getOperand(4), // vindex
8812 SDValue(), // voffset -- will be set by setBufferOffsets
8813 SDValue(), // soffset -- will be set by setBufferOffsets
8814 SDValue(), // offset -- will be set by setBufferOffsets
8815 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8816 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8817 };
8818 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8819
8820 EVT VT = Op.getValueType();
8821
8822 auto *M = cast<MemSDNode>(Op);
8823 unsigned Opcode = 0;
8824
8825 switch (IntrID) {
8826 case Intrinsic::amdgcn_buffer_atomic_swap:
8827 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8828 break;
8829 case Intrinsic::amdgcn_buffer_atomic_add:
8830 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8831 break;
8832 case Intrinsic::amdgcn_buffer_atomic_sub:
8833 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8834 break;
8835 case Intrinsic::amdgcn_buffer_atomic_csub:
8836 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8837 break;
8838 case Intrinsic::amdgcn_buffer_atomic_smin:
8839 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8840 break;
8841 case Intrinsic::amdgcn_buffer_atomic_umin:
8842 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8843 break;
8844 case Intrinsic::amdgcn_buffer_atomic_smax:
8845 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8846 break;
8847 case Intrinsic::amdgcn_buffer_atomic_umax:
8848 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8849 break;
8850 case Intrinsic::amdgcn_buffer_atomic_and:
8851 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8852 break;
8853 case Intrinsic::amdgcn_buffer_atomic_or:
8854 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8855 break;
8856 case Intrinsic::amdgcn_buffer_atomic_xor:
8857 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8858 break;
8859 case Intrinsic::amdgcn_buffer_atomic_fadd:
8860 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8861 break;
8862 default:
8863 llvm_unreachable("unhandled atomic opcode");
8864 }
8865
8866 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8867 M->getMemOperand());
8868 }
8869 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8870 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8871 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8872 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8873 return lowerRawBufferAtomicIntrin(Op, DAG,
8874 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8875 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8876 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8877 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8878 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8879 return lowerStructBufferAtomicIntrin(Op, DAG,
8880 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8881 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8883 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8884 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8885 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8886 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8887 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8889 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8890 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8892 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8893 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8895 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
8896 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8898 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8899 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8901 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8902 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8904 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
8905 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8907 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
8908 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8910 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
8911 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8913 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
8914 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8916 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8917 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8919 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8920 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8922 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8923 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8925 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8926 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8928 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8929 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8930 return lowerRawBufferAtomicIntrin(Op, DAG,
8931 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8932 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8934 return lowerStructBufferAtomicIntrin(Op, DAG,
8935 AMDGPUISD::BUFFER_ATOMIC_SWAP);
8936 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8937 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8938 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8939 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8941 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8942 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8943 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8944 return lowerStructBufferAtomicIntrin(Op, DAG,
8945 AMDGPUISD::BUFFER_ATOMIC_SMIN);
8946 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8947 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8948 return lowerStructBufferAtomicIntrin(Op, DAG,
8949 AMDGPUISD::BUFFER_ATOMIC_UMIN);
8950 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8951 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8952 return lowerStructBufferAtomicIntrin(Op, DAG,
8953 AMDGPUISD::BUFFER_ATOMIC_SMAX);
8954 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8955 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8956 return lowerStructBufferAtomicIntrin(Op, DAG,
8957 AMDGPUISD::BUFFER_ATOMIC_UMAX);
8958 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8960 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8961 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8963 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8964 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8965 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8966 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8967 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8969 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8970 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8971 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8972 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8973 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8974 return lowerStructBufferAtomicIntrin(Op, DAG,
8975 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8976
8977 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
8978 unsigned Slc = Op.getConstantOperandVal(7);
8979 unsigned IdxEn = getIdxEn(Op.getOperand(5));
8980 SDValue Ops[] = {
8981 Op.getOperand(0), // Chain
8982 Op.getOperand(2), // src
8983 Op.getOperand(3), // cmp
8984 Op.getOperand(4), // rsrc
8985 Op.getOperand(5), // vindex
8986 SDValue(), // voffset -- will be set by setBufferOffsets
8987 SDValue(), // soffset -- will be set by setBufferOffsets
8988 SDValue(), // offset -- will be set by setBufferOffsets
8989 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8990 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8991 };
8992 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
8993
8994 EVT VT = Op.getValueType();
8995 auto *M = cast<MemSDNode>(Op);
8996
8997 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8998 Op->getVTList(), Ops, VT, M->getMemOperand());
8999 }
9000 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9001 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9002 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9003 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9004 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9005 SDValue Ops[] = {
9006 Op.getOperand(0), // Chain
9007 Op.getOperand(2), // src
9008 Op.getOperand(3), // cmp
9009 Rsrc, // rsrc
9010 DAG.getConstant(0, DL, MVT::i32), // vindex
9011 Offsets.first, // voffset
9012 SOffset, // soffset
9013 Offsets.second, // offset
9014 Op.getOperand(7), // cachepolicy
9015 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9016 };
9017 EVT VT = Op.getValueType();
9018 auto *M = cast<MemSDNode>(Op);
9019
9020 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9021 Op->getVTList(), Ops, VT, M->getMemOperand());
9022 }
9023 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9024 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9025 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9026 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9027 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9028 SDValue Ops[] = {
9029 Op.getOperand(0), // Chain
9030 Op.getOperand(2), // src
9031 Op.getOperand(3), // cmp
9032 Rsrc, // rsrc
9033 Op.getOperand(5), // vindex
9034 Offsets.first, // voffset
9035 SOffset, // soffset
9036 Offsets.second, // offset
9037 Op.getOperand(8), // cachepolicy
9038 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9039 };
9040 EVT VT = Op.getValueType();
9041 auto *M = cast<MemSDNode>(Op);
9042
9043 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9044 Op->getVTList(), Ops, VT, M->getMemOperand());
9045 }
9046 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9047 MemSDNode *M = cast<MemSDNode>(Op);
9048 SDValue NodePtr = M->getOperand(2);
9049 SDValue RayExtent = M->getOperand(3);
9050 SDValue RayOrigin = M->getOperand(4);
9051 SDValue RayDir = M->getOperand(5);
9052 SDValue RayInvDir = M->getOperand(6);
9053 SDValue TDescr = M->getOperand(7);
9054
9055 assert(NodePtr.getValueType() == MVT::i32 ||
9056 NodePtr.getValueType() == MVT::i64);
9057 assert(RayDir.getValueType() == MVT::v3f16 ||
9058 RayDir.getValueType() == MVT::v3f32);
9059
9060 if (!Subtarget->hasGFX10_AEncoding()) {
9061 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9062 return SDValue();
9063 }
9064
9065 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9066 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9067 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9068 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9069 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9070 const unsigned NumVDataDwords = 4;
9071 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9072 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9073 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9074 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9075 IsGFX12Plus;
9076 const unsigned BaseOpcodes[2][2] = {
9077 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9078 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9079 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9080 int Opcode;
9081 if (UseNSA) {
9082 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9083 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9084 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9085 : AMDGPU::MIMGEncGfx10NSA,
9086 NumVDataDwords, NumVAddrDwords);
9087 } else {
9088 assert(!IsGFX12Plus);
9089 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9090 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9091 : AMDGPU::MIMGEncGfx10Default,
9092 NumVDataDwords, NumVAddrDwords);
9093 }
9094 assert(Opcode != -1);
9095
9096 SmallVector<SDValue, 16> Ops;
9097
9098 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9099 SmallVector<SDValue, 3> Lanes;
9100 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9101 if (Lanes[0].getValueSizeInBits() == 32) {
9102 for (unsigned I = 0; I < 3; ++I)
9103 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9104 } else {
9105 if (IsAligned) {
9106 Ops.push_back(
9107 DAG.getBitcast(MVT::i32,
9108 DAG.getBuildVector(MVT::v2f16, DL,
9109 { Lanes[0], Lanes[1] })));
9110 Ops.push_back(Lanes[2]);
9111 } else {
9112 SDValue Elt0 = Ops.pop_back_val();
9113 Ops.push_back(
9114 DAG.getBitcast(MVT::i32,
9115 DAG.getBuildVector(MVT::v2f16, DL,
9116 { Elt0, Lanes[0] })));
9117 Ops.push_back(
9118 DAG.getBitcast(MVT::i32,
9119 DAG.getBuildVector(MVT::v2f16, DL,
9120 { Lanes[1], Lanes[2] })));
9121 }
9122 }
9123 };
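// packLanes thus emits three dwords for 32-bit lanes; for f16 lanes it packs
// them in pairs, borrowing the previously pushed element when the triple does
// not start on an even half-dword boundary.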
9124
9125 if (UseNSA && IsGFX11Plus) {
9126 Ops.push_back(NodePtr);
9127 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9128 Ops.push_back(RayOrigin);
9129 if (IsA16) {
9130 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9131 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9132 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9133 for (unsigned I = 0; I < 3; ++I) {
9134 MergedLanes.push_back(DAG.getBitcast(
9135 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9136 {DirLanes[I], InvDirLanes[I]})));
9137 }
9138 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9139 } else {
9140 Ops.push_back(RayDir);
9141 Ops.push_back(RayInvDir);
9142 }
9143 } else {
9144 if (Is64)
9145 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9146 2);
9147 else
9148 Ops.push_back(NodePtr);
9149
9150 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9151 packLanes(RayOrigin, true);
9152 packLanes(RayDir, true);
9153 packLanes(RayInvDir, false);
9154 }
9155
9156 if (!UseNSA) {
9157 // Build a single vector containing all the operands so far prepared.
9158 if (NumVAddrDwords > 12) {
9159 SDValue Undef = DAG.getUNDEF(MVT::i32);
9160 Ops.append(16 - Ops.size(), Undef);
9161 }
9162 assert(Ops.size() >= 8 && Ops.size() <= 12);
9163 SDValue MergedOps = DAG.getBuildVector(
9164 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9165 Ops.clear();
9166 Ops.push_back(MergedOps);
9167 }
9168
9169 Ops.push_back(TDescr);
9170 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9171 Ops.push_back(M->getChain());
9172
9173 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9174 MachineMemOperand *MemRef = M->getMemOperand();
9175 DAG.setNodeMemRefs(NewNode, {MemRef});
9176 return SDValue(NewNode, 0);
9177 }
9178 case Intrinsic::amdgcn_global_atomic_fmin:
9179 case Intrinsic::amdgcn_global_atomic_fmax:
9180 case Intrinsic::amdgcn_global_atomic_fmin_num:
9181 case Intrinsic::amdgcn_global_atomic_fmax_num:
9182 case Intrinsic::amdgcn_flat_atomic_fmin:
9183 case Intrinsic::amdgcn_flat_atomic_fmax:
9184 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9185 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9186 MemSDNode *M = cast<MemSDNode>(Op);
9187 SDValue Ops[] = {
9188 M->getOperand(0), // Chain
9189 M->getOperand(2), // Ptr
9190 M->getOperand(3) // Value
9191 };
9192 unsigned Opcode = 0;
9193 switch (IntrID) {
9194 case Intrinsic::amdgcn_global_atomic_fmin:
9195 case Intrinsic::amdgcn_global_atomic_fmin_num:
9196 case Intrinsic::amdgcn_flat_atomic_fmin:
9197 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9198 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9199 break;
9200 }
9201 case Intrinsic::amdgcn_global_atomic_fmax:
9202 case Intrinsic::amdgcn_global_atomic_fmax_num:
9203 case Intrinsic::amdgcn_flat_atomic_fmax:
9204 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9205 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9206 break;
9207 }
9208 default:
9209 llvm_unreachable("unhandled atomic opcode");
9210 }
9211 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9212 M->getVTList(), Ops, M->getMemoryVT(),
9213 M->getMemOperand());
9214 }
9215 case Intrinsic::amdgcn_s_get_barrier_state: {
9216 SDValue Chain = Op->getOperand(0);
9217 SmallVector<SDValue, 2> Ops;
9218 unsigned Opc;
9219 bool IsInlinableBarID = false;
9220 int64_t BarID;
9221
9222 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9223 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9224 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9225 }
9226
9227 if (IsInlinableBarID) {
9228 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9229 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9230 Ops.push_back(K);
9231 } else {
9232 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9233 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9234 Ops.push_back(M0Val.getValue(0));
9235 }
9236
9237 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9238 return SDValue(NewMI, 0);
9239 }
9240 default:
9241
9242 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9243 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9244 return lowerImage(Op, ImageDimIntr, DAG, true);
9245
9246 return SDValue();
9247 }
9248}
9249
9250// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9251// dwordx4 if on SI and handle TFE loads.
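// When the node carries a TFE status result (three result VTs), the load is
// widened to value-dwords + 1 i32s; the status dword and the value dwords are
// then extracted and merged back into the expected result types below.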
9252SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9253 SDVTList VTList,
9254 ArrayRef<SDValue> Ops, EVT MemVT,
9255 MachineMemOperand *MMO,
9256 SelectionDAG &DAG) const {
9257 LLVMContext &C = *DAG.getContext();
9258 MachineFunction &MF = DAG.getMachineFunction();
9259 EVT VT = VTList.VTs[0];
9260
9261 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9262 bool IsTFE = VTList.NumVTs == 3;
9263 if (IsTFE) {
9264 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9265 unsigned NumOpDWords = NumValueDWords + 1;
9266 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9267 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9268 MachineMemOperand *OpDWordsMMO =
9269 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9270 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9271 OpDWordsVT, OpDWordsMMO, DAG);
9272 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9273 DAG.getVectorIdxConstant(NumValueDWords, DL));
9274 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9275 SDValue ValueDWords =
9276 NumValueDWords == 1
9277 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9278 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9279 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9280 ZeroIdx);
9281 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9282 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9283 }
9284
9285 if (!Subtarget->hasDwordx3LoadStores() &&
9286 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9287 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9288 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9289 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9290 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9291 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9292 WidenedMemVT, WidenedMMO);
9293 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9294 DAG.getVectorIdxConstant(0, DL));
9295 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9296 }
9297
9298 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9299}
9300
9301SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9302 bool ImageStore) const {
9303 EVT StoreVT = VData.getValueType();
9304
9305 // No change for f16 and legal vector D16 types.
9306 if (!StoreVT.isVector())
9307 return VData;
9308
9309 SDLoc DL(VData);
9310 unsigned NumElements = StoreVT.getVectorNumElements();
9311
9312 if (Subtarget->hasUnpackedD16VMem()) {
9313 // We need to unpack the packed data to store.
9314 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9315 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9316
9317 EVT EquivStoreVT =
9318 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9319 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9320 return DAG.UnrollVectorOp(ZExt.getNode());
9321 }
9322
9323 // The sq block of gfx8.1 does not estimate register use correctly for d16
9324 // image store instructions. The data operand is computed as if it were not a
9325 // d16 image instruction.
9326 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9327 // Bitcast to i16
9328 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9329 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9330
9331 // Decompose into scalars
9332 SmallVector<SDValue, 4> Elts;
9333 DAG.ExtractVectorElements(IntVData, Elts);
9334
9335 // Group pairs of i16 into v2i16 and bitcast to i32
9336 SmallVector<SDValue, 4> PackedElts;
9337 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9338 SDValue Pair =
9339 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9340 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9341 PackedElts.push_back(IntPair);
9342 }
9343 if ((NumElements % 2) == 1) {
9344 // Handle v3i16
9345 unsigned I = Elts.size() / 2;
9346 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9347 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9348 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9349 PackedElts.push_back(IntPair);
9350 }
9351
9352 // Pad using UNDEF
9353 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9354
9355 // Build final vector
9356 EVT VecVT =
9357 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9358 return DAG.getBuildVector(VecVT, DL, PackedElts);
9359 }
9360
9361 if (NumElements == 3) {
9362 EVT IntStoreVT =
9363 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9364 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9365
9366 EVT WidenedStoreVT = EVT::getVectorVT(
9367 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9368 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9369 WidenedStoreVT.getStoreSizeInBits());
9370 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9371 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9372 }
9373
9374 assert(isTypeLegal(StoreVT));
9375 return VData;
9376}
9377
9378SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9379 SelectionDAG &DAG) const {
9380 SDLoc DL(Op);
9381 SDValue Chain = Op.getOperand(0);
9382 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9383 MachineFunction &MF = DAG.getMachineFunction();
9384
9385 switch (IntrinsicID) {
9386 case Intrinsic::amdgcn_exp_compr: {
9387 if (!Subtarget->hasCompressedExport()) {
9388 DiagnosticInfoUnsupported BadIntrin(
9389 MF.getFunction(),
9390 "intrinsic not supported on subtarget", DL.getDebugLoc());
9391 DAG.getContext()->diagnose(BadIntrin);
9392 }
9393 SDValue Src0 = Op.getOperand(4);
9394 SDValue Src1 = Op.getOperand(5);
9395 // Hack around illegal type on SI by directly selecting it.
9396 if (isTypeLegal(Src0.getValueType()))
9397 return SDValue();
9398
9399 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9400 SDValue Undef = DAG.getUNDEF(MVT::f32);
9401 const SDValue Ops[] = {
9402 Op.getOperand(2), // tgt
9403 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9404 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9405 Undef, // src2
9406 Undef, // src3
9407 Op.getOperand(7), // vm
9408 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9409 Op.getOperand(3), // en
9410 Op.getOperand(0) // Chain
9411 };
9412
9413 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9414 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9415 }
9416 case Intrinsic::amdgcn_s_barrier: {
9417 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9418 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9419 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9420 if (WGSize <= ST.getWavefrontSize())
9421 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9422 Op.getOperand(0)), 0);
9423 }
9424
9425 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9426 if (ST.hasSplitBarriers()) {
9427 SDValue K =
9428 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9429 SDValue BarSignal =
9430 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9431 MVT::Other, K, Op.getOperand(0)),
9432 0);
9433 SDValue BarWait =
9434 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9435 BarSignal.getValue(0)),
9436 0);
9437 return BarWait;
9438 }
9439
9440 return SDValue();
9441 };
9442 case Intrinsic::amdgcn_tbuffer_store: {
9443 SDValue VData = Op.getOperand(2);
9444 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9445 if (IsD16)
9446 VData = handleD16VData(VData, DAG);
9447 unsigned Dfmt = Op.getConstantOperandVal(8);
9448 unsigned Nfmt = Op.getConstantOperandVal(9);
9449 unsigned Glc = Op.getConstantOperandVal(10);
9450 unsigned Slc = Op.getConstantOperandVal(11);
9451 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9452 SDValue Ops[] = {
9453 Chain,
9454 VData, // vdata
9455 Op.getOperand(3), // rsrc
9456 Op.getOperand(4), // vindex
9457 Op.getOperand(5), // voffset
9458 Op.getOperand(6), // soffset
9459 Op.getOperand(7), // offset
9460 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9461 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9462 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9463 };
9464 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9465 AMDGPUISD::TBUFFER_STORE_FORMAT;
9466 MemSDNode *M = cast<MemSDNode>(Op);
9467 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9468 M->getMemoryVT(), M->getMemOperand());
9469 }
9470
9471 case Intrinsic::amdgcn_struct_tbuffer_store:
9472 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9473 SDValue VData = Op.getOperand(2);
9474 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9475 if (IsD16)
9476 VData = handleD16VData(VData, DAG);
9477 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9478 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9479 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9480 SDValue Ops[] = {
9481 Chain,
9482 VData, // vdata
9483 Rsrc, // rsrc
9484 Op.getOperand(4), // vindex
9485 Offsets.first, // voffset
9486 SOffset, // soffset
9487 Offsets.second, // offset
9488 Op.getOperand(7), // format
9489 Op.getOperand(8), // cachepolicy, swizzled buffer
9490 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9491 };
9492 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9493 AMDGPUISD::TBUFFER_STORE_FORMAT;
9494 MemSDNode *M = cast<MemSDNode>(Op);
9495 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9496 M->getMemoryVT(), M->getMemOperand());
9497 }
9498
9499 case Intrinsic::amdgcn_raw_tbuffer_store:
9500 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9501 SDValue VData = Op.getOperand(2);
9502 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9503 if (IsD16)
9504 VData = handleD16VData(VData, DAG);
9505 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9506 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9507 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9508 SDValue Ops[] = {
9509 Chain,
9510 VData, // vdata
9511 Rsrc, // rsrc
9512 DAG.getConstant(0, DL, MVT::i32), // vindex
9513 Offsets.first, // voffset
9514 SOffset, // soffset
9515 Offsets.second, // offset
9516 Op.getOperand(6), // format
9517 Op.getOperand(7), // cachepolicy, swizzled buffer
9518 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9519 };
9520 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9521 AMDGPUISD::TBUFFER_STORE_FORMAT;
9522 MemSDNode *M = cast<MemSDNode>(Op);
9523 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9524 M->getMemoryVT(), M->getMemOperand());
9525 }
9526
9527 case Intrinsic::amdgcn_buffer_store:
9528 case Intrinsic::amdgcn_buffer_store_format: {
9529 SDValue VData = Op.getOperand(2);
9530 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9531 if (IsD16)
9532 VData = handleD16VData(VData, DAG);
9533 unsigned Glc = Op.getConstantOperandVal(6);
9534 unsigned Slc = Op.getConstantOperandVal(7);
9535 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9536 SDValue Ops[] = {
9537 Chain,
9538 VData,
9539 Op.getOperand(3), // rsrc
9540 Op.getOperand(4), // vindex
9541 SDValue(), // voffset -- will be set by setBufferOffsets
9542 SDValue(), // soffset -- will be set by setBufferOffsets
9543 SDValue(), // offset -- will be set by setBufferOffsets
9544 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9545 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9546 };
9547 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
9548
9549 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9550 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9551 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9552 MemSDNode *M = cast<MemSDNode>(Op);
9553
9554 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9555 EVT VDataType = VData.getValueType().getScalarType();
9556 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9557 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9558
9559 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9560 M->getMemoryVT(), M->getMemOperand());
9561 }
9562
9563 case Intrinsic::amdgcn_raw_buffer_store:
9564 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9565 case Intrinsic::amdgcn_raw_buffer_store_format:
9566 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9567 const bool IsFormat =
9568 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9569 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9570
9571 SDValue VData = Op.getOperand(2);
9572 EVT VDataVT = VData.getValueType();
9573 EVT EltType = VDataVT.getScalarType();
9574 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9575 if (IsD16) {
9576 VData = handleD16VData(VData, DAG);
9577 VDataVT = VData.getValueType();
9578 }
9579
9580 if (!isTypeLegal(VDataVT)) {
9581 VData =
9582 DAG.getNode(ISD::BITCAST, DL,
9583 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9584 }
9585
9586 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9587 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9588 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9589 SDValue Ops[] = {
9590 Chain,
9591 VData,
9592 Rsrc,
9593 DAG.getConstant(0, DL, MVT::i32), // vindex
9594 Offsets.first, // voffset
9595 SOffset, // soffset
9596 Offsets.second, // offset
9597 Op.getOperand(6), // cachepolicy, swizzled buffer
9598 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9599 };
9600 unsigned Opc =
9601 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9602 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9603 MemSDNode *M = cast<MemSDNode>(Op);
9604
9605 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9606 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9607 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9608
9609 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9610 M->getMemoryVT(), M->getMemOperand());
9611 }
9612
9613 case Intrinsic::amdgcn_struct_buffer_store:
9614 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9615 case Intrinsic::amdgcn_struct_buffer_store_format:
9616 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9617 const bool IsFormat =
9618 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9619 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9620
9621 SDValue VData = Op.getOperand(2);
9622 EVT VDataVT = VData.getValueType();
9623 EVT EltType = VDataVT.getScalarType();
9624 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9625
9626 if (IsD16) {
9627 VData = handleD16VData(VData, DAG);
9628 VDataVT = VData.getValueType();
9629 }
9630
9631 if (!isTypeLegal(VDataVT)) {
9632 VData =
9633 DAG.getNode(ISD::BITCAST, DL,
9634 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9635 }
9636
9637 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9638 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9639 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9640 SDValue Ops[] = {
9641 Chain,
9642 VData,
9643 Rsrc,
9644 Op.getOperand(4), // vindex
9645 Offsets.first, // voffset
9646 SOffset, // soffset
9647 Offsets.second, // offset
9648 Op.getOperand(7), // cachepolicy, swizzled buffer
9649 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9650 };
9651 unsigned Opc =
9652 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9653 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9654 MemSDNode *M = cast<MemSDNode>(Op);
9655
9656 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9657 EVT VDataType = VData.getValueType().getScalarType();
9658 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9659 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9660
9661 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9662 M->getMemoryVT(), M->getMemOperand());
9663 }
9664 case Intrinsic::amdgcn_raw_buffer_load_lds:
9665 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9666 case Intrinsic::amdgcn_struct_buffer_load_lds:
9667 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9668 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9669 unsigned Opc;
9670 bool HasVIndex =
9671 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9672 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9673 unsigned OpOffset = HasVIndex ? 1 : 0;
9674 SDValue VOffset = Op.getOperand(5 + OpOffset);
9675 bool HasVOffset = !isNullConstant(VOffset);
9676 unsigned Size = Op->getConstantOperandVal(4);
9677
9678 switch (Size) {
9679 default:
9680 return SDValue();
9681 case 1:
9682 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9683 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9684 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9685 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9686 break;
9687 case 2:
9688 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9689 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9690 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9691 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9692 break;
9693 case 4:
9694 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9695 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9696 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9697 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9698 break;
9699 }
9700
9701 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9702
9703 SmallVector<SDValue, 8> Ops;
9704
9705 if (HasVIndex && HasVOffset)
9706 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9707 { Op.getOperand(5), // VIndex
9708 VOffset }));
9709 else if (HasVIndex)
9710 Ops.push_back(Op.getOperand(5));
9711 else if (HasVOffset)
9712 Ops.push_back(VOffset);
9713
9714 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9715 Ops.push_back(Rsrc);
9716 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9717 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9718 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9719 Ops.push_back(
9720 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9721 Ops.push_back(DAG.getTargetConstant(
9722 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9723 Ops.push_back(M0Val.getValue(0)); // Chain
9724 Ops.push_back(M0Val.getValue(1)); // Glue
9725
9726 auto *M = cast<MemSDNode>(Op);
9727 MachineMemOperand *LoadMMO = M->getMemOperand();
9728 // Don't set the offset value here because the pointer points to the base of
9729 // the buffer.
9730 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9731
9732 MachinePointerInfo StorePtrI = LoadPtrI;
9733 LoadPtrI.V = PoisonValue::get(
9737
9738 auto F = LoadMMO->getFlags() &
9739 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9740 LoadMMO =
9741 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9742 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9743
9744 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9745 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9746 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9747
9748 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9749 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9750
9751 return SDValue(Load, 0);
9752 }
9753 case Intrinsic::amdgcn_global_load_lds: {
9754 unsigned Opc;
9755 unsigned Size = Op->getConstantOperandVal(4);
9756 switch (Size) {
9757 default:
9758 return SDValue();
9759 case 1:
9760 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9761 break;
9762 case 2:
9763 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9764 break;
9765 case 4:
9766 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9767 break;
9768 }
9769
9770 auto *M = cast<MemSDNode>(Op);
9771 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9772
9773 SmallVector<SDValue, 6> Ops;
9774
9775 SDValue Addr = Op.getOperand(2); // Global ptr
9776 SDValue VOffset;
9777 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9778 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9779 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9780 SDValue LHS = Addr.getOperand(0);
9781 SDValue RHS = Addr.getOperand(1);
9782
9783 if (LHS->isDivergent())
9784 std::swap(LHS, RHS);
9785
9786 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9787 RHS.getOperand(0).getValueType() == MVT::i32) {
9788 // add (i64 sgpr), (zero_extend (i32 vgpr))
9789 Addr = LHS;
9790 VOffset = RHS.getOperand(0);
9791 }
9792 }
9793
9794 Ops.push_back(Addr);
9795 if (!Addr->isDivergent()) {
9796 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9797 if (!VOffset)
9798 VOffset = SDValue(
9799 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9800 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9801 Ops.push_back(VOffset);
9802 }
9803
9804 Ops.push_back(Op.getOperand(5)); // Offset
9805 Ops.push_back(Op.getOperand(6)); // CPol
9806 Ops.push_back(M0Val.getValue(0)); // Chain
9807 Ops.push_back(M0Val.getValue(1)); // Glue
9808
9809 MachineMemOperand *LoadMMO = M->getMemOperand();
9810 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9811 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9812 MachinePointerInfo StorePtrI = LoadPtrI;
9813 LoadPtrI.V = PoisonValue::get(
9814 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9815 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9816 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9817 auto F = LoadMMO->getFlags() &
9818 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9819 LoadMMO =
9820 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9821 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9822 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9823 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9824 LoadMMO->getAAInfo());
9825
9826 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9827 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9828
9829 return SDValue(Load, 0);
9830 }
9831 case Intrinsic::amdgcn_end_cf:
9832 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9833 Op->getOperand(2), Chain), 0);
9834 case Intrinsic::amdgcn_s_barrier_init:
9835 case Intrinsic::amdgcn_s_barrier_join:
9836 case Intrinsic::amdgcn_s_wakeup_barrier: {
9837 SDValue Chain = Op->getOperand(0);
9838 SmallVector<SDValue, 2> Ops;
9839 SDValue BarOp = Op->getOperand(2);
9840 unsigned Opc;
9841 bool IsInlinableBarID = false;
9842 int64_t BarVal;
9843
9844 if (isa<ConstantSDNode>(BarOp)) {
9845 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9846 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9847 }
9848
9849 if (IsInlinableBarID) {
9850 switch (IntrinsicID) {
9851 default:
9852 return SDValue();
9853 case Intrinsic::amdgcn_s_barrier_init:
9854 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9855 break;
9856 case Intrinsic::amdgcn_s_barrier_join:
9857 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9858 break;
9859 case Intrinsic::amdgcn_s_wakeup_barrier:
9860 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9861 break;
9862 }
9863
9864 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9865 Ops.push_back(K);
9866 } else {
9867 switch (IntrinsicID) {
9868 default:
9869 return SDValue();
9870 case Intrinsic::amdgcn_s_barrier_init:
9871 Opc = AMDGPU::S_BARRIER_INIT_M0;
9872 break;
9873 case Intrinsic::amdgcn_s_barrier_join:
9874 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9875 break;
9876 case Intrinsic::amdgcn_s_wakeup_barrier:
9877 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9878 break;
9879 }
9880 }
9881
9882 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9883 SDValue M0Val;
9884 // Member count will be read from M0[16:22]
9885 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9886 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9887
9888 if (!IsInlinableBarID) {
9889 // If reference to barrier id is not an inline constant then it must be
9890 // referenced with M0[4:0]. Perform an OR with the member count to
9891 // include it in M0.
9892 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9893 Op.getOperand(2), M0Val),
9894 0);
9895 }
9896 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9897 } else if (!IsInlinableBarID) {
9898 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9899 }
9900
9901 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9902 return SDValue(NewMI, 0);
9903 }
9904 default: {
9905 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9906 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9907 return lowerImage(Op, ImageDimIntr, DAG, true);
9908
9909 return Op;
9910 }
9911 }
9912}
9913
9914// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9915// offset (the offset that is included in bounds checking and swizzling, to be
9916// split between the instruction's voffset and immoffset fields) and soffset
9917// (the offset that is excluded from bounds checking and swizzling, to go in
9918// the instruction's soffset field). This function takes the first kind of
9919// offset and figures out how to split it between voffset and immoffset.
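// For example, on a target whose MUBUF immediate offset field holds values up
// to 4095, a combined constant offset of 4100 is returned as voffset 4096 and
// immoffset 4 (see the rounding logic below).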
9920std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9921 SDValue Offset, SelectionDAG &DAG) const {
9922 SDLoc DL(Offset);
9923 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9924 SDValue N0 = Offset;
9925 ConstantSDNode *C1 = nullptr;
9926
9927 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9928 N0 = SDValue();
9929 else if (DAG.isBaseWithConstantOffset(N0)) {
9930 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9931 N0 = N0.getOperand(0);
9932 }
9933
9934 if (C1) {
9935 unsigned ImmOffset = C1->getZExtValue();
9936 // If the immediate value is too big for the immoffset field, put only bits
9937 // that would normally fit in the immoffset field. The remaining value that
9938 // is copied/added for the voffset field is a large power of 2, and it
9939 // stands more chance of being CSEd with the copy/add for another similar
9940 // load/store.
9941 // However, do not do that rounding down if that is a negative
9942 // number, as it appears to be illegal to have a negative offset in the
9943 // vgpr, even if adding the immediate offset makes it positive.
9944 unsigned Overflow = ImmOffset & ~MaxImm;
9945 ImmOffset -= Overflow;
9946 if ((int32_t)Overflow < 0) {
9947 Overflow += ImmOffset;
9948 ImmOffset = 0;
9949 }
9950 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9951 if (Overflow) {
9952 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9953 if (!N0)
9954 N0 = OverflowVal;
9955 else {
9956 SDValue Ops[] = { N0, OverflowVal };
9957 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9958 }
9959 }
9960 }
9961 if (!N0)
9962 N0 = DAG.getConstant(0, DL, MVT::i32);
9963 if (!C1)
9964 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
9965 return {N0, SDValue(C1, 0)};
9966}
9967
9968// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
9969// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
9970// pointed to by Offsets.
9971void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
9972 SelectionDAG &DAG, SDValue *Offsets,
9973 Align Alignment) const {
9974 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9975 SDLoc DL(CombinedOffset);
9976 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
9977 uint32_t Imm = C->getZExtValue();
9978 uint32_t SOffset, ImmOffset;
9979 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
9980 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
9981 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9982 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9983 return;
9984 }
9985 }
9986 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
9987 SDValue N0 = CombinedOffset.getOperand(0);
9988 SDValue N1 = CombinedOffset.getOperand(1);
9989 uint32_t SOffset, ImmOffset;
9990 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
9991 if (Offset >= 0 &&
9992 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
9993 Offsets[0] = N0;
9994 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9995 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9996 return;
9997 }
9998 }
9999
10000 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10001 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10002 : DAG.getConstant(0, DL, MVT::i32);
10003
10004 Offsets[0] = CombinedOffset;
10005 Offsets[1] = SOffsetZero;
10006 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10007}
10008
10009SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10010 SelectionDAG &DAG) const {
10011 if (!MaybePointer.getValueType().isScalarInteger())
10012 return MaybePointer;
10013
10014 SDLoc DL(MaybePointer);
10015
10016 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10017 return Rsrc;
10018}
10019
10020// Wrap a global or flat pointer into a buffer intrinsic using the flags
10021// specified in the intrinsic.
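// Roughly, the resulting descriptor packs: word0 = pointer[31:0],
// word1 = (stride << 16) | pointer[47:32], word2 = NumRecords, word3 = Flags,
// and is then bitcast back to the i128 buffer-resource pointer type.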
10022SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10023 SelectionDAG &DAG) const {
10024 SDLoc Loc(Op);
10025
10026 SDValue Pointer = Op->getOperand(1);
10027 SDValue Stride = Op->getOperand(2);
10028 SDValue NumRecords = Op->getOperand(3);
10029 SDValue Flags = Op->getOperand(4);
10030
10031 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10032 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10033 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10034 std::optional<uint32_t> ConstStride = std::nullopt;
10035 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10036 ConstStride = ConstNode->getZExtValue();
10037
10038 SDValue NewHighHalf = Masked;
10039 if (!ConstStride || *ConstStride != 0) {
10040 SDValue ShiftedStride;
10041 if (ConstStride) {
10042 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10043 } else {
10044 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10045 ShiftedStride =
10046 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10047 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10048 }
10049 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10050 }
10051
10052 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10053 NewHighHalf, NumRecords, Flags);
10054 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10055 return RsrcPtr;
10056}
10057
10058// Handle 8 bit and 16 bit buffer loads
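// The sub-dword value is loaded with a 32-bit BUFFER_LOAD_UBYTE/USHORT node
// and then truncated (and bitcast, for f16) back to the requested type.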
10059SDValue
10060SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
10061 SDLoc DL, ArrayRef<SDValue> Ops,
10062 MachineMemOperand *MMO) const {
10063 EVT IntVT = LoadVT.changeTypeToInteger();
10064 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10065 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10066
10067 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10068 SDValue BufferLoad =
10069 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10070 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10071 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10072
10073 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10074}
10075
10076// Handle 8 bit and 16 bit buffer stores
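// The data operand is any-extended to i32 and stored with a
// BUFFER_STORE_BYTE/SHORT node; f16 data is first bitcast to i16.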
10077SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10078 EVT VDataType, SDLoc DL,
10079 SDValue Ops[],
10080 MemSDNode *M) const {
10081 if (VDataType == MVT::f16)
10082 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10083
10084 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10085 Ops[1] = BufferStoreExt;
10086 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10087 AMDGPUISD::BUFFER_STORE_SHORT;
10088 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10089 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10090 M->getMemOperand());
10091}
10092
10093 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10094 ISD::LoadExtType ExtType, SDValue Op,
10095 const SDLoc &SL, EVT VT) {
10096 if (VT.bitsLT(Op.getValueType()))
10097 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10098
10099 switch (ExtType) {
10100 case ISD::SEXTLOAD:
10101 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10102 case ISD::ZEXTLOAD:
10103 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10104 case ISD::EXTLOAD:
10105 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10106 case ISD::NON_EXTLOAD:
10107 return Op;
10108 }
10109
10110 llvm_unreachable("invalid ext type");
10111}
10112
10113// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10114// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
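// For example, a uniform, 4-byte-aligned i8 extload from the constant address
// space is rewritten here as a 32-bit load followed by the matching
// extend/truncate, so it can be selected as a scalar (SMEM) load.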
10115SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10116 SelectionDAG &DAG = DCI.DAG;
10117 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10118 return SDValue();
10119
10120 // FIXME: Constant loads should all be marked invariant.
10121 unsigned AS = Ld->getAddressSpace();
10122 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10123 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10124 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10125 return SDValue();
10126
10127 // Don't do this early, since it may interfere with adjacent load merging for
10128 // illegal types. We can avoid losing alignment information for exotic types
10129 // pre-legalize.
10130 EVT MemVT = Ld->getMemoryVT();
10131 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10132 MemVT.getSizeInBits() >= 32)
10133 return SDValue();
10134
10135 SDLoc SL(Ld);
10136
10137 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10138 "unexpected vector extload");
10139
10140 // TODO: Drop only high part of range.
10141 SDValue Ptr = Ld->getBasePtr();
10142 SDValue NewLoad = DAG.getLoad(
10143 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10144 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10145 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10146 nullptr); // Drop ranges
10147
10148 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10149 if (MemVT.isFloatingPoint()) {
10151 "unexpected fp extload");
10152 TruncVT = MemVT.changeTypeToInteger();
10153 }
10154
10155 SDValue Cvt = NewLoad;
10156 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10157 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10158 DAG.getValueType(TruncVT));
10159 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10160 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10161 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10162 } else {
10163 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10164 }
10165
10166 EVT VT = Ld->getValueType(0);
10167 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10168
10169 DCI.AddToWorklist(Cvt.getNode());
10170
10171 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10172 // the appropriate extension from the 32-bit load.
10173 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10174 DCI.AddToWorklist(Cvt.getNode());
10175
10176 // Handle conversion back to floating point if necessary.
10177 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10178
10179 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10180}
10181
10182 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10183 const SIMachineFunctionInfo &Info) {
10184 // TODO: Should check if the address can definitely not access stack.
10185 if (Info.isEntryFunction())
10186 return Info.getUserSGPRInfo().hasFlatScratchInit();
10187 return true;
10188}
10189
10190SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10191 SDLoc DL(Op);
10192 LoadSDNode *Load = cast<LoadSDNode>(Op);
10193 ISD::LoadExtType ExtType = Load->getExtensionType();
10194 EVT MemVT = Load->getMemoryVT();
10195
10196 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10197 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10198 return SDValue();
10199
10200 // FIXME: Copied from PPC
10201 // First, load into 32 bits, then truncate to 1 bit.
10202
10203 SDValue Chain = Load->getChain();
10204 SDValue BasePtr = Load->getBasePtr();
10205 MachineMemOperand *MMO = Load->getMemOperand();
10206
10207 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10208
10209 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10210 BasePtr, RealMemVT, MMO);
10211
10212 if (!MemVT.isVector()) {
10213 SDValue Ops[] = {
10214 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10215 NewLD.getValue(1)
10216 };
10217
10218 return DAG.getMergeValues(Ops, DL);
10219 }
10220
10221 SmallVector<SDValue, 3> Elts;
10222 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10223 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10224 DAG.getConstant(I, DL, MVT::i32));
10225
10226 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10227 }
10228
10229 SDValue Ops[] = {
10230 DAG.getBuildVector(MemVT, DL, Elts),
10231 NewLD.getValue(1)
10232 };
10233
10234 return DAG.getMergeValues(Ops, DL);
10235 }
10236
10237 if (!MemVT.isVector())
10238 return SDValue();
10239
10240 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10241 "Custom lowering for non-i32 vectors hasn't been implemented.");
10242
10243 Align Alignment = Load->getAlign();
10244 unsigned AS = Load->getAddressSpace();
10245 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10246 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10247 return SplitVectorLoad(Op, DAG);
10248 }
10249
10250 MachineFunction &MF = DAG.getMachineFunction();
10251 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10252 // If there is a possibility that flat instructions access scratch memory
10253 // then we need to use the same legalization rules we use for private.
10254 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10255 !Subtarget->hasMultiDwordFlatScratchAddressing())
10256 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10257 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10258
10259 unsigned NumElements = MemVT.getVectorNumElements();
10260
10261 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10262 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10263 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10264 if (MemVT.isPow2VectorType() ||
10265 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10266 return SDValue();
10267 return WidenOrSplitVectorLoad(Op, DAG);
10268 }
10269 // Non-uniform loads will be selected to MUBUF instructions, so they
10270 // have the same legalization requirements as global and private
10271 // loads.
10272 //
10273 }
10274
10275 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10276 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10277 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10278 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10279 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10280 Alignment >= Align(4) && NumElements < 32) {
10281 if (MemVT.isPow2VectorType() ||
10282 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10283 return SDValue();
10284 return WidenOrSplitVectorLoad(Op, DAG);
10285 }
10286 // Non-uniform loads will be selected to MUBUF instructions, so they
10287 // have the same legalization requirements as global and private
10288 // loads.
10289 //
10290 }
10291 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10292 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10293 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10294 AS == AMDGPUAS::FLAT_ADDRESS) {
10295 if (NumElements > 4)
10296 return SplitVectorLoad(Op, DAG);
10297 // v3 loads not supported on SI.
10298 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10299 return WidenOrSplitVectorLoad(Op, DAG);
10300
10301 // v3 and v4 loads are supported for private and global memory.
10302 return SDValue();
10303 }
10304 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10305 // Depending on the setting of the private_element_size field in the
10306 // resource descriptor, we can only make private accesses up to a certain
10307 // size.
10308 switch (Subtarget->getMaxPrivateElementSize()) {
10309 case 4: {
10310 SDValue Ops[2];
10311 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10312 return DAG.getMergeValues(Ops, DL);
10313 }
10314 case 8:
10315 if (NumElements > 2)
10316 return SplitVectorLoad(Op, DAG);
10317 return SDValue();
10318 case 16:
10319 // Same as global/flat
10320 if (NumElements > 4)
10321 return SplitVectorLoad(Op, DAG);
10322 // v3 loads not supported on SI.
10323 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10324 return WidenOrSplitVectorLoad(Op, DAG);
10325
10326 return SDValue();
10327 default:
10328 llvm_unreachable("unsupported private_element_size");
10329 }
10330 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10331 unsigned Fast = 0;
10332 auto Flags = Load->getMemOperand()->getFlags();
10333 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10334 Load->getAlign(), Flags, &Fast) &&
10335 Fast > 1)
10336 return SDValue();
10337
10338 if (MemVT.isVector())
10339 return SplitVectorLoad(Op, DAG);
10340 }
10341
10342 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10343 MemVT, *Load->getMemOperand())) {
10344 SDValue Ops[2];
10345 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10346 return DAG.getMergeValues(Ops, DL);
10347 }
10348
10349 return SDValue();
10350}
10351
10352SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10353 EVT VT = Op.getValueType();
10354 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10355 VT.getSizeInBits() == 512)
10356 return splitTernaryVectorOp(Op, DAG);
10357
10358 assert(VT.getSizeInBits() == 64);
10359
10360 SDLoc DL(Op);
10361 SDValue Cond = Op.getOperand(0);
10362
10363 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10364 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10365
10366 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10367 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10368
10369 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10370 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10371
10372 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10373
10374 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10375 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10376
10377 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10378
10379 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10380 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10381}
10382
10383// Catch division cases where we can use shortcuts with rcp and rsq
10384// instructions.
10385SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10386 SelectionDAG &DAG) const {
10387 SDLoc SL(Op);
10388 SDValue LHS = Op.getOperand(0);
10389 SDValue RHS = Op.getOperand(1);
10390 EVT VT = Op.getValueType();
10391 const SDNodeFlags Flags = Op->getFlags();
10392
10393 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10394 DAG.getTarget().Options.UnsafeFPMath;
10395
10396 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10397 // Without !fpmath accuracy information, we can't do more because we don't
10398 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10399 // f16 is always accurate enough
10400 if (!AllowInaccurateRcp && VT != MVT::f16)
10401 return SDValue();
10402
10403 if (CLHS->isExactlyValue(1.0)) {
10404 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
10405 // the CI documentation, have a worst-case error of 1 ulp.
10406 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10407 // use it as long as we aren't trying to use denormals.
10408 //
10409 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10410
10411 // 1.0 / sqrt(x) -> rsq(x)
10412
10413 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10414 // error seems really high at 2^29 ULP.
10415 // 1.0 / x -> rcp(x)
10416 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10417 }
10418
10419 // Same as for 1.0, but expand the sign out of the constant.
10420 if (CLHS->isExactlyValue(-1.0)) {
10421 // -1.0 / x -> rcp (fneg x)
10422 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10423 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10424 }
10425 }
10426
10427 // For f16 require afn or arcp.
10428 // For f32 require afn.
10429 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10430 return SDValue();
10431
10432 // Turn into multiply by the reciprocal.
10433 // x / y -> x * (1.0 / y)
10434 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10435 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10436}
10437
10438SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10439 SelectionDAG &DAG) const {
10440 SDLoc SL(Op);
10441 SDValue X = Op.getOperand(0);
10442 SDValue Y = Op.getOperand(1);
10443 EVT VT = Op.getValueType();
10444 const SDNodeFlags Flags = Op->getFlags();
10445
10446 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10447 DAG.getTarget().Options.UnsafeFPMath;
10448 if (!AllowInaccurateDiv)
10449 return SDValue();
10450
10451 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10452 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10453
10454 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10455 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10456
10457 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10458 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10459 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10460 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10461 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10462 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10463}
10464
10465static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10466 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10467 SDNodeFlags Flags) {
10468 if (GlueChain->getNumValues() <= 1) {
10469 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10470 }
10471
10472 assert(GlueChain->getNumValues() == 3);
10473
10474 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10475 switch (Opcode) {
10476 default: llvm_unreachable("no chain equivalent for opcode");
10477 case ISD::FMUL:
10478 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10479 break;
10480 }
10481
10482 return DAG.getNode(Opcode, SL, VTList,
10483 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10484 Flags);
10485}
10486
10487static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10488 EVT VT, SDValue A, SDValue B, SDValue C,
10489 SDValue GlueChain, SDNodeFlags Flags) {
10490 if (GlueChain->getNumValues() <= 1) {
10491 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10492 }
10493
10494 assert(GlueChain->getNumValues() == 3);
10495
10496 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10497 switch (Opcode) {
10498 default: llvm_unreachable("no chain equivalent for opcode");
10499 case ISD::FMA:
10500 Opcode = AMDGPUISD::FMA_W_CHAIN;
10501 break;
10502 }
10503
10504 return DAG.getNode(Opcode, SL, VTList,
10505 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10506 Flags);
10507}
10508
10509SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10510 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10511 return FastLowered;
10512
10513 SDLoc SL(Op);
10514 SDValue Src0 = Op.getOperand(0);
10515 SDValue Src1 = Op.getOperand(1);
10516
10517 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10518 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10519
10520 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10521 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10522
10523 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10524 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10525
10526 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10527}
10528
10529// Faster 2.5 ULP division that does not support denormals.
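// The constants below implement a scaling trick: when |rhs| exceeds 2^+96 the
// denominator is pre-multiplied by 2^-32 so its reciprocal does not underflow,
// and the same 2^-32 factor is reapplied to the product afterwards, so the
// overall result is still lhs * (1 / rhs).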
10530SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10531 SDNodeFlags Flags = Op->getFlags();
10532 SDLoc SL(Op);
10533 SDValue LHS = Op.getOperand(1);
10534 SDValue RHS = Op.getOperand(2);
10535
10536 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10537
10538 const APFloat K0Val(0x1p+96f);
10539 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10540
10541 const APFloat K1Val(0x1p-32f);
10542 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10543
10544 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10545
10546 EVT SetCCVT =
10547 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10548
10549 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10550
10551 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10552
10553 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10554
10555 // rcp does not support denormals.
10556 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10557
10558 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10559
10560 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10561}
10562
10563// Returns immediate value for setting the F32 denorm mode when using the
10564// S_DENORM_MODE instruction.
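// The immediate packs the new single-precision mode into bits [1:0] and keeps
// the function's default double/half mode in bits [3:2]; e.g. with both fields
// set to FP_DENORM_FLUSH_NONE the encoded immediate is 0xF.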
10565 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10566 const SIMachineFunctionInfo *Info,
10567 const GCNSubtarget *ST) {
10568 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10569 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10570 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10571 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10572}
10573
10574SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10575 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10576 return FastLowered;
10577
10578 // The selection matcher assumes anything with a chain selects to a
10579 // mayRaiseFPException machine instruction. Since we're introducing a chain
10580 // here, we need to explicitly report nofpexcept for the regular fdiv
10581 // lowering.
10582 SDNodeFlags Flags = Op->getFlags();
10583 Flags.setNoFPExcept(true);
10584
10585 SDLoc SL(Op);
10586 SDValue LHS = Op.getOperand(0);
10587 SDValue RHS = Op.getOperand(1);
10588
10589 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10590
10591 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10592
10593 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10594 {RHS, RHS, LHS}, Flags);
10595 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10596 {LHS, RHS, LHS}, Flags);
10597
10598 // Denominator is scaled to not be denormal, so using rcp is ok.
10599 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10600 DenominatorScaled, Flags);
10601 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10602 DenominatorScaled, Flags);
10603
10604 using namespace AMDGPU::Hwreg;
10605 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10606 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10607
10608 const MachineFunction &MF = DAG.getMachineFunction();
10609 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10610 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10611
10612 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10613 const bool HasDynamicDenormals =
10614 (DenormMode.Input == DenormalMode::Dynamic) ||
10615 (DenormMode.Output == DenormalMode::Dynamic);
10616
10617 SDValue SavedDenormMode;
10618
10619 if (!PreservesDenormals) {
10620 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10621 // lowering. The chain dependence is insufficient, and we need glue. We do
10622 // not need the glue variants in a strictfp function.
10623
10624 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10625
10626 SDValue Glue = DAG.getEntryNode();
10627 if (HasDynamicDenormals) {
10628 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10629 DAG.getVTList(MVT::i32, MVT::Glue),
10630 {BitField, Glue});
10631 SavedDenormMode = SDValue(GetReg, 0);
10632
10633 Glue = DAG.getMergeValues(
10634 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10635 }
10636
10637 SDNode *EnableDenorm;
10638 if (Subtarget->hasDenormModeInst()) {
10639 const SDValue EnableDenormValue =
10640 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10641
10642 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10643 EnableDenormValue)
10644 .getNode();
10645 } else {
10646 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10647 SL, MVT::i32);
10648 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10649 {EnableDenormValue, BitField, Glue});
10650 }
10651
10652 SDValue Ops[3] = {
10653 NegDivScale0,
10654 SDValue(EnableDenorm, 0),
10655 SDValue(EnableDenorm, 1)
10656 };
10657
10658 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10659 }
10660
10661 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10662 ApproxRcp, One, NegDivScale0, Flags);
10663
10664 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10665 ApproxRcp, Fma0, Flags);
10666
10667 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10668 Fma1, Fma1, Flags);
10669
10670 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10671 NumeratorScaled, Mul, Flags);
10672
10673 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10674 Fma2, Fma1, Mul, Fma2, Flags);
10675
10676 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10677 NumeratorScaled, Fma3, Flags);
10678
10679 if (!PreservesDenormals) {
10680 SDNode *DisableDenorm;
10681 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10682 const SDValue DisableDenormValue = getSPDenormModeValue(
10683 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10684
10685 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10686 Fma4.getValue(1), DisableDenormValue,
10687 Fma4.getValue(2)).getNode();
10688 } else {
10689 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10690 const SDValue DisableDenormValue =
10691 HasDynamicDenormals
10692 ? SavedDenormMode
10693 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10694
10695 DisableDenorm = DAG.getMachineNode(
10696 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10697 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10698 }
10699
10700 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10701 SDValue(DisableDenorm, 0), DAG.getRoot());
10702 DAG.setRoot(OutputChain);
10703 }
10704
10705 SDValue Scale = NumeratorScaled.getValue(1);
10706 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10707 {Fma4, Fma1, Fma3, Scale}, Flags);
10708
10709 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10710}
10711
10712SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10713 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10714 return FastLowered;
10715
10716 SDLoc SL(Op);
10717 SDValue X = Op.getOperand(0);
10718 SDValue Y = Op.getOperand(1);
10719
10720 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10721
10722 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10723
10724 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10725
10726 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10727
10728 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10729
10730 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10731
10732 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10733
10734 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10735
10736 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10737
10738 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10739 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10740
10741 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10742 NegDivScale0, Mul, DivScale1);
10743
10744 SDValue Scale;
10745
10746 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10747 // Workaround a hardware bug on SI where the condition output from div_scale
10748 // is not usable.
10749
10750 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10751
10752 // Figure out which scale to use for div_fmas.
10753 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10754 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10755 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10756 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10757
10758 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10759 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10760
10761 SDValue Scale0Hi
10762 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10763 SDValue Scale1Hi
10764 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10765
10766 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10767 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10768 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10769 } else {
10770 Scale = DivScale1.getValue(1);
10771 }
10772
10773 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10774 Fma4, Fma3, Mul, Scale);
10775
10776 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10777}
10778
10779SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10780 EVT VT = Op.getValueType();
10781
10782 if (VT == MVT::f32)
10783 return LowerFDIV32(Op, DAG);
10784
10785 if (VT == MVT::f64)
10786 return LowerFDIV64(Op, DAG);
10787
10788 if (VT == MVT::f16)
10789 return LowerFDIV16(Op, DAG);
10790
10791 llvm_unreachable("Unexpected type for fdiv");
10792}
10793
10794SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10795 SDLoc dl(Op);
10796 SDValue Val = Op.getOperand(0);
10797 EVT VT = Val.getValueType();
10798 EVT ResultExpVT = Op->getValueType(1);
10799 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10800
10801 SDValue Mant = DAG.getNode(
10802 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10803 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10804
10805 SDValue Exp = DAG.getNode(
10806 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10807 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10808
10809 if (Subtarget->hasFractBug()) {
10810 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10811 SDValue Inf = DAG.getConstantFP(
10812 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10813
10814 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10815 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10816 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10817 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10818 }
10819
10820 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10821 return DAG.getMergeValues({Mant, CastExp}, dl);
10822}
10823
10824SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10825 SDLoc DL(Op);
10826 StoreSDNode *Store = cast<StoreSDNode>(Op);
10827 EVT VT = Store->getMemoryVT();
10828
10829 if (VT == MVT::i1) {
10830 return DAG.getTruncStore(Store->getChain(), DL,
10831 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10832 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10833 }
10834
10835 assert(VT.isVector() &&
10836 Store->getValue().getValueType().getScalarType() == MVT::i32);
10837
10838 unsigned AS = Store->getAddressSpace();
10839 if (Subtarget->hasLDSMisalignedBug() &&
10840 AS == AMDGPUAS::FLAT_ADDRESS &&
10841 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10842 return SplitVectorStore(Op, DAG);
10843 }
10844
10845 MachineFunction &MF = DAG.getMachineFunction();
10846 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10847 // If there is a possibility that flat instructions access scratch memory
10848 // then we need to use the same legalization rules we use for private.
10849 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10850 !Subtarget->hasMultiDwordFlatScratchAddressing())
10851 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10852 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10853
10854 unsigned NumElements = VT.getVectorNumElements();
10855 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10856 AS == AMDGPUAS::FLAT_ADDRESS) {
10857 if (NumElements > 4)
10858 return SplitVectorStore(Op, DAG);
10859 // v3 stores not supported on SI.
10860 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10861 return SplitVectorStore(Op, DAG);
10862
10863 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10864 VT, *Store->getMemOperand()))
10865 return expandUnalignedStore(Store, DAG);
10866
10867 return SDValue();
10868 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10869 switch (Subtarget->getMaxPrivateElementSize()) {
10870 case 4:
10871 return scalarizeVectorStore(Store, DAG);
10872 case 8:
10873 if (NumElements > 2)
10874 return SplitVectorStore(Op, DAG);
10875 return SDValue();
10876 case 16:
10877 if (NumElements > 4 ||
10878 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10879 return SplitVectorStore(Op, DAG);
10880 return SDValue();
10881 default:
10882 llvm_unreachable("unsupported private_element_size");
10883 }
10884 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10885 unsigned Fast = 0;
10886 auto Flags = Store->getMemOperand()->getFlags();
10887 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10888 Store->getAlign(), Flags, &Fast) &&
10889 Fast > 1)
10890 return SDValue();
10891
10892 if (VT.isVector())
10893 return SplitVectorStore(Op, DAG);
10894
10895 return expandUnalignedStore(Store, DAG);
10896 }
10897
10898 // Probably an invalid store. If so we'll end up emitting a selection error.
10899 return SDValue();
10900}
10901
10902// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10903SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10904 SDLoc SL(Op);
10905 assert(!Subtarget->has16BitInsts());
10906 SDNodeFlags Flags = Op->getFlags();
10907 SDValue Ext =
10908 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10909
10910 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10911 SDValue Sqrt =
10912 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10913
10914 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10915 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10916}
10917
10918SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10919 SDLoc DL(Op);
10920 SDNodeFlags Flags = Op->getFlags();
10921 MVT VT = Op.getValueType().getSimpleVT();
10922 const SDValue X = Op.getOperand(0);
10923
10924 if (allowApproxFunc(DAG, Flags)) {
10925 // Instruction is 1ulp but ignores denormals.
10926 return DAG.getNode(
10927 ISD::INTRINSIC_WO_CHAIN, DL, VT,
10928 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10929 }
10930
10931 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10932 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10933
10934 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10935
10936 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10937
10938 SDValue SqrtX =
10939 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
10940
10941 SDValue SqrtS;
10942 if (needsDenormHandlingF32(DAG, X, Flags)) {
10943 SDValue SqrtID =
10944 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
10945 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
10946
10947 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
10948 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10949 DAG.getConstant(-1, DL, MVT::i32));
10950 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
10951
10952 SDValue NegSqrtSNextDown =
10953 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
10954
10955 SDValue SqrtVP =
10956 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
10957
10958 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10959 DAG.getConstant(1, DL, MVT::i32));
10960 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
10961
10962 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
10963 SDValue SqrtVS =
10964 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
10965
10966 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
10967 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
10968
10969 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
10970 Flags);
10971
10972 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
10973 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
10974 Flags);
10975 } else {
10976 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
10977
10978 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
10979
10980 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
10981 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
10982 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
10983
10984 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
10985 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
10986 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
10987
10988 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
10989 SDValue SqrtD =
10990 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
10991 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
10992 }
10993
10994 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
10995
10996 SDValue ScaledDown =
10997 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
10998
10999 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11000 SDValue IsZeroOrInf =
11001 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11002 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11003
11004 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11005}
11006
11007SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11008 // For double type, the SQRT and RSQ instructions don't have the required
11009 // precision, so we apply Goldschmidt's algorithm to improve the result:
11010 //
11011 // y0 = rsq(x)
11012 // g0 = x * y0
11013 // h0 = 0.5 * y0
11014 //
11015 // r0 = 0.5 - h0 * g0
11016 // g1 = g0 * r0 + g0
11017 // h1 = h0 * r0 + h0
11018 //
11019 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11020 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11021 // h2 = h1 * r1 + h1
11022 //
11023 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11024 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11025 //
11026 // sqrt(x) = g3
11027
11028 SDNodeFlags Flags = Op->getFlags();
11029
11030 SDLoc DL(Op);
11031
11032 SDValue X = Op.getOperand(0);
11033 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11034
11035 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11036
11037 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11038
11039 // Scale up input if it is too small.
11040 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11041 SDValue ScaleUp =
11042 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11043 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11044
11045 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11046
11047 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11048
11049 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11050 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11051
11052 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11053 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11054
11055 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11056
11057 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11058
11059 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11060 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11061
11062 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11063
11064 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11065 SDValue SqrtD1 =
11066 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11067
11068 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11069
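// Undo the earlier scaling: multiplying the input by 2^256 scales its square
// root by 2^128, so when scaling was applied the result is rescaled here by
// ldexp(..., -128).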
11070 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11071 SDValue ScaleDown =
11072 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11073 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11074
11075 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11076 // with finite only or nsz because rsq(+/-0) = +/-inf
11077
11078 // TODO: Check for DAZ and expand to subnormals
11079 SDValue IsZeroOrInf =
11080 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11081 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11082
11083 // If x is +INF, +0, or -0, use its original value
11084 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11085 Flags);
11086}
11087
11088SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11089 SDLoc DL(Op);
11090 EVT VT = Op.getValueType();
11091 SDValue Arg = Op.getOperand(0);
11092 SDValue TrigVal;
11093
11094 // Propagate fast-math flags so that the multiply we introduce can be folded
11095 // if Arg is already the result of a multiply by constant.
11096 auto Flags = Op->getFlags();
11097
11098 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11099
11100 if (Subtarget->hasTrigReducedRange()) {
11101 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11102 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11103 } else {
11104 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11105 }
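// In both paths the value reaching COS_HW/SIN_HW is the angle in revolutions
// (Arg * 1/(2*pi)), the unit the hardware sin/cos instructions operate on;
// targets with a reduced trig input range additionally need it wrapped into
// [0, 1) via FRACT first.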
11106
11107 switch (Op.getOpcode()) {
11108 case ISD::FCOS:
11109 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11110 case ISD::FSIN:
11111 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11112 default:
11113 llvm_unreachable("Wrong trig opcode");
11114 }
11115}
11116
11117SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11118 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11119 assert(AtomicNode->isCompareAndSwap());
11120 unsigned AS = AtomicNode->getAddressSpace();
11121
11122 // No custom lowering required for local address space
11123 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11124 return Op;
11125
11126 // Non-local address space requires custom lowering for atomic compare
11127 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11128 SDLoc DL(Op);
11129 SDValue ChainIn = Op.getOperand(0);
11130 SDValue Addr = Op.getOperand(1);
11131 SDValue Old = Op.getOperand(2);
11132 SDValue New = Op.getOperand(3);
11133 EVT VT = Op.getValueType();
11134 MVT SimpleVT = VT.getSimpleVT();
11135 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11136
11137 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11138 SDValue Ops[] = { ChainIn, Addr, NewOld };
11139
11140 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11141 Ops, VT, AtomicNode->getMemOperand());
11142}
11143
11144//===----------------------------------------------------------------------===//
11145// Custom DAG optimizations
11146//===----------------------------------------------------------------------===//
11147
11148SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11149 DAGCombinerInfo &DCI) const {
11150 EVT VT = N->getValueType(0);
11151 EVT ScalarVT = VT.getScalarType();
11152 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11153 return SDValue();
11154
11155 SelectionDAG &DAG = DCI.DAG;
11156 SDLoc DL(N);
11157
11158 SDValue Src = N->getOperand(0);
11159 EVT SrcVT = Src.getValueType();
11160
11161 // TODO: We could try to match extracting the higher bytes, which would be
11162 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11163 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11164 // about in practice.
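// Illustrative case: if Src is (and x, 0xff), its upper 24 bits are known zero,
// so the conversion can be emitted as (CVT_F32_UBYTE0 x), i.e. a single
// v_cvt_f32_ubyte0 instruction.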
11165 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11166 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11167 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11168 DCI.AddToWorklist(Cvt.getNode());
11169
11170 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11171 if (ScalarVT != MVT::f32) {
11172 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11173 DAG.getTargetConstant(0, DL, MVT::i32));
11174 }
11175 return Cvt;
11176 }
11177 }
11178
11179 return SDValue();
11180}
11181
11182SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11183 DAGCombinerInfo &DCI) const {
11184 SDValue MagnitudeOp = N->getOperand(0);
11185 SDValue SignOp = N->getOperand(1);
11186 SelectionDAG &DAG = DCI.DAG;
11187 SDLoc DL(N);
11188
11189 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11190 // lower half with a copy.
11191 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11192 if (MagnitudeOp.getValueType() == MVT::f64) {
11193 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11194 SDValue MagLo =
11195 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11196 DAG.getConstant(0, DL, MVT::i32));
11197 SDValue MagHi =
11198 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11199 DAG.getConstant(1, DL, MVT::i32));
11200
11201 SDValue HiOp =
11202 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11203
11204 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11205
11206 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11207 }
11208
11209 if (SignOp.getValueType() != MVT::f64)
11210 return SDValue();
11211
11212 // Reduce width of sign operand, we only need the highest bit.
11213 //
11214 // fcopysign f64:x, f64:y ->
11215 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11216 // TODO: In some cases it might make sense to go all the way to f16.
11217 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11218 SDValue SignAsF32 =
11219 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11220 DAG.getConstant(1, DL, MVT::i32));
11221
11222 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11223 SignAsF32);
11224}
11225
11226// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11227// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11228// bits
11229
11230// This is a variant of
11231// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11232//
11233// The normal DAG combiner will do this, but only if the add has one use since
11234// that would increase the number of instructions.
11235//
11236// This prevents us from seeing a constant offset that can be folded into a
11237// memory instruction's addressing mode. If we know the resulting add offset of
11238// a pointer can be folded into an addressing offset, we can replace the pointer
11239// operand with the add of new constant offset. This eliminates one of the uses,
11240// and may allow the remaining use to also be simplified.
11241//
11242SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11243 unsigned AddrSpace,
11244 EVT MemVT,
11245 DAGCombinerInfo &DCI) const {
11246 SDValue N0 = N->getOperand(0);
11247 SDValue N1 = N->getOperand(1);
11248
11249 // We only do this to handle cases where it's profitable when there are
11250 // multiple uses of the add, so defer to the standard combine.
11251 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11252 N0->hasOneUse())
11253 return SDValue();
11254
11255 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11256 if (!CN1)
11257 return SDValue();
11258
11259 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11260 if (!CAdd)
11261 return SDValue();
11262
11263 SelectionDAG &DAG = DCI.DAG;
11264
11265 if (N0->getOpcode() == ISD::OR &&
11266 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11267 return SDValue();
11268
11269 // If the resulting offset is too large, we can't fold it into the
11270 // addressing mode offset.
11271 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11272 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11273
11274 AddrMode AM;
11275 AM.HasBaseReg = true;
11276 AM.BaseOffs = Offset.getSExtValue();
11277 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11278 return SDValue();
11279
11280 SDLoc SL(N);
11281 EVT VT = N->getValueType(0);
11282
11283 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11284 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11285
11286 SDNodeFlags Flags;
11287 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11288 (N0.getOpcode() == ISD::OR ||
11289 N0->getFlags().hasNoUnsignedWrap()));
11290
11291 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11292}
11293
11294/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11295/// is offset by the chain and intrinsic ID. Theoretically we would also need to
11296/// check the specific intrinsic, but they all place the pointer operand first.
11297static unsigned getBasePtrIndex(const MemSDNode *N) {
11298 switch (N->getOpcode()) {
11299 case ISD::STORE:
11300 case ISD::INTRINSIC_W_CHAIN:
11301 case ISD::INTRINSIC_VOID:
11302 return 2;
11303 default:
11304 return 1;
11305 }
11306}
11307
11308SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11309 DAGCombinerInfo &DCI) const {
11310 SelectionDAG &DAG = DCI.DAG;
11311 SDLoc SL(N);
11312
11313 unsigned PtrIdx = getBasePtrIndex(N);
11314 SDValue Ptr = N->getOperand(PtrIdx);
11315
11316 // TODO: We could also do this for multiplies.
11317 if (Ptr.getOpcode() == ISD::SHL) {
11318 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11319 N->getMemoryVT(), DCI);
11320 if (NewPtr) {
11321 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11322
11323 NewOps[PtrIdx] = NewPtr;
11324 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11325 }
11326 }
11327
11328 return SDValue();
11329}
11330
11331static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11332 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11333 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11334 (Opc == ISD::XOR && Val == 0);
11335}
11336
11337// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11338// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11339// integer combine opportunities since most 64-bit operations are decomposed
11340// this way. TODO: We won't want this for SALU especially if it is an inline
11341// immediate.
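// As an illustration, (and i64:x, 0xffffffff00000000) splits into an AND of the
// high 32 bits with -1, which folds away, and an AND of the low 32 bits with 0,
// so only a 32-bit zero has to be materialized.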
11342SDValue SITargetLowering::splitBinaryBitConstantOp(
11343 DAGCombinerInfo &DCI,
11344 const SDLoc &SL,
11345 unsigned Opc, SDValue LHS,
11346 const ConstantSDNode *CRHS) const {
11347 uint64_t Val = CRHS->getZExtValue();
11348 uint32_t ValLo = Lo_32(Val);
11349 uint32_t ValHi = Hi_32(Val);
11350 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11351
11352 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11353 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11354 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11355 // If we need to materialize a 64-bit immediate, it will be split up later
11356 // anyway. Avoid creating the harder to understand 64-bit immediate
11357 // materialization.
11358 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11359 }
11360
11361 return SDValue();
11362}
11363
11364 static bool isBoolSGPR(SDValue V) {
11365 if (V.getValueType() != MVT::i1)
11366 return false;
11367 switch (V.getOpcode()) {
11368 default:
11369 break;
11370 case ISD::SETCC:
11371 case AMDGPUISD::FP_CLASS:
11372 return true;
11373 case ISD::AND:
11374 case ISD::OR:
11375 case ISD::XOR:
11376 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11377 }
11378 return false;
11379}
11380
11381// If a constant has all zeroes or all ones within each byte return it.
11382// Otherwise return 0.
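// e.g. 0x00ff00ff is returned unchanged, while 0x00ff0012 returns 0 because its
// low byte is only partially selected.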
11383 static uint32_t getConstantPermuteMask(uint32_t C) {
11384 // 0xff for any zero byte in the mask
11385 uint32_t ZeroByteMask = 0;
11386 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11387 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11388 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11389 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11390 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11391 if ((NonZeroByteMask & C) != NonZeroByteMask)
11392 return 0; // Partial bytes selected.
11393 return C;
11394}
11395
11396// Check if a node selects whole bytes from its operand 0 starting at a byte
11397// boundary while masking the rest. Returns select mask as in the v_perm_b32
11398// or -1 if not succeeded.
11399// Note byte select encoding:
11400// value 0-3 selects corresponding source byte;
11401// value 0xc selects zero;
11402// value 0xff selects 0xff.
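// For illustration: (and x, 0x00ff0000) yields the mask 0x0c020c0c (keep byte 2
// of the source, zero the rest), and (shl x, 16) yields 0x01000c0c (source
// bytes 1:0 move to bytes 3:2, the low bytes become zero).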
11403 static uint32_t getPermuteMask(SDValue V) {
11404 assert(V.getValueSizeInBits() == 32);
11405
11406 if (V.getNumOperands() != 2)
11407 return ~0;
11408
11409 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11410 if (!N1)
11411 return ~0;
11412
11413 uint32_t C = N1->getZExtValue();
11414
11415 switch (V.getOpcode()) {
11416 default:
11417 break;
11418 case ISD::AND:
11419 if (uint32_t ConstMask = getConstantPermuteMask(C))
11420 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11421 break;
11422
11423 case ISD::OR:
11424 if (uint32_t ConstMask = getConstantPermuteMask(C))
11425 return (0x03020100 & ~ConstMask) | ConstMask;
11426 break;
11427
11428 case ISD::SHL:
11429 if (C % 8)
11430 return ~0;
11431
11432 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11433
11434 case ISD::SRL:
11435 if (C % 8)
11436 return ~0;
11437
11438 return uint32_t(0x0c0c0c0c03020100ull >> C);
11439 }
11440
11441 return ~0;
11442}
11443
11444SDValue SITargetLowering::performAndCombine(SDNode *N,
11445 DAGCombinerInfo &DCI) const {
11446 if (DCI.isBeforeLegalize())
11447 return SDValue();
11448
11449 SelectionDAG &DAG = DCI.DAG;
11450 EVT VT = N->getValueType(0);
11451 SDValue LHS = N->getOperand(0);
11452 SDValue RHS = N->getOperand(1);
11453
11454
11455 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11456 if (VT == MVT::i64 && CRHS) {
11457 if (SDValue Split
11458 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11459 return Split;
11460 }
11461
11462 if (CRHS && VT == MVT::i32) {
11463 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11464 // nb = number of trailing zeroes in mask
11465 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11466 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
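// e.g. (and (srl x, 8), 0xff00) becomes (shl (bfe x, 16, 8), 8); the extracted
// 8-bit field starts at bit 16, a byte boundary, so the check below succeeds.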
11467 uint64_t Mask = CRHS->getZExtValue();
11468 unsigned Bits = llvm::popcount(Mask);
11469 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11470 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11471 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11472 unsigned Shift = CShift->getZExtValue();
11473 unsigned NB = CRHS->getAPIntValue().countr_zero();
11474 unsigned Offset = NB + Shift;
11475 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11476 SDLoc SL(N);
11477 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11478 LHS->getOperand(0),
11479 DAG.getConstant(Offset, SL, MVT::i32),
11480 DAG.getConstant(Bits, SL, MVT::i32));
11481 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11482 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11483 DAG.getValueType(NarrowVT));
11484 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11485 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11486 return Shl;
11487 }
11488 }
11489 }
11490
11491 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11492 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11493 isa<ConstantSDNode>(LHS.getOperand(2))) {
11494 uint32_t Sel = getConstantPermuteMask(Mask);
11495 if (!Sel)
11496 return SDValue();
11497
11498 // Select 0xc for all zero bytes
11499 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11500 SDLoc DL(N);
11501 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11502 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11503 }
11504 }
11505
11506 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11507 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11508 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11509 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11510 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11511
11512 SDValue X = LHS.getOperand(0);
11513 SDValue Y = RHS.getOperand(0);
11514 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11515 !isTypeLegal(X.getValueType()))
11516 return SDValue();
11517
11518 if (LCC == ISD::SETO) {
11519 if (X != LHS.getOperand(1))
11520 return SDValue();
11521
11522 if (RCC == ISD::SETUNE) {
11523 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11524 if (!C1 || !C1->isInfinity() || C1->isNegative())
11525 return SDValue();
11526
11527 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11528 SIInstrFlags::N_SUBNORMAL |
11529 SIInstrFlags::N_ZERO |
11530 SIInstrFlags::P_ZERO |
11531 SIInstrFlags::P_SUBNORMAL |
11532 SIInstrFlags::P_NORMAL;
11533
11534 static_assert(((~(SIInstrFlags::S_NAN |
11535 SIInstrFlags::Q_NAN |
11536 SIInstrFlags::N_INFINITY |
11537 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11538 "mask not equal");
11539
11540 SDLoc DL(N);
11541 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11542 X, DAG.getConstant(Mask, DL, MVT::i32));
11543 }
11544 }
11545 }
11546
11547 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11548 std::swap(LHS, RHS);
11549
11550 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11551 RHS.hasOneUse()) {
11552 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11553 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11554 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11555 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11556 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11557 (RHS.getOperand(0) == LHS.getOperand(0) &&
11558 LHS.getOperand(0) == LHS.getOperand(1))) {
11559 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11560 unsigned NewMask = LCC == ISD::SETO ?
11561 Mask->getZExtValue() & ~OrdMask :
11562 Mask->getZExtValue() & OrdMask;
11563
11564 SDLoc DL(N);
11565 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11566 DAG.getConstant(NewMask, DL, MVT::i32));
11567 }
11568 }
11569
11570 if (VT == MVT::i32 &&
11571 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11572 // and x, (sext cc from i1) => select cc, x, 0
11573 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11574 std::swap(LHS, RHS);
11575 if (isBoolSGPR(RHS.getOperand(0)))
11576 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11577 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11578 }
11579
11580 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11581 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11582 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11583 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11584 uint32_t LHSMask = getPermuteMask(LHS);
11585 uint32_t RHSMask = getPermuteMask(RHS);
11586 if (LHSMask != ~0u && RHSMask != ~0u) {
11587 // Canonicalize the expression in an attempt to have fewer unique masks
11588 // and therefore fewer registers used to hold the masks.
11589 if (LHSMask > RHSMask) {
11590 std::swap(LHSMask, RHSMask);
11591 std::swap(LHS, RHS);
11592 }
11593
11594 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11595 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11596 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11597 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11598
11599 // Check if we need to combine values from two sources within a byte.
11600 if (!(LHSUsedLanes & RHSUsedLanes) &&
11601 // If we select high and lower word keep it for SDWA.
11602 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11603 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11604 // Each byte in each mask is either selector mask 0-3, or has higher
11605 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11606 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11607 // mask which is not 0xff wins. By anding both masks we have a correct
11608 // result except that 0x0c shall be corrected to give 0x0c only.
11609 uint32_t Mask = LHSMask & RHSMask;
11610 for (unsigned I = 0; I < 32; I += 8) {
11611 uint32_t ByteSel = 0xff << I;
11612 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11613 Mask &= (0x0c << I) & 0xffffffff;
11614 }
11615
11616 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11617 // or 0x0c.
11618 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11619 SDLoc DL(N);
11620
11621 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11622 LHS.getOperand(0), RHS.getOperand(0),
11623 DAG.getConstant(Sel, DL, MVT::i32));
11624 }
11625 }
11626 }
11627
11628 return SDValue();
11629}
11630
11631// A key component of v_perm is a mapping between byte position of the src
11632// operands, and the byte position of the dest. To provide such, we need: 1. the
11633// node that provides x byte of the dest of the OR, and 2. the byte of the node
11634// used to provide that x byte. calculateByteProvider finds which node provides
11635// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11636// and finds an ultimate src and byte position For example: The supported
11637// LoadCombine pattern for vector loads is as follows
11638// t1
11639// or
11640// / \
11641// t2 t3
11642// zext shl
11643// | | \
11644// t4 t5 16
11645// or anyext
11646// / \ |
11647// t6 t7 t8
11648// srl shl or
11649// / | / \ / \
11650// t9 t10 t11 t12 t13 t14
11651// trunc* 8 trunc* 8 and and
11652// | | / | | \
11653// t15 t16 t17 t18 t19 t20
11654// trunc* 255 srl -256
11655// | / \
11656// t15 t15 16
11657//
11658// *In this example, the truncs are from i32->i16
11659//
11660// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11661// respectively. calculateSrcByte would find (given node) -> ultimate src &
11662 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11663// After finding the mapping, we can combine the tree into vperm t15, t16,
11664// 0x05000407
11665
11666// Find the source and byte position from a node.
11667// \p DestByte is the byte position of the dest of the or that the src
11668// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11669// dest of the or byte. \p Depth tracks how many recursive iterations we have
11670// performed.
11671static const std::optional<ByteProvider<SDValue>>
11672calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11673 unsigned Depth = 0) {
11674 // We may need to recursively traverse a series of SRLs
11675 if (Depth >= 6)
11676 return std::nullopt;
11677
11678 if (Op.getValueSizeInBits() < 8)
11679 return std::nullopt;
11680
11681 if (Op.getValueType().isVector())
11682 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11683
11684 switch (Op->getOpcode()) {
11685 case ISD::TRUNCATE: {
11686 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11687 }
11688
11689 case ISD::SIGN_EXTEND:
11690 case ISD::ZERO_EXTEND:
11691 case ISD::SIGN_EXTEND_INREG: {
11692 SDValue NarrowOp = Op->getOperand(0);
11693 auto NarrowVT = NarrowOp.getValueType();
11694 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11695 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11696 NarrowVT = VTSign->getVT();
11697 }
11698 if (!NarrowVT.isByteSized())
11699 return std::nullopt;
11700 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11701
11702 if (SrcIndex >= NarrowByteWidth)
11703 return std::nullopt;
11704 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11705 }
11706
11707 case ISD::SRA:
11708 case ISD::SRL: {
11709 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11710 if (!ShiftOp)
11711 return std::nullopt;
11712
11713 uint64_t BitShift = ShiftOp->getZExtValue();
11714
11715 if (BitShift % 8 != 0)
11716 return std::nullopt;
11717
11718 SrcIndex += BitShift / 8;
11719
11720 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11721 }
11722
11723 default: {
11724 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11725 }
11726 }
11727 llvm_unreachable("fully handled switch");
11728}
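// For example, with illustrative values: calculateSrcByte on Op = (srl t0, 16)
// with DestByte = 0 starts with SrcIndex = 0, takes the SRL case
// (BitShift = 16, so SrcIndex += 2) and recurses on t0; if t0 is a plain
// 32-bit value, the default case returns the provider {t0, DestByte 0,
// SrcIndex 2}, i.e. the requested dest byte is ultimately byte 2 of t0.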
11729
11730// For a byte position in the result of an Or, traverse the tree and find the
11731// node (and the byte of the node) which ultimately provides this {Or,
11732// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11733// the byte position of the Op that corresponds with the originally requested
11734// byte of the Or. \p Depth tracks how many recursive iterations we have
11735// performed. \p StartingIndex is the originally requested byte of the Or.
11736static const std::optional<ByteProvider<SDValue>>
11737calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11738 unsigned StartingIndex = 0) {
11739 // Finding Src tree of RHS of or typically requires at least 1 additional
11740 // depth
11741 if (Depth > 6)
11742 return std::nullopt;
11743
11744 unsigned BitWidth = Op.getScalarValueSizeInBits();
11745 if (BitWidth % 8 != 0)
11746 return std::nullopt;
11747 if (Index > BitWidth / 8 - 1)
11748 return std::nullopt;
11749
11750 bool IsVec = Op.getValueType().isVector();
11751 switch (Op.getOpcode()) {
11752 case ISD::OR: {
11753 if (IsVec)
11754 return std::nullopt;
11755
11756 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11757 StartingIndex);
11758 if (!RHS)
11759 return std::nullopt;
11760 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11761 StartingIndex);
11762 if (!LHS)
11763 return std::nullopt;
11764 // A well formed Or will have two ByteProviders for each byte, one of which
11765 // is constant zero
11766 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11767 return std::nullopt;
11768 if (!LHS || LHS->isConstantZero())
11769 return RHS;
11770 if (!RHS || RHS->isConstantZero())
11771 return LHS;
11772 return std::nullopt;
11773 }
11774
11775 case ISD::AND: {
11776 if (IsVec)
11777 return std::nullopt;
11778
11779 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11780 if (!BitMaskOp)
11781 return std::nullopt;
11782
11783 uint32_t BitMask = BitMaskOp->getZExtValue();
11784 // Bits we expect for our StartingIndex
11785 uint32_t IndexMask = 0xFF << (Index * 8);
11786
11787 if ((IndexMask & BitMask) != IndexMask) {
11788 // If the result of the and partially provides the byte, then it
11789 // is not well formatted
11790 if (IndexMask & BitMask)
11791 return std::nullopt;
11792 return ByteProvider<SDValue>::getConstantZero();
11793 }
11794
11795 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11796 }
11797
11798 case ISD::FSHR: {
11799 if (IsVec)
11800 return std::nullopt;
11801
11802 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11803 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11804 if (!ShiftOp || Op.getValueType().isVector())
11805 return std::nullopt;
11806
11807 uint64_t BitsProvided = Op.getValueSizeInBits();
11808 if (BitsProvided % 8 != 0)
11809 return std::nullopt;
11810
11811 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11812 if (BitShift % 8)
11813 return std::nullopt;
11814
11815 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11816 uint64_t ByteShift = BitShift / 8;
11817
11818 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11819 uint64_t BytesProvided = BitsProvided / 8;
11820 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11821 NewIndex %= BytesProvided;
11822 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11823 }
11824
11825 case ISD::SRA:
11826 case ISD::SRL: {
11827 if (IsVec)
11828 return std::nullopt;
11829
11830 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11831 if (!ShiftOp)
11832 return std::nullopt;
11833
11834 uint64_t BitShift = ShiftOp->getZExtValue();
11835 if (BitShift % 8)
11836 return std::nullopt;
11837
11838 auto BitsProvided = Op.getScalarValueSizeInBits();
11839 if (BitsProvided % 8 != 0)
11840 return std::nullopt;
11841
11842 uint64_t BytesProvided = BitsProvided / 8;
11843 uint64_t ByteShift = BitShift / 8;
11844 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11845 // If the byte we are trying to provide (as tracked by index) falls in this
11846 // range, then the SRL provides the byte. The byte of interest of the src of
11847 // the SRL is Index + ByteShift
11848 return BytesProvided - ByteShift > Index
11849 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11850 Index + ByteShift)
11851 : std::nullopt;
11852 }
11853
11854 case ISD::SHL: {
11855 if (IsVec)
11856 return std::nullopt;
11857
11858 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11859 if (!ShiftOp)
11860 return std::nullopt;
11861
11862 uint64_t BitShift = ShiftOp->getZExtValue();
11863 if (BitShift % 8 != 0)
11864 return std::nullopt;
11865 uint64_t ByteShift = BitShift / 8;
11866
11867 // If we are shifting by an amount greater than (or equal to)
11868 // the index we are trying to provide, then it provides 0s. If not,
11869 // then these bytes are not definitively 0s, and the corresponding byte
11870 // of interest is Index - ByteShift of the src
11871 return Index < ByteShift
11872 ? ByteProvider<SDValue>::getConstantZero()
11873 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11874 Depth + 1, StartingIndex);
11875 }
11876 case ISD::ANY_EXTEND:
11877 case ISD::SIGN_EXTEND:
11878 case ISD::ZERO_EXTEND:
11879 case ISD::SIGN_EXTEND_INREG:
11880 case ISD::AssertZext:
11881 case ISD::AssertSext: {
11882 if (IsVec)
11883 return std::nullopt;
11884
11885 SDValue NarrowOp = Op->getOperand(0);
11886 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11887 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11888 Op->getOpcode() == ISD::AssertZext ||
11889 Op->getOpcode() == ISD::AssertSext) {
11890 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11891 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11892 }
11893 if (NarrowBitWidth % 8 != 0)
11894 return std::nullopt;
11895 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11896
11897 if (Index >= NarrowByteWidth)
11898 return Op.getOpcode() == ISD::ZERO_EXTEND
11899 ? std::optional<ByteProvider<SDValue>>(
11900 ByteProvider<SDValue>::getConstantZero())
11901 : std::nullopt;
11902 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11903 }
11904
11905 case ISD::TRUNCATE: {
11906 if (IsVec)
11907 return std::nullopt;
11908
11909 uint64_t NarrowByteWidth = BitWidth / 8;
11910
11911 if (NarrowByteWidth >= Index) {
11912 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11913 StartingIndex);
11914 }
11915
11916 return std::nullopt;
11917 }
11918
11919 case ISD::CopyFromReg: {
11920 if (BitWidth / 8 > Index)
11921 return calculateSrcByte(Op, StartingIndex, Index);
11922
11923 return std::nullopt;
11924 }
11925
11926 case ISD::LOAD: {
11927 auto L = cast<LoadSDNode>(Op.getNode());
11928
11929 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11930 if (NarrowBitWidth % 8 != 0)
11931 return std::nullopt;
11932 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11933
11934 // If the width of the load does not reach the byte we are trying to provide
11935 // and it is not a ZEXTLOAD, then the load does not provide the byte in
11936 // question
11937 if (Index >= NarrowByteWidth) {
11938 return L->getExtensionType() == ISD::ZEXTLOAD
11939 ? std::optional<ByteProvider<SDValue>>(
11940 ByteProvider<SDValue>::getConstantZero())
11941 : std::nullopt;
11942 }
11943
11944 if (NarrowByteWidth > Index) {
11945 return calculateSrcByte(Op, StartingIndex, Index);
11946 }
11947
11948 return std::nullopt;
11949 }
11950
11951 case ISD::BSWAP: {
11952 if (IsVec)
11953 return std::nullopt;
11954
11955 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
11956 Depth + 1, StartingIndex);
11957 }
11958
11959 case ISD::EXTRACT_VECTOR_ELT: {
11960 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11961 if (!IdxOp)
11962 return std::nullopt;
11963 auto VecIdx = IdxOp->getZExtValue();
11964 auto ScalarSize = Op.getScalarValueSizeInBits();
11965 if (ScalarSize != 32) {
11966 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
11967 }
11968
11969 return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
11970 StartingIndex, Index);
11971 }
11972
11973 case AMDGPUISD::PERM: {
11974 if (IsVec)
11975 return std::nullopt;
11976
11977 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11978 if (!PermMask)
11979 return std::nullopt;
11980
11981 auto IdxMask =
11982 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
11983 if (IdxMask > 0x07 && IdxMask != 0x0c)
11984 return std::nullopt;
11985
11986 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
11987 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
11988
11989 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
11990 : ByteProvider<SDValue>(
11991 ByteProvider<SDValue>::getConstantZero());
11992 }
11993
11994 default: {
11995 return std::nullopt;
11996 }
11997 }
11998
11999 llvm_unreachable("fully handled switch");
12000}
12001
12002 // Returns true if the Operand is a scalar that is, or was extended from, a 16-bit value
12003static bool isExtendedFrom16Bits(SDValue &Operand) {
12004
12005 switch (Operand.getOpcode()) {
12006 case ISD::ANY_EXTEND:
12007 case ISD::SIGN_EXTEND:
12008 case ISD::ZERO_EXTEND: {
12009 auto OpVT = Operand.getOperand(0).getValueType();
12010 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12011 }
12012 case ISD::LOAD: {
12013 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12014 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12015 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12016 ExtType == ISD::EXTLOAD) {
12017 auto MemVT = L->getMemoryVT();
12018 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12019 }
12020 return L->getMemoryVT().getSizeInBits() == 16;
12021 }
12022 default:
12023 return false;
12024 }
12025}
12026
12027 // Returns true if the mask matches consecutive bytes, and the first byte
12028 // begins at an even (16-bit aligned) offset from the 0th byte
12029static bool addresses16Bits(int Mask) {
12030 int Low8 = Mask & 0xff;
12031 int Hi8 = (Mask & 0xff00) >> 8;
12032
12033 assert(Low8 < 8 && Hi8 < 8);
12034 // Are the bytes contiguous in the order of increasing addresses.
12035 bool IsConsecutive = (Hi8 - Low8 == 1);
12036 // Is the first byte at location that is aligned for 16 bit instructions.
12037 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12038 // In this case, we still need code to extract the 16 bit operand, so it
12039 // is better to use i8 v_perm
12040 bool Is16Aligned = !(Low8 % 2);
12041
12042 return IsConsecutive && Is16Aligned;
12043}
12044
12045// Do not lower into v_perm if the operands are actually 16 bit
12046// and the selected bits (based on PermMask) correspond with two
12047// easily addressable 16 bit operands.
12048 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12049 SDValue &OtherOp) {
12050 int Low16 = PermMask & 0xffff;
12051 int Hi16 = (PermMask & 0xffff0000) >> 16;
12052
12053 auto TempOp = peekThroughBitcasts(Op);
12054 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12055
12056 auto OpIs16Bit =
12057 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12058 if (!OpIs16Bit)
12059 return true;
12060
12061 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12062 isExtendedFrom16Bits(TempOtherOp);
12063 if (!OtherOpIs16Bit)
12064 return true;
12065
12066 // Do we cleanly address both
12067 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12068}
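// For example, with illustrative 16-bit operands: PermMask 0x03020504 pairs
// selector bytes (5,4) and (3,2), each a consecutive, 16-bit-aligned pair, so
// this returns false and the caller keeps the 16-bit form. A mask such as
// 0x03020605 pairs (6,5), which starts at an odd byte, so it returns true and
// a v_perm is still considered.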
12069
12070 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12071 unsigned DWordOffset) {
12072 SDValue Ret;
12073
12074 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12075 // ByteProvider must be at least 8 bits
12076 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12077
12078 if (TypeSize <= 32)
12079 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12080
12081 if (Src.getValueType().isVector()) {
12082 auto ScalarTySize = Src.getScalarValueSizeInBits();
12083 auto ScalarTy = Src.getValueType().getScalarType();
12084 if (ScalarTySize == 32) {
12085 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12086 DAG.getConstant(DWordOffset, SL, MVT::i32));
12087 }
12088 if (ScalarTySize > 32) {
12089 Ret = DAG.getNode(
12090 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12091 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12092 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12093 if (ShiftVal)
12094 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12095 DAG.getConstant(ShiftVal, SL, MVT::i32));
12096 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12097 }
12098
12099 assert(ScalarTySize < 32);
12100 auto NumElements = TypeSize / ScalarTySize;
12101 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12102 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12103 auto NumElementsIn32 = 32 / ScalarTySize;
12104 auto NumAvailElements = DWordOffset < Trunc32Elements
12105 ? NumElementsIn32
12106 : NumElements - NormalizedTrunc;
12107
12108 SmallVector<SDValue, 4> VecSrcs;
12109 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12110 NumAvailElements);
12111
12112 Ret = DAG.getBuildVector(
12113 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12114 VecSrcs);
12115 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12116 }
12117
12118 /// Scalar Type
12119 auto ShiftVal = 32 * DWordOffset;
12120 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12121 DAG.getConstant(ShiftVal, SL, MVT::i32));
12122 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12123}
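// For example, with illustrative inputs: for a 64-bit scalar Src and
// DWordOffset = 1 this emits (srl Src, 32) and truncates the result to i32;
// for a v4i16 Src and DWordOffset = 1 it extracts elements 2 and 3, rebuilds
// them as a v2i16 and bitcasts that to i32.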
12124
12125 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12126 SelectionDAG &DAG = DCI.DAG;
12127 [[maybe_unused]] EVT VT = N->getValueType(0);
12128 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12129
12130 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12131 assert(VT == MVT::i32);
12132 for (int i = 0; i < 4; i++) {
12133 // Find the ByteProvider that provides the ith byte of the result of OR
12134 std::optional<ByteProvider<SDValue>> P =
12135 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12136 // TODO support constantZero
12137 if (!P || P->isConstantZero())
12138 return SDValue();
12139
12140 PermNodes.push_back(*P);
12141 }
12142 if (PermNodes.size() != 4)
12143 return SDValue();
12144
12145 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12146 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12147 uint64_t PermMask = 0x00000000;
12148 for (size_t i = 0; i < PermNodes.size(); i++) {
12149 auto PermOp = PermNodes[i];
12150 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12151 // by sizeof(Src2) = 4
12152 int SrcByteAdjust = 4;
12153
12154 // If the Src uses a byte from a different DWORD, then it corresponds
12155 // with a different source
12156 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12157 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12158 if (SecondSrc)
12159 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12160 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12161 return SDValue();
12162
12163 // Set the index of the second distinct Src node
12164 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12165 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12166 SrcByteAdjust = 0;
12167 }
12168 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12170 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12171 }
12172 SDLoc DL(N);
12173 SDValue Op = *PermNodes[FirstSrc.first].Src;
12174 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12175 assert(Op.getValueSizeInBits() == 32);
12176
12177 // Check that we are not just extracting the bytes in order from an op
12178 if (!SecondSrc) {
12179 int Low16 = PermMask & 0xffff;
12180 int Hi16 = (PermMask & 0xffff0000) >> 16;
12181
12182 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12183 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12184
12185 // The perm op would really just produce Op. So combine into Op
12186 if (WellFormedLow && WellFormedHi)
12187 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12188 }
12189
12190 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12191
12192 if (SecondSrc) {
12193 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12194 assert(OtherOp.getValueSizeInBits() == 32);
12195 }
12196
12197 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12198
12199 assert(Op.getValueType().isByteSized() &&
12200 OtherOp.getValueType().isByteSized());
12201
12202 // If the ultimate src is less than 32 bits, then we will only be
12203 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12204 // CalculateByteProvider would not have returned Op as source if we
12205 // used a byte that is outside its ValueType. Thus, we are free to
12206 // ANY_EXTEND as the extended bits are dont-cares.
12207 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12208 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12209
12210 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12211 DAG.getConstant(PermMask, DL, MVT::i32));
12212 }
12213 return SDValue();
12214}
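// For example, with illustrative values: if matchPERM finds that all four
// bytes of the or are provided, in order, by a single 32-bit source, the
// PermMask built above is 0x07060504 and the combine simply returns that
// dword, bitcast to i32. If the bytes instead come back byte-swapped within
// each half, e.g. PermMask 0x06070405, the well-formed check fails and a
// v_perm with that selector is emitted (subject to the 16-bit check above).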
12215
12216SDValue SITargetLowering::performOrCombine(SDNode *N,
12217 DAGCombinerInfo &DCI) const {
12218 SelectionDAG &DAG = DCI.DAG;
12219 SDValue LHS = N->getOperand(0);
12220 SDValue RHS = N->getOperand(1);
12221
12222 EVT VT = N->getValueType(0);
12223 if (VT == MVT::i1) {
12224 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12225 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12226 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12227 SDValue Src = LHS.getOperand(0);
12228 if (Src != RHS.getOperand(0))
12229 return SDValue();
12230
12231 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12232 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12233 if (!CLHS || !CRHS)
12234 return SDValue();
12235
12236 // Only 10 bits are used.
12237 static const uint32_t MaxMask = 0x3ff;
12238
12239 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12240 SDLoc DL(N);
12241 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12242 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12243 }
12244
12245 return SDValue();
12246 }
12247
12248 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12249 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12250 LHS.getOpcode() == AMDGPUISD::PERM &&
12251 isa<ConstantSDNode>(LHS.getOperand(2))) {
12252 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12253 if (!Sel)
12254 return SDValue();
12255
12256 Sel |= LHS.getConstantOperandVal(2);
12257 SDLoc DL(N);
12258 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12259 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12260 }
12261
12262 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12263 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12264 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12265 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12266
12267 // If all the uses of an or need to extract the individual elements, do not
12268 // attempt to lower into v_perm
12269 auto usesCombinedOperand = [](SDNode *OrUse) {
12270 // If we have any non-vectorized use, then it is a candidate for v_perm
12271 if (OrUse->getOpcode() != ISD::BITCAST ||
12272 !OrUse->getValueType(0).isVector())
12273 return true;
12274
12275 // If we have any non-vectorized use, then it is a candidate for v_perm
12276 for (auto VUse : OrUse->uses()) {
12277 if (!VUse->getValueType(0).isVector())
12278 return true;
12279
12280 // If the use of a vector is a store, then combining via a v_perm
12281 // is beneficial.
12282 // TODO -- whitelist more uses
12283 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12284 if (VUse->getOpcode() == VectorwiseOp)
12285 return true;
12286 }
12287 return false;
12288 };
12289
12290 if (!any_of(N->uses(), usesCombinedOperand))
12291 return SDValue();
12292
12293 uint32_t LHSMask = getPermuteMask(LHS);
12294 uint32_t RHSMask = getPermuteMask(RHS);
12295
12296 if (LHSMask != ~0u && RHSMask != ~0u) {
12297 // Canonicalize the expression in an attempt to have fewer unique masks
12298 // and therefore fewer registers used to hold the masks.
12299 if (LHSMask > RHSMask) {
12300 std::swap(LHSMask, RHSMask);
12301 std::swap(LHS, RHS);
12302 }
12303
12304 // Select 0xc for each lane used from the source operand. Zero bytes have
12305 // 0xc set in the mask, unknown bytes have 0xff, and actual lanes are in the 0-3 range.
12306 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12307 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12308
12309 // Check if we need to combine values from two sources within a byte.
12310 if (!(LHSUsedLanes & RHSUsedLanes) &&
12311 // If we select the high and low words, keep it for SDWA.
12312 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12313 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12314 // Kill zero bytes selected by other mask. Zero value is 0xc.
12315 LHSMask &= ~RHSUsedLanes;
12316 RHSMask &= ~LHSUsedLanes;
12317 // Add 4 to each active LHS lane
12318 LHSMask |= LHSUsedLanes & 0x04040404;
12319 // Combine masks
12320 uint32_t Sel = LHSMask | RHSMask;
12321 SDLoc DL(N);
12322
12323 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12324 LHS.getOperand(0), RHS.getOperand(0),
12325 DAG.getConstant(Sel, DL, MVT::i32));
12326 }
12327 }
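// For example, with illustrative masks after canonicalization:
// LHSMask = 0x0302010c (bytes 1-3 from LHS lanes 1-3, byte 0 zero) and
// RHSMask = 0x0c0c0c00 (byte 0 from RHS lane 0, bytes 1-3 zero) give
// LHSUsedLanes = 0x0c0c0c00 and RHSUsedLanes = 0x0000000c. Killing the zero
// bytes and adding 4 to the active LHS lanes yields Sel = 0x07060500, i.e.
// result byte 0 comes from the RHS source and bytes 1-3 from the LHS source.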
12328 if (LHSMask == ~0u || RHSMask == ~0u) {
12329 if (SDValue Perm = matchPERM(N, DCI))
12330 return Perm;
12331 }
12332 }
12333
12334 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12335 return SDValue();
12336
12337 // TODO: This could be a generic combine with a predicate for extracting the
12338 // high half of an integer being free.
12339
12340 // (or i64:x, (zero_extend i32:y)) ->
12341 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12342 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12343 RHS.getOpcode() != ISD::ZERO_EXTEND)
12344 std::swap(LHS, RHS);
12345
12346 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12347 SDValue ExtSrc = RHS.getOperand(0);
12348 EVT SrcVT = ExtSrc.getValueType();
12349 if (SrcVT == MVT::i32) {
12350 SDLoc SL(N);
12351 SDValue LowLHS, HiBits;
12352 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12353 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12354
12355 DCI.AddToWorklist(LowOr.getNode());
12356 DCI.AddToWorklist(HiBits.getNode());
12357
12358 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12359 LowOr, HiBits);
12360 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12361 }
12362 }
12363
12364 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12365 if (CRHS) {
12366 if (SDValue Split
12367 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12368 N->getOperand(0), CRHS))
12369 return Split;
12370 }
12371
12372 return SDValue();
12373}
12374
12375SDValue SITargetLowering::performXorCombine(SDNode *N,
12376 DAGCombinerInfo &DCI) const {
12377 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12378 return RV;
12379
12380 SDValue LHS = N->getOperand(0);
12381 SDValue RHS = N->getOperand(1);
12382
12383 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12384 SelectionDAG &DAG = DCI.DAG;
12385
12386 EVT VT = N->getValueType(0);
12387 if (CRHS && VT == MVT::i64) {
12388 if (SDValue Split
12389 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12390 return Split;
12391 }
12392
12393 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12394 // fneg-like xors into 64-bit select.
12395 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12396 // This looks like an fneg, try to fold as a source modifier.
12397 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12398 shouldFoldFNegIntoSrc(N, LHS)) {
12399 // xor (select c, a, b), 0x80000000 ->
12400 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12401 SDLoc DL(N);
12402 SDValue CastLHS =
12403 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12404 SDValue CastRHS =
12405 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12406 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12407 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12408 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12409 LHS->getOperand(0), FNegLHS, FNegRHS);
12410 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12411 }
12412 }
12413
12414 return SDValue();
12415}
12416
12417SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12418 DAGCombinerInfo &DCI) const {
12419 if (!Subtarget->has16BitInsts() ||
12420 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12421 return SDValue();
12422
12423 EVT VT = N->getValueType(0);
12424 if (VT != MVT::i32)
12425 return SDValue();
12426
12427 SDValue Src = N->getOperand(0);
12428 if (Src.getValueType() != MVT::i16)
12429 return SDValue();
12430
12431 return SDValue();
12432}
12433
12434SDValue
12435SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12436 DAGCombinerInfo &DCI) const {
12437 SDValue Src = N->getOperand(0);
12438 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12439
12440 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12441 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12442 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12443 VTSign->getVT() == MVT::i8) ||
12444 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12445 VTSign->getVT() == MVT::i16))) {
12446 assert(Subtarget->hasScalarSubwordLoads() &&
12447 "s_buffer_load_{u8, i8} are supported "
12448 "in GFX12 (or newer) architectures.");
12449 EVT VT = Src.getValueType();
12450 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12451 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12452 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12453 SDLoc DL(N);
12454 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12455 SDValue Ops[] = {
12456 Src.getOperand(0), // source register
12457 Src.getOperand(1), // offset
12458 Src.getOperand(2) // cachePolicy
12459 };
12460 auto *M = cast<MemSDNode>(Src);
12461 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12462 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12463 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12464 return LoadVal;
12465 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12466 VTSign->getVT() == MVT::i8) ||
12467 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12468 VTSign->getVT() == MVT::i16)) &&
12469 Src.hasOneUse()) {
12470 auto *M = cast<MemSDNode>(Src);
12471 SDValue Ops[] = {
12472 Src.getOperand(0), // Chain
12473 Src.getOperand(1), // rsrc
12474 Src.getOperand(2), // vindex
12475 Src.getOperand(3), // voffset
12476 Src.getOperand(4), // soffset
12477 Src.getOperand(5), // offset
12478 Src.getOperand(6),
12479 Src.getOperand(7)
12480 };
12481 // replace with BUFFER_LOAD_BYTE/SHORT
12482 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12483 Src.getOperand(0).getValueType());
12484 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12485 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12486 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12487 ResList,
12488 Ops, M->getMemoryVT(),
12489 M->getMemOperand());
12490 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12491 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12492 }
12493 return SDValue();
12494}
12495
12496SDValue SITargetLowering::performClassCombine(SDNode *N,
12497 DAGCombinerInfo &DCI) const {
12498 SelectionDAG &DAG = DCI.DAG;
12499 SDValue Mask = N->getOperand(1);
12500
12501 // fp_class x, 0 -> false
12502 if (isNullConstant(Mask))
12503 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12504
12505 if (N->getOperand(0).isUndef())
12506 return DAG.getUNDEF(MVT::i1);
12507
12508 return SDValue();
12509}
12510
12511SDValue SITargetLowering::performRcpCombine(SDNode *N,
12512 DAGCombinerInfo &DCI) const {
12513 EVT VT = N->getValueType(0);
12514 SDValue N0 = N->getOperand(0);
12515
12516 if (N0.isUndef()) {
12517 return DCI.DAG.getConstantFP(
12518 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12519 VT);
12520 }
12521
12522 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12523 N0.getOpcode() == ISD::SINT_TO_FP)) {
12524 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12525 N->getFlags());
12526 }
12527
12528 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12529 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12530 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12531 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12532 N0.getOperand(0), N->getFlags());
12533 }
12534
12535 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12536}
12537
12538 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12539 unsigned MaxDepth) const {
12540 unsigned Opcode = Op.getOpcode();
12541 if (Opcode == ISD::FCANONICALIZE)
12542 return true;
12543
12544 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12545 const auto &F = CFP->getValueAPF();
12546 if (F.isNaN() && F.isSignaling())
12547 return false;
12548 if (!F.isDenormal())
12549 return true;
12550
12551 DenormalMode Mode =
12552 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12553 return Mode == DenormalMode::getIEEE();
12554 }
12555
12556 // If source is a result of another standard FP operation it is already in
12557 // canonical form.
12558 if (MaxDepth == 0)
12559 return false;
12560
12561 switch (Opcode) {
12562 // These will flush denorms if required.
12563 case ISD::FADD:
12564 case ISD::FSUB:
12565 case ISD::FMUL:
12566 case ISD::FCEIL:
12567 case ISD::FFLOOR:
12568 case ISD::FMA:
12569 case ISD::FMAD:
12570 case ISD::FSQRT:
12571 case ISD::FDIV:
12572 case ISD::FREM:
12573 case ISD::FP_ROUND:
12574 case ISD::FP_EXTEND:
12575 case ISD::FP16_TO_FP:
12576 case ISD::FP_TO_FP16:
12577 case ISD::BF16_TO_FP:
12578 case ISD::FP_TO_BF16:
12579 case ISD::FLDEXP:
12582 case AMDGPUISD::RCP:
12583 case AMDGPUISD::RSQ:
12587 case AMDGPUISD::LOG:
12588 case AMDGPUISD::EXP:
12592 case AMDGPUISD::FRACT:
12599 case AMDGPUISD::SIN_HW:
12600 case AMDGPUISD::COS_HW:
12601 return true;
12602
12603 // It can/will be lowered or combined as a bit operation.
12604 // Need to check their input recursively to handle.
12605 case ISD::FNEG:
12606 case ISD::FABS:
12607 case ISD::FCOPYSIGN:
12608 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12609
12610 case ISD::AND:
12611 if (Op.getValueType() == MVT::i32) {
12612 // Be careful as we only know it is a bitcast floating point type. It
12613 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12614 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12615 // is valid to optimize for all types.
12616 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12617 if (RHS->getZExtValue() == 0xffff0000) {
12618 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12619 }
12620 }
12621 }
12622 break;
12623
12624 case ISD::FSIN:
12625 case ISD::FCOS:
12626 case ISD::FSINCOS:
12627 return Op.getValueType().getScalarType() != MVT::f16;
12628
12629 case ISD::FMINNUM:
12630 case ISD::FMAXNUM:
12631 case ISD::FMINNUM_IEEE:
12632 case ISD::FMAXNUM_IEEE:
12633 case ISD::FMINIMUM:
12634 case ISD::FMAXIMUM:
12635 case AMDGPUISD::CLAMP:
12636 case AMDGPUISD::FMED3:
12637 case AMDGPUISD::FMAX3:
12638 case AMDGPUISD::FMIN3:
12639 case AMDGPUISD::FMAXIMUM3:
12640 case AMDGPUISD::FMINIMUM3: {
12641 // FIXME: Shouldn't treat the generic operations differently based on these.
12642 // However, we aren't really required to flush the result from
12643 // minnum/maxnum.
12644
12645 // snans will be quieted, so we only need to worry about denormals.
12646 if (Subtarget->supportsMinMaxDenormModes() ||
12647 // FIXME: denormalsEnabledForType is broken for dynamic
12648 denormalsEnabledForType(DAG, Op.getValueType()))
12649 return true;
12650
12651 // Flushing may be required.
12652 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12653 // targets need to check their input recursively.
12654
12655 // FIXME: Does this apply with clamp? It's implemented with max.
12656 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12657 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12658 return false;
12659 }
12660
12661 return true;
12662 }
12663 case ISD::SELECT: {
12664 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12665 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12666 }
12667 case ISD::BUILD_VECTOR: {
12668 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12669 SDValue SrcOp = Op.getOperand(i);
12670 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12671 return false;
12672 }
12673
12674 return true;
12675 }
12676 case ISD::EXTRACT_VECTOR_ELT:
12677 case ISD::EXTRACT_SUBVECTOR: {
12678 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12679 }
12680 case ISD::INSERT_VECTOR_ELT: {
12681 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12682 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12683 }
12684 case ISD::UNDEF:
12685 // Could be anything.
12686 return false;
12687
12688 case ISD::BITCAST:
12689 // TODO: This is incorrect as it loses track of the operand's type. We may
12690 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12691 // same bits that are canonicalized in one type need not be in the other.
12692 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12693 case ISD::TRUNCATE: {
12694 // Hack around the mess we make when legalizing extract_vector_elt
12695 if (Op.getValueType() == MVT::i16) {
12696 SDValue TruncSrc = Op.getOperand(0);
12697 if (TruncSrc.getValueType() == MVT::i32 &&
12698 TruncSrc.getOpcode() == ISD::BITCAST &&
12699 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12700 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12701 }
12702 }
12703 return false;
12704 }
12705 case ISD::INTRINSIC_WO_CHAIN: {
12706 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12707 // TODO: Handle more intrinsics
12708 switch (IntrinsicID) {
12709 case Intrinsic::amdgcn_cvt_pkrtz:
12710 case Intrinsic::amdgcn_cubeid:
12711 case Intrinsic::amdgcn_frexp_mant:
12712 case Intrinsic::amdgcn_fdot2:
12713 case Intrinsic::amdgcn_rcp:
12714 case Intrinsic::amdgcn_rsq:
12715 case Intrinsic::amdgcn_rsq_clamp:
12716 case Intrinsic::amdgcn_rcp_legacy:
12717 case Intrinsic::amdgcn_rsq_legacy:
12718 case Intrinsic::amdgcn_trig_preop:
12719 case Intrinsic::amdgcn_log:
12720 case Intrinsic::amdgcn_exp2:
12721 case Intrinsic::amdgcn_sqrt:
12722 return true;
12723 default:
12724 break;
12725 }
12726
12727 break;
12728 }
12729 default:
12730 break;
12731 }
12732
12733 // FIXME: denormalsEnabledForType is broken for dynamic
12734 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12735 DAG.isKnownNeverSNaN(Op);
12736}
12737
12738 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12739 unsigned MaxDepth) const {
12740 const MachineRegisterInfo &MRI = MF.getRegInfo();
12741 MachineInstr *MI = MRI.getVRegDef(Reg);
12742 unsigned Opcode = MI->getOpcode();
12743
12744 if (Opcode == AMDGPU::G_FCANONICALIZE)
12745 return true;
12746
12747 std::optional<FPValueAndVReg> FCR;
12748 // Constant splat (can be padded with undef) or scalar constant.
12749 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12750 if (FCR->Value.isSignaling())
12751 return false;
12752 if (!FCR->Value.isDenormal())
12753 return true;
12754
12755 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12756 return Mode == DenormalMode::getIEEE();
12757 }
12758
12759 if (MaxDepth == 0)
12760 return false;
12761
12762 switch (Opcode) {
12763 case AMDGPU::G_FADD:
12764 case AMDGPU::G_FSUB:
12765 case AMDGPU::G_FMUL:
12766 case AMDGPU::G_FCEIL:
12767 case AMDGPU::G_FFLOOR:
12768 case AMDGPU::G_FRINT:
12769 case AMDGPU::G_FNEARBYINT:
12770 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12771 case AMDGPU::G_INTRINSIC_TRUNC:
12772 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12773 case AMDGPU::G_FMA:
12774 case AMDGPU::G_FMAD:
12775 case AMDGPU::G_FSQRT:
12776 case AMDGPU::G_FDIV:
12777 case AMDGPU::G_FREM:
12778 case AMDGPU::G_FPOW:
12779 case AMDGPU::G_FPEXT:
12780 case AMDGPU::G_FLOG:
12781 case AMDGPU::G_FLOG2:
12782 case AMDGPU::G_FLOG10:
12783 case AMDGPU::G_FPTRUNC:
12784 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12785 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12786 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12787 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12788 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12789 return true;
12790 case AMDGPU::G_FNEG:
12791 case AMDGPU::G_FABS:
12792 case AMDGPU::G_FCOPYSIGN:
12793 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12794 case AMDGPU::G_FMINNUM:
12795 case AMDGPU::G_FMAXNUM:
12796 case AMDGPU::G_FMINNUM_IEEE:
12797 case AMDGPU::G_FMAXNUM_IEEE:
12798 case AMDGPU::G_FMINIMUM:
12799 case AMDGPU::G_FMAXIMUM: {
12800 if (Subtarget->supportsMinMaxDenormModes() ||
12801 // FIXME: denormalsEnabledForType is broken for dynamic
12802 denormalsEnabledForType(MRI.getType(Reg), MF))
12803 return true;
12804
12805 [[fallthrough]];
12806 }
12807 case AMDGPU::G_BUILD_VECTOR:
12808 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12809 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12810 return false;
12811 return true;
12812 case AMDGPU::G_INTRINSIC:
12813 case AMDGPU::G_INTRINSIC_CONVERGENT:
12814 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12815 case Intrinsic::amdgcn_fmul_legacy:
12816 case Intrinsic::amdgcn_fmad_ftz:
12817 case Intrinsic::amdgcn_sqrt:
12818 case Intrinsic::amdgcn_fmed3:
12819 case Intrinsic::amdgcn_sin:
12820 case Intrinsic::amdgcn_cos:
12821 case Intrinsic::amdgcn_log:
12822 case Intrinsic::amdgcn_exp2:
12823 case Intrinsic::amdgcn_log_clamp:
12824 case Intrinsic::amdgcn_rcp:
12825 case Intrinsic::amdgcn_rcp_legacy:
12826 case Intrinsic::amdgcn_rsq:
12827 case Intrinsic::amdgcn_rsq_clamp:
12828 case Intrinsic::amdgcn_rsq_legacy:
12829 case Intrinsic::amdgcn_div_scale:
12830 case Intrinsic::amdgcn_div_fmas:
12831 case Intrinsic::amdgcn_div_fixup:
12832 case Intrinsic::amdgcn_fract:
12833 case Intrinsic::amdgcn_cvt_pkrtz:
12834 case Intrinsic::amdgcn_cubeid:
12835 case Intrinsic::amdgcn_cubema:
12836 case Intrinsic::amdgcn_cubesc:
12837 case Intrinsic::amdgcn_cubetc:
12838 case Intrinsic::amdgcn_frexp_mant:
12839 case Intrinsic::amdgcn_fdot2:
12840 case Intrinsic::amdgcn_trig_preop:
12841 return true;
12842 default:
12843 break;
12844 }
12845
12846 [[fallthrough]];
12847 default:
12848 return false;
12849 }
12850
12851 llvm_unreachable("invalid operation");
12852}
12853
12854// Constant fold canonicalize.
12855SDValue SITargetLowering::getCanonicalConstantFP(
12856 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12857 // Flush denormals to 0 if not enabled.
12858 if (C.isDenormal()) {
12859 DenormalMode Mode =
12860 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12861 if (Mode == DenormalMode::getPreserveSign()) {
12862 return DAG.getConstantFP(
12863 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12864 }
12865
12866 if (Mode != DenormalMode::getIEEE())
12867 return SDValue();
12868 }
12869
12870 if (C.isNaN()) {
12871 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12872 if (C.isSignaling()) {
12873 // Quiet a signaling NaN.
12874 // FIXME: Is this supposed to preserve payload bits?
12875 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12876 }
12877
12878 // Make sure it is the canonical NaN bitpattern.
12879 //
12880 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12881 // immediate?
12882 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12883 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12884 }
12885
12886 // Already canonical.
12887 return DAG.getConstantFP(C, SL, VT);
12888}
12889
12890 static bool vectorEltWillFoldAway(SDValue Op) {
12891 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12892}
12893
12894SDValue SITargetLowering::performFCanonicalizeCombine(
12895 SDNode *N,
12896 DAGCombinerInfo &DCI) const {
12897 SelectionDAG &DAG = DCI.DAG;
12898 SDValue N0 = N->getOperand(0);
12899 EVT VT = N->getValueType(0);
12900
12901 // fcanonicalize undef -> qnan
12902 if (N0.isUndef()) {
12903 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12904 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12905 }
12906
12907 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12908 EVT VT = N->getValueType(0);
12909 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12910 }
12911
12912 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12913 // (fcanonicalize k)
12914 //
12915 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12916
12917 // TODO: This could be better with wider vectors that will be split to v2f16,
12918 // and to consider uses since there aren't that many packed operations.
12919 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12920 isTypeLegal(MVT::v2f16)) {
12921 SDLoc SL(N);
12922 SDValue NewElts[2];
12923 SDValue Lo = N0.getOperand(0);
12924 SDValue Hi = N0.getOperand(1);
12925 EVT EltVT = Lo.getValueType();
12926
12928 for (unsigned I = 0; I != 2; ++I) {
12929 SDValue Op = N0.getOperand(I);
12930 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12931 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12932 CFP->getValueAPF());
12933 } else if (Op.isUndef()) {
12934 // Handled below based on what the other operand is.
12935 NewElts[I] = Op;
12936 } else {
12937 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12938 }
12939 }
12940
12941 // If one half is undef, and one is constant, prefer a splat vector rather
12942 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12943 // cheaper to use and may be free with a packed operation.
12944 if (NewElts[0].isUndef()) {
12945 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
12946 ? NewElts[1]
12947 : DAG.getConstantFP(0.0f, SL, EltVT);
12948 }
12949
12950 if (NewElts[1].isUndef()) {
12951 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
12952 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
12953 }
12954
12955 return DAG.getBuildVector(VT, SL, NewElts);
12956 }
12957 }
12958
12959 return SDValue();
12960}
12961
12962static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
12963 switch (Opc) {
12964 case ISD::FMAXNUM:
12965 case ISD::FMAXNUM_IEEE:
12966 return AMDGPUISD::FMAX3;
12967 case ISD::FMAXIMUM:
12968 return AMDGPUISD::FMAXIMUM3;
12969 case ISD::SMAX:
12970 return AMDGPUISD::SMAX3;
12971 case ISD::UMAX:
12972 return AMDGPUISD::UMAX3;
12973 case ISD::FMINNUM:
12974 case ISD::FMINNUM_IEEE:
12975 return AMDGPUISD::FMIN3;
12976 case ISD::FMINIMUM:
12977 return AMDGPUISD::FMINIMUM3;
12978 case ISD::SMIN:
12979 return AMDGPUISD::SMIN3;
12980 case ISD::UMIN:
12981 return AMDGPUISD::UMIN3;
12982 default:
12983 llvm_unreachable("Not a min/max opcode");
12984 }
12985}
12986
12987SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
12988 const SDLoc &SL, SDValue Src,
12989 SDValue MinVal,
12990 SDValue MaxVal,
12991 bool Signed) const {
12992
12993 // med3 comes from
12994 // min(max(x, K0), K1), K0 < K1
12995 // max(min(x, K0), K1), K1 < K0
12996 //
12997 // "MinVal" and "MaxVal" respectively refer to the rhs of the
12998 // min/max op.
12999 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13000 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13001
13002 if (!MinK || !MaxK)
13003 return SDValue();
13004
13005 if (Signed) {
13006 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13007 return SDValue();
13008 } else {
13009 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13010 return SDValue();
13011 }
13012
13013 EVT VT = MinK->getValueType(0);
13014 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13015 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13016 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13017
13018 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13019 // not available, but this is unlikely to be profitable as constants
13020 // will often need to be materialized & extended, especially on
13021 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13022 return SDValue();
13023}
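// For example, with illustrative constants: smin(smax(x, 2), 7) reaches this
// helper with MaxVal = 2 and MinVal = 7; since 2 < 7 the clamp is equivalent
// to a median and is emitted as smed3(x, 2, 7) for i32 (or for i16 on targets
// with v_med3_i16).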
13024
13025 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13026 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13027 return C;
13028
13029 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13030 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13031 return C;
13032 }
13033
13034 return nullptr;
13035}
13036
13037SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13038 const SDLoc &SL,
13039 SDValue Op0,
13040 SDValue Op1) const {
13041 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13042 if (!K1)
13043 return SDValue();
13044
13045 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13046 if (!K0)
13047 return SDValue();
13048
13049 // Ordered >= (although NaN inputs should have folded away by now).
13050 if (K0->getValueAPF() > K1->getValueAPF())
13051 return SDValue();
13052
13053 const MachineFunction &MF = DAG.getMachineFunction();
13054 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13055
13056 // TODO: Check IEEE bit enabled?
13057 EVT VT = Op0.getValueType();
13058 if (Info->getMode().DX10Clamp) {
13059 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13060 // hardware fmed3 behavior converting to a min.
13061 // FIXME: Should this be allowing -0.0?
13062 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13063 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13064 }
13065
13066 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13067 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13068 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13069 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13070 // then give the other result, which is different from med3 with a NaN
13071 // input.
13072 SDValue Var = Op0.getOperand(0);
13073 if (!DAG.isKnownNeverSNaN(Var))
13074 return SDValue();
13075
13076 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13077
13078 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13079 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13080 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13081 Var, SDValue(K0, 0), SDValue(K1, 0));
13082 }
13083 }
13084
13085 return SDValue();
13086}
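// For example, with illustrative constants: fminnum(fmaxnum(x, 0.0), 1.0)
// with dx10_clamp enabled folds straight to the CLAMP node above, while
// fminnum(fmaxnum(x, 2.0), 4.0) becomes fmed3(x, 2.0, 4.0) for f32, provided
// x is known not to be a signaling NaN.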
13087
13088SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13089 DAGCombinerInfo &DCI) const {
13090 SelectionDAG &DAG = DCI.DAG;
13091
13092 EVT VT = N->getValueType(0);
13093 unsigned Opc = N->getOpcode();
13094 SDValue Op0 = N->getOperand(0);
13095 SDValue Op1 = N->getOperand(1);
13096
13097 // Only do this if the inner op has one use since this will just increase
13098 // register pressure for no benefit.
13099
13100 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
13101 !VT.isVector() &&
13102 (VT == MVT::i32 || VT == MVT::f32 ||
13103 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
13104 // max(max(a, b), c) -> max3(a, b, c)
13105 // min(min(a, b), c) -> min3(a, b, c)
13106 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13107 SDLoc DL(N);
13108 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13109 DL,
13110 N->getValueType(0),
13111 Op0.getOperand(0),
13112 Op0.getOperand(1),
13113 Op1);
13114 }
13115
13116 // Try commuted.
13117 // max(a, max(b, c)) -> max3(a, b, c)
13118 // min(a, min(b, c)) -> min3(a, b, c)
13119 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13120 SDLoc DL(N);
13121 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13122 DL,
13123 N->getValueType(0),
13124 Op0,
13125 Op1.getOperand(0),
13126 Op1.getOperand(1));
13127 }
13128 }
13129
13130 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13131 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13132 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13133 if (SDValue Med3 = performIntMed3ImmCombine(
13134 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13135 return Med3;
13136 }
13137 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13138 if (SDValue Med3 = performIntMed3ImmCombine(
13139 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13140 return Med3;
13141 }
13142
13143 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13144 if (SDValue Med3 = performIntMed3ImmCombine(
13145 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13146 return Med3;
13147 }
13148 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13149 if (SDValue Med3 = performIntMed3ImmCombine(
13150 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13151 return Med3;
13152 }
13153
13154 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13155 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13156 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13157 (Opc == AMDGPUISD::FMIN_LEGACY &&
13158 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13159 (VT == MVT::f32 || VT == MVT::f64 ||
13160 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13161 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13162 Op0.hasOneUse()) {
13163 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13164 return Res;
13165 }
13166
13167 return SDValue();
13168}
13169
13170 static bool isClampZeroToOne(SDValue A, SDValue B) {
13171 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13172 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13173 // FIXME: Should this be allowing -0.0?
13174 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13175 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13176 }
13177 }
13178
13179 return false;
13180}
13181
13182// FIXME: Should only worry about snans for version with chain.
13183SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13184 DAGCombinerInfo &DCI) const {
13185 EVT VT = N->getValueType(0);
13186 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13187 // NaNs. With a NaN input, the order of the operands may change the result.
13188
13189 SelectionDAG &DAG = DCI.DAG;
13190 SDLoc SL(N);
13191
13192 SDValue Src0 = N->getOperand(0);
13193 SDValue Src1 = N->getOperand(1);
13194 SDValue Src2 = N->getOperand(2);
13195
13196 if (isClampZeroToOne(Src0, Src1)) {
13197 // const_a, const_b, x -> clamp is safe in all cases including signaling
13198 // nans.
13199 // FIXME: Should this be allowing -0.0?
13200 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13201 }
13202
13203 const MachineFunction &MF = DAG.getMachineFunction();
13204 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13205
13206 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13207 // handling no dx10-clamp?
13208 if (Info->getMode().DX10Clamp) {
13209 // If NaNs are clamped to 0, we are free to reorder the inputs.
13210
13211 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13212 std::swap(Src0, Src1);
13213
13214 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13215 std::swap(Src1, Src2);
13216
13217 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13218 std::swap(Src0, Src1);
13219
13220 if (isClampZeroToOne(Src1, Src2))
13221 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13222 }
13223
13224 return SDValue();
13225}
13226
13227SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13228 DAGCombinerInfo &DCI) const {
13229 SDValue Src0 = N->getOperand(0);
13230 SDValue Src1 = N->getOperand(1);
13231 if (Src0.isUndef() && Src1.isUndef())
13232 return DCI.DAG.getUNDEF(N->getValueType(0));
13233 return SDValue();
13234}
13235
13236// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13237// expanded into a set of cmp/select instructions.
13238 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13239 unsigned NumElem,
13240 bool IsDivergentIdx,
13241 const GCNSubtarget *Subtarget) {
13242 if (UseDivergentRegisterIndexing)
13243 return false;
13244
13245 unsigned VecSize = EltSize * NumElem;
13246
13247 // Sub-dword vectors of 2 dwords or less have a better implementation.
13248 if (VecSize <= 64 && EltSize < 32)
13249 return false;
13250
13251 // Always expand the rest of sub-dword instructions, otherwise it will be
13252 // lowered via memory.
13253 if (EltSize < 32)
13254 return true;
13255
13256 // Always do this if var-idx is divergent, otherwise it will become a loop.
13257 if (IsDivergentIdx)
13258 return true;
13259
13260 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13261 unsigned NumInsts = NumElem /* Number of compares */ +
13262 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13263
13264 // On some architectures (GFX9) movrel is not available and it's better
13265 // to expand.
13266 if (!Subtarget->hasMovrel())
13267 return NumInsts <= 16;
13268
13269 // If movrel is available, use it instead of expanding for vector of 8
13270 // elements.
13271 return NumInsts <= 15;
13272}
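// For example, for a v8f32 vector with a uniform variable index:
// NumInsts = 8 compares + 8 v_cndmask_b32 = 16, so targets with movrel keep
// the indirect-addressing path (16 > 15), while targets without it expand
// (16 <= 16). A divergent index always expands to avoid the indexing loop.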
13273
13274 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13275 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13276 if (isa<ConstantSDNode>(Idx))
13277 return false;
13278
13279 SDValue Vec = N->getOperand(0);
13280 EVT VecVT = Vec.getValueType();
13281 EVT EltVT = VecVT.getVectorElementType();
13282 unsigned EltSize = EltVT.getSizeInBits();
13283 unsigned NumElem = VecVT.getVectorNumElements();
13284
13285 return SITargetLowering::shouldExpandVectorDynExt(
13286 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13287}
13288
13289SDValue SITargetLowering::performExtractVectorEltCombine(
13290 SDNode *N, DAGCombinerInfo &DCI) const {
13291 SDValue Vec = N->getOperand(0);
13292 SelectionDAG &DAG = DCI.DAG;
13293
13294 EVT VecVT = Vec.getValueType();
13295 EVT VecEltVT = VecVT.getVectorElementType();
13296 EVT ResVT = N->getValueType(0);
13297
13298 unsigned VecSize = VecVT.getSizeInBits();
13299 unsigned VecEltSize = VecEltVT.getSizeInBits();
13300
13301 if ((Vec.getOpcode() == ISD::FNEG ||
13302 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13303 SDLoc SL(N);
13304 SDValue Idx = N->getOperand(1);
13305 SDValue Elt =
13306 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13307 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13308 }
13309
13310 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13311 // =>
13312 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13313 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13314 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13315 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13316 SDLoc SL(N);
13317 SDValue Idx = N->getOperand(1);
13318 unsigned Opc = Vec.getOpcode();
13319
13320 switch(Opc) {
13321 default:
13322 break;
13323 // TODO: Support other binary operations.
13324 case ISD::FADD:
13325 case ISD::FSUB:
13326 case ISD::FMUL:
13327 case ISD::ADD:
13328 case ISD::UMIN:
13329 case ISD::UMAX:
13330 case ISD::SMIN:
13331 case ISD::SMAX:
13332 case ISD::FMAXNUM:
13333 case ISD::FMINNUM:
13334 case ISD::FMAXNUM_IEEE:
13335 case ISD::FMINNUM_IEEE:
13336 case ISD::FMAXIMUM:
13337 case ISD::FMINIMUM: {
13338 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13339 Vec.getOperand(0), Idx);
13340 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13341 Vec.getOperand(1), Idx);
13342
13343 DCI.AddToWorklist(Elt0.getNode());
13344 DCI.AddToWorklist(Elt1.getNode());
13345 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13346 }
13347 }
13348 }
13349
13350 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13351 if (shouldExpandVectorDynExt(N)) {
13352 SDLoc SL(N);
13353 SDValue Idx = N->getOperand(1);
13354 SDValue V;
13355 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13356 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13357 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13358 if (I == 0)
13359 V = Elt;
13360 else
13361 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13362 }
13363 return V;
13364 }
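// For example, extracting a divergent index i from a v4i32 value expands to a
// chain of three selects:
// select(i==3, e3, select(i==2, e2, select(i==1, e1, e0))).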
13365
13366 if (!DCI.isBeforeLegalize())
13367 return SDValue();
13368
13369 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13370 // elements. This exposes more load reduction opportunities by replacing
13371 // multiple small extract_vector_elements with a single 32-bit extract.
13372 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13373 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13374 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13375 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13376
13377 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13378 unsigned EltIdx = BitIndex / 32;
13379 unsigned LeftoverBitIdx = BitIndex % 32;
13380 SDLoc SL(N);
13381
13382 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13383 DCI.AddToWorklist(Cast.getNode());
13384
13385 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13386 DAG.getConstant(EltIdx, SL, MVT::i32));
13387 DCI.AddToWorklist(Elt.getNode());
13388 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13389 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13390 DCI.AddToWorklist(Srl.getNode());
13391
13392 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13393 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13394 DCI.AddToWorklist(Trunc.getNode());
13395
13396 if (VecEltVT == ResVT) {
13397 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13398 }
13399
13400 assert(ResVT.isScalarInteger());
13401 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13402 }
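// Illustrative sketch: extracting element 5 of a loaded <8 x i8> gives
// BitIndex = 40, EltIdx = 1 and LeftoverBitIdx = 8, so the block above emits
//   trunc (srl (extract_vector_elt (bitcast Vec to v2i32), 1), 8)
// i.e. one whole 32-bit extract plus a shift instead of a sub-dword extract.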
13403
13404 return SDValue();
13405}
13406
13407SDValue
13408SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13409 DAGCombinerInfo &DCI) const {
13410 SDValue Vec = N->getOperand(0);
13411 SDValue Idx = N->getOperand(2);
13412 EVT VecVT = Vec.getValueType();
13413 EVT EltVT = VecVT.getVectorElementType();
13414
13415 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13416 // => BUILD_VECTOR n x select (e, const-idx)
13417 if (isa<ConstantSDNode>(Idx) || !shouldExpandVectorDynExt(N))
13418 return SDValue();
13419
13420 SelectionDAG &DAG = DCI.DAG;
13421 SDLoc SL(N);
13422 SDValue Ins = N->getOperand(1);
13423 EVT IdxVT = Idx.getValueType();
13424
13425 SmallVector<SDValue, 16> Ops;
13426 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13427 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13428 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13429 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13430 Ops.push_back(V);
13431 }
13432
13433 return DAG.getBuildVector(VecVT, SL, Ops);
13434}
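// Illustrative sketch: insert_vector_elt <4 x i32>:v, x, i (variable index i)
// becomes
//   build_vector (i==0 ? x : v[0]), (i==1 ? x : v[1]),
//                (i==2 ? x : v[2]), (i==3 ? x : v[3])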
13435
13436/// Return the source of an fp_extend from f16 to f32, or a converted FP
13437/// constant.
13438static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13439 if (Src.getOpcode() == ISD::FP_EXTEND &&
13440 Src.getOperand(0).getValueType() == MVT::f16) {
13441 return Src.getOperand(0);
13442 }
13443
13444 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13445 APFloat Val = CFP->getValueAPF();
13446 bool LosesInfo = true;
13447 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13448 if (!LosesInfo)
13449 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13450 }
13451
13452 return SDValue();
13453}
13454
13455SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13456 DAGCombinerInfo &DCI) const {
13457 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13458 "combine only useful on gfx8");
13459
13460 SDValue TruncSrc = N->getOperand(0);
13461 EVT VT = N->getValueType(0);
13462 if (VT != MVT::f16)
13463 return SDValue();
13464
13465 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13466 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13467 return SDValue();
13468
13469 SelectionDAG &DAG = DCI.DAG;
13470 SDLoc SL(N);
13471
13472 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13473 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13474 // casting back.
13475
13476 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13477 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13478 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13479 if (!A)
13480 return SDValue();
13481
13482 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13483 if (!B)
13484 return SDValue();
13485
13486 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13487 if (!C)
13488 return SDValue();
13489
13490 // This changes signaling nan behavior. If an input is a signaling nan, it
13491 // would have been quieted by the fpext originally. We don't care because
13492 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13493 // we would be worse off than just doing the promotion.
13494 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13495 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13496 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13497 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13498}
13499
13500unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13501 const SDNode *N0,
13502 const SDNode *N1) const {
13503 EVT VT = N0->getValueType(0);
13504
13505 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13506 // support denormals ever.
13507 if (((VT == MVT::f32 &&
13508 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13509 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13510 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13511 isOperationLegal(ISD::FMAD, VT))
13512 return ISD::FMAD;
13513
13514 const TargetOptions &Options = DAG.getTarget().Options;
13515 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13516 (N0->getFlags().hasAllowContract() &&
13517 N1->getFlags().hasAllowContract())) &&
13518 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13519 return ISD::FMA;
13520 }
13521
13522 return 0;
13523}
13524
13525// For a reassociatable opcode perform:
13526// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13527SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13528 SelectionDAG &DAG) const {
13529 EVT VT = N->getValueType(0);
13530 if (VT != MVT::i32 && VT != MVT::i64)
13531 return SDValue();
13532
13533 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13534 return SDValue();
13535
13536 unsigned Opc = N->getOpcode();
13537 SDValue Op0 = N->getOperand(0);
13538 SDValue Op1 = N->getOperand(1);
13539
13540 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13541 return SDValue();
13542
13543 if (Op0->isDivergent())
13544 std::swap(Op0, Op1);
13545
13546 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13547 return SDValue();
13548
13549 SDValue Op2 = Op1.getOperand(1);
13550 Op1 = Op1.getOperand(0);
13551 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13552 return SDValue();
13553
13554 if (Op1->isDivergent())
13555 std::swap(Op1, Op2);
13556
13557 SDLoc SL(N);
13558 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13559 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13560}
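// Illustrative sketch: with u0 and u1 uniform and d divergent,
//   add u0, (add u1, d)  -->  add (add u0, u1), d
// so the inner add of the two uniform values can stay scalar and only the
// outer add needs a VALU instruction.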
13561
13562static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13563 EVT VT,
13564 SDValue N0, SDValue N1, SDValue N2,
13565 bool Signed) {
13566 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13567 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13568 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13569 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13570}
13571
13572// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13573// multiplies, if any.
13574//
13575// Full 64-bit multiplies that feed into an addition are lowered here instead
13576// of using the generic expansion. The generic expansion ends up with
13577// a tree of ADD nodes that prevents us from using the "add" part of the
13578// MAD instruction. The expansion produced here results in a chain of ADDs
13579// instead of a tree.
13580SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13581 DAGCombinerInfo &DCI) const {
13582 assert(N->getOpcode() == ISD::ADD);
13583
13584 SelectionDAG &DAG = DCI.DAG;
13585 EVT VT = N->getValueType(0);
13586 SDLoc SL(N);
13587 SDValue LHS = N->getOperand(0);
13588 SDValue RHS = N->getOperand(1);
13589
13590 if (VT.isVector())
13591 return SDValue();
13592
13593 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13594 // result in scalar registers for uniform values.
13595 if (!N->isDivergent() && Subtarget->hasSMulHi())
13596 return SDValue();
13597
13598 unsigned NumBits = VT.getScalarSizeInBits();
13599 if (NumBits <= 32 || NumBits > 64)
13600 return SDValue();
13601
13602 if (LHS.getOpcode() != ISD::MUL) {
13603 assert(RHS.getOpcode() == ISD::MUL);
13604 std::swap(LHS, RHS);
13605 }
13606
13607 // Avoid the fold if it would unduly increase the number of multiplies due to
13608 // multiple uses, except on hardware with full-rate multiply-add (which is
13609 // part of full-rate 64-bit ops).
13610 if (!Subtarget->hasFullRate64Ops()) {
13611 unsigned NumUsers = 0;
13612 for (SDNode *Use : LHS->uses()) {
13613 // There is a use that does not feed into addition, so the multiply can't
13614 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13615 if (Use->getOpcode() != ISD::ADD)
13616 return SDValue();
13617
13618 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13619 // MUL + 3xADD + 3xADDC over 3xMAD.
13620 ++NumUsers;
13621 if (NumUsers >= 3)
13622 return SDValue();
13623 }
13624 }
13625
13626 SDValue MulLHS = LHS.getOperand(0);
13627 SDValue MulRHS = LHS.getOperand(1);
13628 SDValue AddRHS = RHS;
13629
13630 // Always check whether operands are small unsigned values, since that
13631 // knowledge is useful in more cases. Check for small signed values only if
13632 // doing so can unlock a shorter code sequence.
13633 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13634 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13635
13636 bool MulSignedLo = false;
13637 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13638 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13639 numBitsSigned(MulRHS, DAG) <= 32;
13640 }
13641
13642 // The operands and final result all have the same number of bits. If
13643 // operands need to be extended, they can be extended with garbage. The
13644 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13645 // truncated away in the end.
13646 if (VT != MVT::i64) {
13647 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13648 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13649 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13650 }
13651
13652 // The basic code generated is conceptually straightforward. Pseudo code:
13653 //
13654 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13655 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13656 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13657 //
13658 // The second and third lines are optional, depending on whether the factors
13659 // are {sign,zero}-extended or not.
13660 //
13661 // The actual DAG is noisier than the pseudo code, but only due to
13662 // instructions that disassemble values into low and high parts, and
13663 // assemble the final result.
13664 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13665
13666 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13667 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13668 SDValue Accum =
13669 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13670
13671 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13672 SDValue AccumLo, AccumHi;
13673 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13674
13675 if (!MulLHSUnsigned32) {
13676 auto MulLHSHi =
13677 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13678 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13679 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13680 }
13681
13682 if (!MulRHSUnsigned32) {
13683 auto MulRHSHi =
13684 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13685 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13686 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13687 }
13688
13689 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13690 Accum = DAG.getBitcast(MVT::i64, Accum);
13691 }
13692
13693 if (VT != MVT::i64)
13694 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13695 return Accum;
13696}
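// Illustrative sketch: for i64 (add (mul a, b), c) where a and b are known to
// fit in 32 unsigned bits, the code above emits a single
//   mad_u64_u32 lo(a), lo(b), c
// and both optional high-part multiply/add fixups are skipped.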
13697
13698// Collect the ultimate src of each of the mul node's operands, and confirm
13699// each operand only occupies a single byte.
13700static std::optional<ByteProvider<SDValue>>
13701handleMulOperand(const SDValue &MulOperand) {
13702 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13703 if (!Byte0 || Byte0->isConstantZero()) {
13704 return std::nullopt;
13705 }
13706 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13707 if (Byte1 && !Byte1->isConstantZero()) {
13708 return std::nullopt;
13709 }
13710 return Byte0;
13711}
13712
13713static unsigned addPermMasks(unsigned First, unsigned Second) {
13714 unsigned FirstCs = First & 0x0c0c0c0c;
13715 unsigned SecondCs = Second & 0x0c0c0c0c;
13716 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13717 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13718
13719 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13720 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13721 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13722 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13723
13724 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13725}
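// Illustrative example: addPermMasks(0x0c0c0c00, 0x0c0c010c) == 0x0c0c0100.
// Byte selectors from either mask are kept, and a byte remains 0x0c (the
// v_perm "produce zero" selector) only where both masks request zero.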
13726
13727 struct DotSrc {
13728 SDValue SrcOp;
13729 int64_t PermMask;
13730 int64_t DWordOffset;
13731};
13732
13733static void placeSources(ByteProvider<SDValue> &Src0,
13734 ByteProvider<SDValue> &Src1,
13735 SmallVectorImpl<DotSrc> &Src0s,
13736 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13737
13738 assert(Src0.Src.has_value() && Src1.Src.has_value());
13739 // Src0s and Src1s are empty, just place arbitrarily.
13740 if (Step == 0) {
13741 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13742 Src0.SrcOffset / 4});
13743 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13744 Src1.SrcOffset / 4});
13745 return;
13746 }
13747
13748 for (int BPI = 0; BPI < 2; BPI++) {
13749 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13750 if (BPI == 1) {
13751 BPP = {Src1, Src0};
13752 }
13753 unsigned ZeroMask = 0x0c0c0c0c;
13754 unsigned FMask = 0xFF << (8 * (3 - Step));
13755
13756 unsigned FirstMask =
13757 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13758 unsigned SecondMask =
13759 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13760 // Attempt to find Src vector which contains our SDValue, if so, add our
13761 // perm mask to the existing one. If we are unable to find a match for the
13762 // first SDValue, attempt to find a match for the second.
13763 int FirstGroup = -1;
13764 for (int I = 0; I < 2; I++) {
13765 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13766 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13767 return IterElt.SrcOp == *BPP.first.Src &&
13768 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13769 };
13770
13771 auto Match = llvm::find_if(Srcs, MatchesFirst);
13772 if (Match != Srcs.end()) {
13773 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13774 FirstGroup = I;
13775 break;
13776 }
13777 }
13778 if (FirstGroup != -1) {
13779 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13780 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13781 return IterElt.SrcOp == *BPP.second.Src &&
13782 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13783 };
13784 auto Match = llvm::find_if(Srcs, MatchesSecond);
13785 if (Match != Srcs.end()) {
13786 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13787 } else
13788 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13789 return;
13790 }
13791 }
13792
13793 // If we have made it here, then we could not find a match in Src0s or Src1s
13794 // for either Src0 or Src1, so just place them arbitrarily.
13795
13796 unsigned ZeroMask = 0x0c0c0c0c;
13797 unsigned FMask = 0xFF << (8 * (3 - Step));
13798
13799 Src0s.push_back(
13800 {*Src0.Src,
13801 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13802 Src0.SrcOffset / 4});
13803 Src1s.push_back(
13804 {*Src1.Src,
13805 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13806 Src1.SrcOffset / 4});
13807
13808 return;
13809}
13810
13811static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13812 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13813 bool IsAny) {
13814
13815 // If we just have one source, just permute it accordingly.
13816 if (Srcs.size() == 1) {
13817 auto Elt = Srcs.begin();
13818 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13819
13820 // v_perm will produce the original value
13821 if (Elt->PermMask == 0x3020100)
13822 return EltOp;
13823
13824 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13825 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13826 }
13827
13828 auto FirstElt = Srcs.begin();
13829 auto SecondElt = std::next(FirstElt);
13830
13831 SmallVector<SDValue, 2> Perms;
13832
13833 // If we have multiple sources in the chain, combine them via perms (using
13834 // calculated perm mask) and Ors.
13835 while (true) {
13836 auto FirstMask = FirstElt->PermMask;
13837 auto SecondMask = SecondElt->PermMask;
13838
13839 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13840 unsigned FirstPlusFour = FirstMask | 0x04040404;
13841 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
13842 // original 0x0C.
13843 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13844
13845 auto PermMask = addPermMasks(FirstMask, SecondMask);
13846 auto FirstVal =
13847 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13848 auto SecondVal =
13849 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13850
13851 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13852 SecondVal,
13853 DAG.getConstant(PermMask, SL, MVT::i32)));
13854
13855 FirstElt = std::next(SecondElt);
13856 if (FirstElt == Srcs.end())
13857 break;
13858
13859 SecondElt = std::next(FirstElt);
13860 // If we only have a FirstElt, then just combine that into the cumulative
13861 // source node.
13862 if (SecondElt == Srcs.end()) {
13863 auto EltOp =
13864 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13865
13866 Perms.push_back(
13867 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13868 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13869 break;
13870 }
13871 }
13872
13873 assert(Perms.size() == 1 || Perms.size() == 2);
13874 return Perms.size() == 2
13875 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13876 : Perms[0];
13877}
13878
13879static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13880 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13881 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13882 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13883 EntryMask += ZeroMask;
13884 }
13885}
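// Illustrative example: with ChainLength == 2, a mask such as 0x00010c0c
// (selectors in the two high bytes) becomes 0x0c0c0001, i.e. the selectors
// move down to the low bytes and the two unused high bytes are zeroed via 0x0c.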
13886
13887static bool isMul(const SDValue Op) {
13888 auto Opcode = Op.getOpcode();
13889
13890 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13891 Opcode == AMDGPUISD::MUL_I24);
13892}
13893
13894static std::optional<bool>
13895checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13896 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13897 const SDValue &S1Op, const SelectionDAG &DAG) {
13898 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13899 // of the dot4 are irrelevant.
13900 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13901 return false;
13902
13903 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13904 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13905 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13906 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13907 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13908 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13909
13910 assert(!(S0IsUnsigned && S0IsSigned));
13911 assert(!(S1IsUnsigned && S1IsSigned));
13912
13913 // There are 9 possible permutations of
13914 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13915
13916 // In two permutations, the sign bits are known to be the same for both Ops,
13917 // so simply return Signed / Unsigned corresponding to the MSB
13918
13919 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13920 return S0IsSigned;
13921
13922 // In another two permutations, the sign bits are known to be opposite. In
13923 // this case return std::nullopt to indicate a bad match.
13924
13925 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13926 return std::nullopt;
13927
13928 // In the remaining five permutations, we don't know the value of the sign
13929 // bit for at least one Op. Since we have a valid ByteProvider, we know that
13930 // the upper bits must be extension bits. Thus, the only ways for the sign
13931 // bit to be unknown are if it was sign-extended from an unknown value, or if
13932 // it was any-extended. In either case, it is correct to use the signed
13933 // version of the dot4 signedness semantics.
13934
13935 // In two of such permutations, we know the sign bit is set for
13936 // one op, and the other is unknown. It is okay to use the signed version of
13937 // dot4.
13938 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
13939 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
13940 return true;
13941
13942 // In one such permutation, we don't know either of the sign bits. It is okay
13943 // to use the signed version of dot4.
13944 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
13945 return true;
13946
13947 // In two of such permutations, we know the sign bit is unset for
13948 // one op, and the other is unknown. Return std::nullopt to indicate a
13949 // bad match.
13950 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
13951 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
13952 return std::nullopt;
13953
13954 llvm_unreachable("Fully covered condition");
13955}
13956
13957SDValue SITargetLowering::performAddCombine(SDNode *N,
13958 DAGCombinerInfo &DCI) const {
13959 SelectionDAG &DAG = DCI.DAG;
13960 EVT VT = N->getValueType(0);
13961 SDLoc SL(N);
13962 SDValue LHS = N->getOperand(0);
13963 SDValue RHS = N->getOperand(1);
13964
13965 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
13966 if (Subtarget->hasMad64_32()) {
13967 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
13968 return Folded;
13969 }
13970 }
13971
13972 if (SDValue V = reassociateScalarOps(N, DAG)) {
13973 return V;
13974 }
13975
13976 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
13977 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
13978 SDValue TempNode(N, 0);
13979 std::optional<bool> IsSigned;
13980 SmallVector<DotSrc, 4> Src0s;
13981 SmallVector<DotSrc, 4> Src1s;
13982 SmallVector<SDValue, 4> Src2s;
13983
13984 // Match the v_dot4 tree, while collecting src nodes.
13985 int ChainLength = 0;
13986 for (int I = 0; I < 4; I++) {
13987 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
13988 if (MulIdx == -1)
13989 break;
13990 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
13991 if (!Src0)
13992 break;
13993 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
13994 if (!Src1)
13995 break;
13996
13997 auto IterIsSigned = checkDot4MulSignedness(
13998 TempNode->getOperand(MulIdx), *Src0, *Src1,
13999 TempNode->getOperand(MulIdx)->getOperand(0),
14000 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14001 if (!IterIsSigned)
14002 break;
14003 if (!IsSigned)
14004 IsSigned = *IterIsSigned;
14005 if (*IterIsSigned != *IsSigned)
14006 break;
14007 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14008 auto AddIdx = 1 - MulIdx;
14009 // Allow the special case where add (add (mul24, 0), mul24) has already
14010 // been folded into add (mul24, mul24).
14011 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14012 Src2s.push_back(TempNode->getOperand(AddIdx));
14013 auto Src0 =
14014 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14015 if (!Src0)
14016 break;
14017 auto Src1 =
14018 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14019 if (!Src1)
14020 break;
14021 auto IterIsSigned = checkDot4MulSignedness(
14022 TempNode->getOperand(AddIdx), *Src0, *Src1,
14023 TempNode->getOperand(AddIdx)->getOperand(0),
14024 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14025 if (!IterIsSigned)
14026 break;
14027 assert(IsSigned);
14028 if (*IterIsSigned != *IsSigned)
14029 break;
14030 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14031 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14032 ChainLength = I + 2;
14033 break;
14034 }
14035
14036 TempNode = TempNode->getOperand(AddIdx);
14037 Src2s.push_back(TempNode);
14038 ChainLength = I + 1;
14039 if (TempNode->getNumOperands() < 2)
14040 break;
14041 LHS = TempNode->getOperand(0);
14042 RHS = TempNode->getOperand(1);
14043 }
14044
14045 if (ChainLength < 2)
14046 return SDValue();
14047
14048 // Masks were constructed with the assumption that we would find a chain of
14049 // length 4. If not, then we need to zero out the unused high bytes (via a
14050 // perm selector of 0x0c) so they do not affect the dot computation.
14051 if (ChainLength < 4) {
14052 fixMasks(Src0s, ChainLength);
14053 fixMasks(Src1s, ChainLength);
14054 }
14055
14056 SDValue Src0, Src1;
14057
14058 // If we are just using a single source for both, and have permuted the
14059 // bytes consistently, we can just use the sources without permuting
14060 // (commutation).
14061 bool UseOriginalSrc = false;
14062 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14063 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14064 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14065 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14066 SmallVector<unsigned, 4> SrcBytes;
14067 auto Src0Mask = Src0s.begin()->PermMask;
14068 SrcBytes.push_back(Src0Mask & 0xFF000000);
14069 bool UniqueEntries = true;
14070 for (auto I = 1; I < 4; I++) {
14071 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14072
14073 if (is_contained(SrcBytes, NextByte)) {
14074 UniqueEntries = false;
14075 break;
14076 }
14077 SrcBytes.push_back(NextByte);
14078 }
14079
14080 if (UniqueEntries) {
14081 UseOriginalSrc = true;
14082
14083 auto FirstElt = Src0s.begin();
14084 auto FirstEltOp =
14085 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14086
14087 auto SecondElt = Src1s.begin();
14088 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14089 SecondElt->DWordOffset);
14090
14091 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14092 MVT::getIntegerVT(32));
14093 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14094 MVT::getIntegerVT(32));
14095 }
14096 }
14097
14098 if (!UseOriginalSrc) {
14099 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14100 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14101 }
14102
14103 assert(IsSigned);
14104 SDValue Src2 =
14105 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14106
14107 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14108 : Intrinsic::amdgcn_udot4,
14109 SL, MVT::i64);
14110
14111 assert(!VT.isVector());
14112 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14113 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14114
14115 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14116 }
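// Illustrative shape of the tree matched above for a full chain of four:
//   add (mul a0, b0),
//       (add (mul a1, b1), (add (mul a2, b2), (add (mul a3, b3), acc)))
// When the byte sources line up, this is rewritten into a single
// amdgcn_sdot4 / amdgcn_udot4 of the packed a bytes, packed b bytes and acc.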
14117
14118 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14119 return SDValue();
14120
14121 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14122 // add x, sext (setcc) => usubo_carry x, 0, setcc
14123 unsigned Opc = LHS.getOpcode();
14124 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14125 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14126 std::swap(RHS, LHS);
14127
14128 Opc = RHS.getOpcode();
14129 switch (Opc) {
14130 default: break;
14131 case ISD::ZERO_EXTEND:
14132 case ISD::SIGN_EXTEND:
14133 case ISD::ANY_EXTEND: {
14134 auto Cond = RHS.getOperand(0);
14135 // If this won't be a real VOPC output, we would still need to insert an
14136 // extra instruction anyway.
14137 if (!isBoolSGPR(Cond))
14138 break;
14139 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14140 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14141 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14142 return DAG.getNode(Opc, SL, VTList, Args);
14143 }
14144 case ISD::UADDO_CARRY: {
14145 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14146 if (!isNullConstant(RHS.getOperand(1)))
14147 break;
14148 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14149 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14150 }
14151 }
14152 return SDValue();
14153}
14154
14155SDValue SITargetLowering::performSubCombine(SDNode *N,
14156 DAGCombinerInfo &DCI) const {
14157 SelectionDAG &DAG = DCI.DAG;
14158 EVT VT = N->getValueType(0);
14159
14160 if (VT != MVT::i32)
14161 return SDValue();
14162
14163 SDLoc SL(N);
14164 SDValue LHS = N->getOperand(0);
14165 SDValue RHS = N->getOperand(1);
14166
14167 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14168 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14169 unsigned Opc = RHS.getOpcode();
14170 switch (Opc) {
14171 default: break;
14172 case ISD::ZERO_EXTEND:
14173 case ISD::SIGN_EXTEND:
14174 case ISD::ANY_EXTEND: {
14175 auto Cond = RHS.getOperand(0);
14176 // If this won't be a real VOPC output, we would still need to insert an
14177 // extra instruction anyway.
14178 if (!isBoolSGPR(Cond))
14179 break;
14180 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14181 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14182 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14183 return DAG.getNode(Opc, SL, VTList, Args);
14184 }
14185 }
14186
14187 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14188 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14189 if (!isNullConstant(LHS.getOperand(1)))
14190 return SDValue();
14191 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14192 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14193 }
14194 return SDValue();
14195}
14196
14197SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14198 DAGCombinerInfo &DCI) const {
14199
14200 if (N->getValueType(0) != MVT::i32)
14201 return SDValue();
14202
14203 if (!isNullConstant(N->getOperand(1)))
14204 return SDValue();
14205
14206 SelectionDAG &DAG = DCI.DAG;
14207 SDValue LHS = N->getOperand(0);
14208
14209 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14210 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14211 unsigned LHSOpc = LHS.getOpcode();
14212 unsigned Opc = N->getOpcode();
14213 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14214 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14215 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14216 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14217 }
14218 return SDValue();
14219}
14220
14221SDValue SITargetLowering::performFAddCombine(SDNode *N,
14222 DAGCombinerInfo &DCI) const {
14223 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14224 return SDValue();
14225
14226 SelectionDAG &DAG = DCI.DAG;
14227 EVT VT = N->getValueType(0);
14228
14229 SDLoc SL(N);
14230 SDValue LHS = N->getOperand(0);
14231 SDValue RHS = N->getOperand(1);
14232
14233 // These should really be instruction patterns, but writing patterns with
14234 // source modifiers is a pain.
14235
14236 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14237 if (LHS.getOpcode() == ISD::FADD) {
14238 SDValue A = LHS.getOperand(0);
14239 if (A == LHS.getOperand(1)) {
14240 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14241 if (FusedOp != 0) {
14242 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14243 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14244 }
14245 }
14246 }
14247
14248 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14249 if (RHS.getOpcode() == ISD::FADD) {
14250 SDValue A = RHS.getOperand(0);
14251 if (A == RHS.getOperand(1)) {
14252 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14253 if (FusedOp != 0) {
14254 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14255 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14256 }
14257 }
14258 }
14259
14260 return SDValue();
14261}
14262
14263SDValue SITargetLowering::performFSubCombine(SDNode *N,
14264 DAGCombinerInfo &DCI) const {
14265 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14266 return SDValue();
14267
14268 SelectionDAG &DAG = DCI.DAG;
14269 SDLoc SL(N);
14270 EVT VT = N->getValueType(0);
14271 assert(!VT.isVector());
14272
14273 // Try to get the fneg to fold into the source modifier. This undoes generic
14274 // DAG combines and folds them into the mad.
14275 //
14276 // Only do this if we are not trying to support denormals. v_mad_f32 does
14277 // not support denormals ever.
14278 SDValue LHS = N->getOperand(0);
14279 SDValue RHS = N->getOperand(1);
14280 if (LHS.getOpcode() == ISD::FADD) {
14281 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14282 SDValue A = LHS.getOperand(0);
14283 if (A == LHS.getOperand(1)) {
14284 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14285 if (FusedOp != 0){
14286 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14287 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14288
14289 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14290 }
14291 }
14292 }
14293
14294 if (RHS.getOpcode() == ISD::FADD) {
14295 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14296
14297 SDValue A = RHS.getOperand(0);
14298 if (A == RHS.getOperand(1)) {
14299 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14300 if (FusedOp != 0){
14301 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14302 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14303 }
14304 }
14305 }
14306
14307 return SDValue();
14308}
14309
14310SDValue SITargetLowering::performFDivCombine(SDNode *N,
14311 DAGCombinerInfo &DCI) const {
14312 SelectionDAG &DAG = DCI.DAG;
14313 SDLoc SL(N);
14314 EVT VT = N->getValueType(0);
14315 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14316 return SDValue();
14317
14318 SDValue LHS = N->getOperand(0);
14319 SDValue RHS = N->getOperand(1);
14320
14321 SDNodeFlags Flags = N->getFlags();
14322 SDNodeFlags RHSFlags = RHS->getFlags();
14323 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14324 !RHS->hasOneUse())
14325 return SDValue();
14326
14327 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14328 bool IsNegative = false;
14329 if (CLHS->isExactlyValue(1.0) ||
14330 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14331 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14332 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14333 if (RHS.getOpcode() == ISD::FSQRT) {
14334 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14335 SDValue Rsq =
14336 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14337 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14338 }
14339 }
14340 }
14341
14342 return SDValue();
14343}
14344
14345SDValue SITargetLowering::performFMACombine(SDNode *N,
14346 DAGCombinerInfo &DCI) const {
14347 SelectionDAG &DAG = DCI.DAG;
14348 EVT VT = N->getValueType(0);
14349 SDLoc SL(N);
14350
14351 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14352 return SDValue();
14353
14354 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14355 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14356 SDValue Op1 = N->getOperand(0);
14357 SDValue Op2 = N->getOperand(1);
14358 SDValue FMA = N->getOperand(2);
14359
14360 if (FMA.getOpcode() != ISD::FMA ||
14361 Op1.getOpcode() != ISD::FP_EXTEND ||
14362 Op2.getOpcode() != ISD::FP_EXTEND)
14363 return SDValue();
14364
14365 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14366 // regardless of the denorm mode setting. Therefore,
14367 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14368 const TargetOptions &Options = DAG.getTarget().Options;
14369 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14370 (N->getFlags().hasAllowContract() &&
14371 FMA->getFlags().hasAllowContract())) {
14372 Op1 = Op1.getOperand(0);
14373 Op2 = Op2.getOperand(0);
14374 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14375 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14376 return SDValue();
14377
14378 SDValue Vec1 = Op1.getOperand(0);
14379 SDValue Idx1 = Op1.getOperand(1);
14380 SDValue Vec2 = Op2.getOperand(0);
14381
14382 SDValue FMAOp1 = FMA.getOperand(0);
14383 SDValue FMAOp2 = FMA.getOperand(1);
14384 SDValue FMAAcc = FMA.getOperand(2);
14385
14386 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14387 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14388 return SDValue();
14389
14390 FMAOp1 = FMAOp1.getOperand(0);
14391 FMAOp2 = FMAOp2.getOperand(0);
14392 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14393 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14394 return SDValue();
14395
14396 SDValue Vec3 = FMAOp1.getOperand(0);
14397 SDValue Vec4 = FMAOp2.getOperand(0);
14398 SDValue Idx2 = FMAOp1.getOperand(1);
14399
14400 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14401 // Idx1 and Idx2 cannot be the same.
14402 Idx1 == Idx2)
14403 return SDValue();
14404
14405 if (Vec1 == Vec2 || Vec3 == Vec4)
14406 return SDValue();
14407
14408 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14409 return SDValue();
14410
14411 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14412 (Vec1 == Vec4 && Vec2 == Vec3)) {
14413 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14414 DAG.getTargetConstant(0, SL, MVT::i1));
14415 }
14416 }
14417 return SDValue();
14418}
14419
14420SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14421 DAGCombinerInfo &DCI) const {
14422 SelectionDAG &DAG = DCI.DAG;
14423 SDLoc SL(N);
14424
14425 SDValue LHS = N->getOperand(0);
14426 SDValue RHS = N->getOperand(1);
14427 EVT VT = LHS.getValueType();
14428 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14429
14430 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14431 if (!CRHS) {
14432 CRHS = dyn_cast<ConstantSDNode>(LHS);
14433 if (CRHS) {
14434 std::swap(LHS, RHS);
14435 CC = ISD::getSetCCSwappedOperands(CC);
14436 }
14437 }
14438
14439 if (CRHS) {
14440 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14441 isBoolSGPR(LHS.getOperand(0))) {
14442 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14443 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14444 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14445 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14446 if ((CRHS->isAllOnes() &&
14447 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14448 (CRHS->isZero() &&
14449 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14450 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14451 DAG.getConstant(-1, SL, MVT::i1));
14452 if ((CRHS->isAllOnes() &&
14453 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14454 (CRHS->isZero() &&
14455 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14456 return LHS.getOperand(0);
14457 }
14458
14459 const APInt &CRHSVal = CRHS->getAPIntValue();
14460 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14461 LHS.getOpcode() == ISD::SELECT &&
14462 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14463 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14464 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14465 isBoolSGPR(LHS.getOperand(0))) {
14466 // Given CT != FT:
14467 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14468 // setcc (select cc, CT, CF), CF, ne => cc
14469 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14470 // setcc (select cc, CT, CF), CT, eq => cc
14471 const APInt &CT = LHS.getConstantOperandAPInt(1);
14472 const APInt &CF = LHS.getConstantOperandAPInt(2);
14473
14474 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14475 (CT == CRHSVal && CC == ISD::SETNE))
14476 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14477 DAG.getConstant(-1, SL, MVT::i1));
14478 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14479 (CT == CRHSVal && CC == ISD::SETEQ))
14480 return LHS.getOperand(0);
14481 }
14482 }
14483
14484 if (VT != MVT::f32 && VT != MVT::f64 &&
14485 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14486 return SDValue();
14487
14488 // Match isinf/isfinite pattern
14489 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14490 // (fcmp one (fabs x), inf) -> (fp_class x,
14491 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14492 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14493 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14494 if (!CRHS)
14495 return SDValue();
14496
14497 const APFloat &APF = CRHS->getValueAPF();
14498 if (APF.isInfinity() && !APF.isNegative()) {
14499 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14500 SIInstrFlags::N_INFINITY;
14501 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14502 SIInstrFlags::P_ZERO |
14503 SIInstrFlags::N_NORMAL |
14504 SIInstrFlags::P_NORMAL |
14505 SIInstrFlags::N_SUBNORMAL |
14506 SIInstrFlags::P_SUBNORMAL;
14507 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14508 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14509 DAG.getConstant(Mask, SL, MVT::i32));
14510 }
14511 }
14512
14513 return SDValue();
14514}
14515
14516SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14517 DAGCombinerInfo &DCI) const {
14518 SelectionDAG &DAG = DCI.DAG;
14519 SDLoc SL(N);
14520 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14521
14522 SDValue Src = N->getOperand(0);
14523 SDValue Shift = N->getOperand(0);
14524
14525 // TODO: Extend type shouldn't matter (assuming legal types).
14526 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14527 Shift = Shift.getOperand(0);
14528
14529 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14530 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14531 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14532 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14533 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14534 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14535 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14536 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14537 SDLoc(Shift.getOperand(0)), MVT::i32);
14538
14539 unsigned ShiftOffset = 8 * Offset;
14540 if (Shift.getOpcode() == ISD::SHL)
14541 ShiftOffset -= C->getZExtValue();
14542 else
14543 ShiftOffset += C->getZExtValue();
14544
14545 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14546 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14547 MVT::f32, Shifted);
14548 }
14549 }
14550 }
14551
14552 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14553 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14554 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14555 // We simplified Src. If this node is not dead, visit it again so it is
14556 // folded properly.
14557 if (N->getOpcode() != ISD::DELETED_NODE)
14558 DCI.AddToWorklist(N);
14559 return SDValue(N, 0);
14560 }
14561
14562 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14563 if (SDValue DemandedSrc =
14564 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14565 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14566
14567 return SDValue();
14568}
14569
14570SDValue SITargetLowering::performClampCombine(SDNode *N,
14571 DAGCombinerInfo &DCI) const {
14572 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14573 if (!CSrc)
14574 return SDValue();
14575
14576 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14577 const APFloat &F = CSrc->getValueAPF();
14578 APFloat Zero = APFloat::getZero(F.getSemantics());
14579 if (F < Zero ||
14580 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14581 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14582 }
14583
14584 APFloat One(F.getSemantics(), "1.0");
14585 if (F > One)
14586 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14587
14588 return SDValue(CSrc, 0);
14589}
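// Illustrative constant folds performed above: clamp(-0.5) -> 0.0,
// clamp(2.5) -> 1.0, clamp(0.25) -> 0.25, and clamp(NaN) -> 0.0 when the
// function is in DX10Clamp mode.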
14590
14591
14592SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14593 DAGCombinerInfo &DCI) const {
14594 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14595 return SDValue();
14596 switch (N->getOpcode()) {
14597 case ISD::ADD:
14598 return performAddCombine(N, DCI);
14599 case ISD::SUB:
14600 return performSubCombine(N, DCI);
14601 case ISD::UADDO_CARRY:
14602 case ISD::USUBO_CARRY:
14603 return performAddCarrySubCarryCombine(N, DCI);
14604 case ISD::FADD:
14605 return performFAddCombine(N, DCI);
14606 case ISD::FSUB:
14607 return performFSubCombine(N, DCI);
14608 case ISD::FDIV:
14609 return performFDivCombine(N, DCI);
14610 case ISD::SETCC:
14611 return performSetCCCombine(N, DCI);
14612 case ISD::FMAXNUM:
14613 case ISD::FMINNUM:
14614 case ISD::FMAXNUM_IEEE:
14615 case ISD::FMINNUM_IEEE:
14616 case ISD::FMAXIMUM:
14617 case ISD::FMINIMUM:
14618 case ISD::SMAX:
14619 case ISD::SMIN:
14620 case ISD::UMAX:
14621 case ISD::UMIN:
14622 case AMDGPUISD::FMIN_LEGACY:
14623 case AMDGPUISD::FMAX_LEGACY:
14624 return performMinMaxCombine(N, DCI);
14625 case ISD::FMA:
14626 return performFMACombine(N, DCI);
14627 case ISD::AND:
14628 return performAndCombine(N, DCI);
14629 case ISD::OR:
14630 return performOrCombine(N, DCI);
14631 case ISD::FSHR: {
14632 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14633 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14634 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14635 return matchPERM(N, DCI);
14636 }
14637 break;
14638 }
14639 case ISD::XOR:
14640 return performXorCombine(N, DCI);
14641 case ISD::ZERO_EXTEND:
14642 return performZeroExtendCombine(N, DCI);
14643 case ISD::SIGN_EXTEND_INREG:
14644 return performSignExtendInRegCombine(N, DCI);
14645 case AMDGPUISD::FP_CLASS:
14646 return performClassCombine(N, DCI);
14647 case ISD::FCANONICALIZE:
14648 return performFCanonicalizeCombine(N, DCI);
14649 case AMDGPUISD::RCP:
14650 return performRcpCombine(N, DCI);
14651 case ISD::FLDEXP:
14652 case AMDGPUISD::FRACT:
14653 case AMDGPUISD::RSQ:
14654 case AMDGPUISD::RCP_LEGACY:
14655 case AMDGPUISD::RCP_IFLAG:
14656 case AMDGPUISD::RSQ_CLAMP: {
14657 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14658 SDValue Src = N->getOperand(0);
14659 if (Src.isUndef())
14660 return Src;
14661 break;
14662 }
14663 case ISD::SINT_TO_FP:
14664 case ISD::UINT_TO_FP:
14665 return performUCharToFloatCombine(N, DCI);
14666 case ISD::FCOPYSIGN:
14667 return performFCopySignCombine(N, DCI);
14668 case AMDGPUISD::CVT_F32_UBYTE0:
14669 case AMDGPUISD::CVT_F32_UBYTE1:
14670 case AMDGPUISD::CVT_F32_UBYTE2:
14671 case AMDGPUISD::CVT_F32_UBYTE3:
14672 return performCvtF32UByteNCombine(N, DCI);
14673 case AMDGPUISD::FMED3:
14674 return performFMed3Combine(N, DCI);
14675 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14676 return performCvtPkRTZCombine(N, DCI);
14677 case AMDGPUISD::CLAMP:
14678 return performClampCombine(N, DCI);
14679 case ISD::SCALAR_TO_VECTOR: {
14680 SelectionDAG &DAG = DCI.DAG;
14681 EVT VT = N->getValueType(0);
14682
14683 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14684 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14685 SDLoc SL(N);
14686 SDValue Src = N->getOperand(0);
14687 EVT EltVT = Src.getValueType();
14688 if (EltVT != MVT::i16)
14689 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14690
14691 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14692 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14693 }
14694
14695 break;
14696 }
14697 case ISD::EXTRACT_VECTOR_ELT:
14698 return performExtractVectorEltCombine(N, DCI);
14699 case ISD::INSERT_VECTOR_ELT:
14700 return performInsertVectorEltCombine(N, DCI);
14701 case ISD::FP_ROUND:
14702 return performFPRoundCombine(N, DCI);
14703 case ISD::LOAD: {
14704 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14705 return Widened;
14706 [[fallthrough]];
14707 }
14708 default: {
14709 if (!DCI.isBeforeLegalize()) {
14710 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14711 return performMemSDNodeCombine(MemNode, DCI);
14712 }
14713
14714 break;
14715 }
14716 }
14717
14718 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14719}
14720
14721/// Helper function for adjustWritemask
14722static unsigned SubIdx2Lane(unsigned Idx) {
14723 switch (Idx) {
14724 default: return ~0u;
14725 case AMDGPU::sub0: return 0;
14726 case AMDGPU::sub1: return 1;
14727 case AMDGPU::sub2: return 2;
14728 case AMDGPU::sub3: return 3;
14729 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14730 }
14731}
14732
14733/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14734SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14735 SelectionDAG &DAG) const {
14736 unsigned Opcode = Node->getMachineOpcode();
14737
14738 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14739 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14740 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14741 return Node; // not implemented for D16
14742
14743 SDNode *Users[5] = { nullptr };
14744 unsigned Lane = 0;
14745 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14746 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14747 unsigned NewDmask = 0;
14748 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14749 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14750 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14751 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14752 ? true
14753 : false;
14754 unsigned TFCLane = 0;
14755 bool HasChain = Node->getNumValues() > 1;
14756
14757 if (OldDmask == 0) {
14758 // These are folded out, but on the chance it happens don't assert.
14759 return Node;
14760 }
14761
14762 unsigned OldBitsSet = llvm::popcount(OldDmask);
14763 // Work out which is the TFE/LWE lane if that is enabled.
14764 if (UsesTFC) {
14765 TFCLane = OldBitsSet;
14766 }
14767
14768 // Try to figure out the used register components
14769 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14770 I != E; ++I) {
14771
14772 // Don't look at users of the chain.
14773 if (I.getUse().getResNo() != 0)
14774 continue;
14775
14776 // Abort if we can't understand the usage
14777 if (!I->isMachineOpcode() ||
14778 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14779 return Node;
14780
14781 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14782 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14783 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14784 // set, etc.
14785 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14786 if (Lane == ~0u)
14787 return Node;
14788
14789 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14790 if (UsesTFC && Lane == TFCLane) {
14791 Users[Lane] = *I;
14792 } else {
14793 // Set which texture component corresponds to the lane.
14794 unsigned Comp;
14795 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14796 Comp = llvm::countr_zero(Dmask);
14797 Dmask &= ~(1 << Comp);
14798 }
14799
14800 // Abort if we have more than one user per component.
14801 if (Users[Lane])
14802 return Node;
14803
14804 Users[Lane] = *I;
14805 NewDmask |= 1 << Comp;
14806 }
14807 }
14808
14809 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14810 bool NoChannels = !NewDmask;
14811 if (NoChannels) {
14812 if (!UsesTFC) {
14813 // No uses of the result and not using TFC. Then do nothing.
14814 return Node;
14815 }
14816 // If the original dmask has one channel - then nothing to do
14817 if (OldBitsSet == 1)
14818 return Node;
14819 // Use an arbitrary dmask - required for the instruction to work
14820 NewDmask = 1;
14821 }
14822 // Abort if there's no change
14823 if (NewDmask == OldDmask)
14824 return Node;
14825
14826 unsigned BitsSet = llvm::popcount(NewDmask);
14827
14828 // Check for TFE or LWE - increase the number of channels by one to account
14829 // for the extra return value
14830 // This will need adjustment for D16 if this is also included in
14831 // adjustWriteMask (this function) but at present D16 are excluded.
14832 unsigned NewChannels = BitsSet + UsesTFC;
14833
14834 int NewOpcode =
14835 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14836 assert(NewOpcode != -1 &&
14837 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14838 "failed to find equivalent MIMG op");
14839
14840 // Adjust the writemask in the node
14841 SmallVector<SDValue, 12> Ops;
14842 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14843 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14844 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14845
14846 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14847
14848 MVT ResultVT = NewChannels == 1 ?
14849 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14850 NewChannels == 5 ? 8 : NewChannels);
14851 SDVTList NewVTList = HasChain ?
14852 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14853
14854
14855 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14856 NewVTList, Ops);
14857
14858 if (HasChain) {
14859 // Update chain.
14860 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14861 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14862 }
14863
14864 if (NewChannels == 1) {
14865 assert(Node->hasNUsesOfValue(1, 0));
14866 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14867 SDLoc(Node), Users[Lane]->getValueType(0),
14868 SDValue(NewNode, 0));
14869 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14870 return nullptr;
14871 }
14872
14873 // Update the users of the node with the new indices
14874 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14875 SDNode *User = Users[i];
14876 if (!User) {
14877 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14878 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14879 if (i || !NoChannels)
14880 continue;
14881 } else {
14882 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14883 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14884 if (NewUser != User) {
14885 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14886 DAG.RemoveDeadNode(User);
14887 }
14888 }
14889
14890 switch (Idx) {
14891 default: break;
14892 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14893 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14894 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14895 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14896 }
14897 }
14898
14899 DAG.RemoveDeadNode(Node);
14900 return nullptr;
14901}
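// Illustrative sketch: an image load with dmask = 0xf whose only users
// extract sub0 and sub2 gets NewDmask = 0x5; the node is rewritten to the
// two-channel variant and the sub2 user is remapped to sub1 of the new
// result.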
14902
14903static bool isFrameIndexOp(SDValue Op) {
14904 if (Op.getOpcode() == ISD::AssertZext)
14905 Op = Op.getOperand(0);
14906
14907 return isa<FrameIndexSDNode>(Op);
14908}
14909
14910/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14911/// with frame index operands.
14912/// LLVM assumes that inputs to these instructions are registers.
14913SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14914 SelectionDAG &DAG) const {
14915 if (Node->getOpcode() == ISD::CopyToReg) {
14916 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14917 SDValue SrcVal = Node->getOperand(2);
14918
14919 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14920 // to try understanding copies to physical registers.
14921 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
14922 SDLoc SL(Node);
14923 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14924 SDValue VReg = DAG.getRegister(
14925 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14926
14927 SDNode *Glued = Node->getGluedNode();
14928 SDValue ToVReg
14929 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
14930 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
14931 SDValue ToResultReg
14932 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
14933 VReg, ToVReg.getValue(1));
14934 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
14935 DAG.RemoveDeadNode(Node);
14936 return ToResultReg.getNode();
14937 }
14938 }
14939
14940 SmallVector<SDValue, 8> Ops;
14941 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
14942 if (!isFrameIndexOp(Node->getOperand(i))) {
14943 Ops.push_back(Node->getOperand(i));
14944 continue;
14945 }
14946
14947 SDLoc DL(Node);
14948 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
14949 Node->getOperand(i).getValueType(),
14950 Node->getOperand(i)), 0));
14951 }
14952
14953 return DAG.UpdateNodeOperands(Node, Ops);
14954}
14955
14956/// Fold the instructions after selecting them.
14957/// Returns null if users were already updated.
14958SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
14959 SelectionDAG &DAG) const {
14960 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14961 unsigned Opcode = Node->getMachineOpcode();
14962
14963 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
14964 !TII->isGather4(Opcode) &&
14965 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
14966 return adjustWritemask(Node, DAG);
14967 }
14968
14969 if (Opcode == AMDGPU::INSERT_SUBREG ||
14970 Opcode == AMDGPU::REG_SEQUENCE) {
14971 legalizeTargetIndependentNode(Node, DAG);
14972 return Node;
14973 }
14974
14975 switch (Opcode) {
14976 case AMDGPU::V_DIV_SCALE_F32_e64:
14977 case AMDGPU::V_DIV_SCALE_F64_e64: {
14978 // Satisfy the operand register constraint when one of the inputs is
14979 // undefined. Ordinarily each undef value will have its own implicit_def of
14980 // a vreg, so force these to use a single register.
14981 SDValue Src0 = Node->getOperand(1);
14982 SDValue Src1 = Node->getOperand(3);
14983 SDValue Src2 = Node->getOperand(5);
14984
14985 if ((Src0.isMachineOpcode() &&
14986 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
14987 (Src0 == Src1 || Src0 == Src2))
14988 break;
14989
14990 MVT VT = Src0.getValueType().getSimpleVT();
14991 const TargetRegisterClass *RC =
14992 getRegClassFor(VT, Src0.getNode()->isDivergent());
14993
14995 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
14996
14997 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
14998 UndefReg, Src0, SDValue());
14999
15000 // src0 must be the same register as src1 or src2, even if the value is
15001 // undefined, so make sure we don't violate this constraint.
15002 if (Src0.isMachineOpcode() &&
15003 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15004 if (Src1.isMachineOpcode() &&
15005 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15006 Src0 = Src1;
15007 else if (Src2.isMachineOpcode() &&
15008 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15009 Src0 = Src2;
15010 else {
15011 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15012 Src0 = UndefReg;
15013 Src1 = UndefReg;
15014 }
15015 } else
15016 break;
15017
15018 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15019 Ops[1] = Src0;
15020 Ops[3] = Src1;
15021 Ops[5] = Src2;
15022 Ops.push_back(ImpDef.getValue(1));
15023 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15024 }
15025 default:
15026 break;
15027 }
15028
15029 return Node;
15030}
15031
15032// Any MIMG instructions that use tfe or lwe require an initialization of the
15033// result register that will be written in the case of a memory access failure.
15034// The required code is also added to tie this init code to the result of the
15035// img instruction.
15036void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
15037 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15038 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15039 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15040 MachineBasicBlock &MBB = *MI.getParent();
15041
15042 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15043 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15044 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15045
15046 if (!TFE && !LWE) // intersect_ray
15047 return;
15048
15049 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15050 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15051 unsigned D16Val = D16 ? D16->getImm() : 0;
15052
15053 if (!TFEVal && !LWEVal)
15054 return;
15055
15056 // At least one of TFE or LWE is non-zero.
15057 // We have to insert a suitable initialization of the result value and
15058 // tie this to the dest of the image instruction.
15059
15060 const DebugLoc &DL = MI.getDebugLoc();
15061
15062 int DstIdx =
15063 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15064
15065 // Calculate which dword we have to initialize to 0.
15066 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15067
15068 // check that dmask operand is found.
15069 assert(MO_Dmask && "Expected dmask operand in instruction");
15070
15071 unsigned dmask = MO_Dmask->getImm();
15072 // Determine the number of active lanes taking into account the
15073 // Gather4 special case
15074 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15075
15076 bool Packed = !Subtarget->hasUnpackedD16VMem();
15077
15078 unsigned InitIdx =
15079 D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
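 // Worked example (illustrative, the values are hypothetical): for a D16
 // image load with dmask = 0b1011 on a target with packed D16, ActiveLanes =
 // popcount(0b1011) = 3, so InitIdx = ((3 + 1) >> 1) + 1 = 3: two packed data
 // dwords plus one dword for the TFE/LWE error indication. Without packed D16
 // it would be ActiveLanes + 1 = 4 dwords.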
15080
15081 // Abandon the attempt if the dst size isn't large enough
15082 // - this is in fact an error, but it is picked up elsewhere and
15083 // reported correctly.
15084 uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15085 if (DstSize < InitIdx)
15086 return;
15087
15088 // Create a register for the initialization value.
15089 Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15090 unsigned NewDst = 0; // Final initialized value will be in here
15091
15092 // If PRTStrictNull feature is enabled (the default) then initialize
15093 // all the result registers to 0, otherwise just the error indication
15094 // register (VGPRn+1)
15095 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15096 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15097
15098 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15099 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15100 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15101 // Initialize dword
15102 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15103 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15104 .addImm(0);
15105 // Insert into the super-reg
15106 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15107 .addReg(PrevDst)
15108 .addReg(SubReg)
15109 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15110
15111 PrevDst = NewDst;
15112 }
15113
15114 // Add as an implicit operand
15115 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15116
15117 // Tie the just added implicit operand to the dst
15118 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15119}
15120
15121/// Assign the register class depending on the number of
15122/// bits set in the writemask
15123void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15124 SDNode *Node) const {
15125 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15126
15127 MachineFunction *MF = MI.getParent()->getParent();
15128 MachineRegisterInfo &MRI = MF->getRegInfo();
15129 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15130
15131 if (TII->isVOP3(MI.getOpcode())) {
15132 // Make sure constant bus requirements are respected.
15133 TII->legalizeOperandsVOP3(MRI, MI);
15134
15135 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15136 // This saves a chain-copy of registers and better balances register
15137 // use between vgpr and agpr, as agpr tuples tend to be big.
15138 if (!MI.getDesc().operands().empty()) {
15139 unsigned Opc = MI.getOpcode();
15140 bool HasAGPRs = Info->mayNeedAGPRs();
15141 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15142 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15143 for (auto I :
15144 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15145 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15146 if (I == -1)
15147 break;
15148 if ((I == Src2Idx) && (HasAGPRs))
15149 break;
15150 MachineOperand &Op = MI.getOperand(I);
15151 if (!Op.isReg() || !Op.getReg().isVirtual())
15152 continue;
15153 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15154 if (!TRI->hasAGPRs(RC))
15155 continue;
15156 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15157 if (!Src || !Src->isCopy() ||
15158 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15159 continue;
15160 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15161 // All uses of agpr64 and agpr32 can also accept vgpr except for
15162 // v_accvgpr_read, but we do not produce agpr reads during selection,
15163 // so no use checks are needed.
15164 MRI.setRegClass(Op.getReg(), NewRC);
15165 }
15166
15167 if (!HasAGPRs)
15168 return;
15169
15170 // Resolve the rest of AV operands to AGPRs.
15171 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15172 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15173 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15174 if (TRI->isVectorSuperClass(RC)) {
15175 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15176 MRI.setRegClass(Src2->getReg(), NewRC);
15177 if (Src2->isTied())
15178 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15179 }
15180 }
15181 }
15182 }
15183
15184 return;
15185 }
15186
15187 if (TII->isImage(MI)) {
15188 if (!MI.mayStore())
15189 AddIMGInit(MI);
15190 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15191 }
15192}
15193
15194static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15195 uint64_t Val) {
15196 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15197 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15198}
15199
15200MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15201 const SDLoc &DL,
15202 SDValue Ptr) const {
15203 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15204
15205 // Build the half of the subregister with the constants before building the
15206 // full 128-bit register. If we are building multiple resource descriptors,
15207 // this will allow CSEing of the 2-component register.
15208 const SDValue Ops0[] = {
15209 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15210 buildSMovImm32(DAG, DL, 0),
15211 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15212 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15213 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15214 };
15215
15216 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15217 MVT::v2i32, Ops0), 0);
15218
15219 // Combine the constants and the pointer.
15220 const SDValue Ops1[] = {
15221 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15222 Ptr,
15223 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15224 SubRegHi,
15225 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15226 };
15227
15228 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15229}
15230
15231/// Return a resource descriptor with the 'Add TID' bit enabled
15232/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15233/// of the resource descriptor) to create an offset, which is added to
15234/// the resource pointer.
15235MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15236 SDValue Ptr, uint32_t RsrcDword1,
15237 uint64_t RsrcDword2And3) const {
15238 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15239 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15240 if (RsrcDword1) {
15241 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15242 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15243 0);
15244 }
15245
15246 SDValue DataLo = buildSMovImm32(DAG, DL,
15247 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15248 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15249
15250 const SDValue Ops[] = {
15251 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15252 PtrLo,
15253 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15254 PtrHi,
15255 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15256 DataLo,
15257 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15258 DataHi,
15259 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15260 };
15261
15262 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15263}
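// Illustrative sketch (not part of this file): the dword layout of the
// descriptor built by the code above, for a hypothetical call
//   buildRSRC(DAG, DL, Ptr, /*RsrcDword1=*/0, /*RsrcDword2And3=*/Rsrc23);
// sub0 = low 32 bits of Ptr
// sub1 = high 32 bits of Ptr (OR'd with RsrcDword1 when that is non-zero)
// sub2 = Rsrc23 & 0xFFFFFFFF
// sub3 = Rsrc23 >> 32, the half that holds the stride in bits [61:48] used by
//        the 'Add TID' addressing described above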
15264
15265//===----------------------------------------------------------------------===//
15266// SI Inline Assembly Support
15267//===----------------------------------------------------------------------===//
15268
15269std::pair<unsigned, const TargetRegisterClass *>
15270SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15271 StringRef Constraint,
15272 MVT VT) const {
15273 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15274
15275 const TargetRegisterClass *RC = nullptr;
15276 if (Constraint.size() == 1) {
15277 const unsigned BitWidth = VT.getSizeInBits();
15278 switch (Constraint[0]) {
15279 default:
15280 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15281 case 's':
15282 case 'r':
15283 switch (BitWidth) {
15284 case 16:
15285 RC = &AMDGPU::SReg_32RegClass;
15286 break;
15287 case 64:
15288 RC = &AMDGPU::SGPR_64RegClass;
15289 break;
15290 default:
15291 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15292 if (!RC)
15293 return std::pair(0U, nullptr);
15294 break;
15295 }
15296 break;
15297 case 'v':
15298 switch (BitWidth) {
15299 case 16:
15300 RC = &AMDGPU::VGPR_32RegClass;
15301 break;
15302 default:
15303 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15304 if (!RC)
15305 return std::pair(0U, nullptr);
15306 break;
15307 }
15308 break;
15309 case 'a':
15310 if (!Subtarget->hasMAIInsts())
15311 break;
15312 switch (BitWidth) {
15313 case 16:
15314 RC = &AMDGPU::AGPR_32RegClass;
15315 break;
15316 default:
15317 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15318 if (!RC)
15319 return std::pair(0U, nullptr);
15320 break;
15321 }
15322 break;
15323 }
15324 // We actually support i128, i16 and f16 as inline parameters
15325 // even if they are not reported as legal
15326 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15327 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15328 return std::pair(0U, RC);
15329 }
15330
15331 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15332 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15333 if (RegName.consume_front("v")) {
15334 RC = &AMDGPU::VGPR_32RegClass;
15335 } else if (RegName.consume_front("s")) {
15336 RC = &AMDGPU::SGPR_32RegClass;
15337 } else if (RegName.consume_front("a")) {
15338 RC = &AMDGPU::AGPR_32RegClass;
15339 }
15340
15341 if (RC) {
15342 uint32_t Idx;
15343 if (RegName.consume_front("[")) {
15344 uint32_t End;
15345 bool Failed = RegName.consumeInteger(10, Idx);
15346 Failed |= !RegName.consume_front(":");
15347 Failed |= RegName.consumeInteger(10, End);
15348 Failed |= !RegName.consume_back("]");
15349 if (!Failed) {
15350 uint32_t Width = (End - Idx + 1) * 32;
15351 MCRegister Reg = RC->getRegister(Idx);
15352 if (SIRegisterInfo::isVGPRClass(RC))
15353 RC = TRI->getVGPRClassForBitWidth(Width);
15354 else if (SIRegisterInfo::isSGPRClass(RC))
15355 RC = TRI->getSGPRClassForBitWidth(Width);
15356 else if (SIRegisterInfo::isAGPRClass(RC))
15357 RC = TRI->getAGPRClassForBitWidth(Width);
15358 if (RC) {
15359 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15360 return std::pair(Reg, RC);
15361 }
15362 }
15363 } else {
15364 bool Failed = RegName.getAsInteger(10, Idx);
15365 if (!Failed && Idx < RC->getNumRegs())
15366 return std::pair(RC->getRegister(Idx), RC);
15367 }
15368 }
15369 }
15370
15371 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15372 if (Ret.first)
15373 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15374
15375 return Ret;
15376}
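// Illustrative examples (not part of this file) of how the constraints
// handled above resolve, assuming the named registers exist on the subtarget:
//   "v" with a 32-bit operand  -> some register in VGPR_32RegClass
//   "{s17}"                    -> SGPR_32RegClass, fixed register s17
//   "{v[8:9]}"                 -> Width = (9 - 8 + 1) * 32 = 64, so the
//                                 64-bit VGPR class whose sub0 is v8,
//                                 i.e. the v[8:9] pair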
15377
15378static bool isImmConstraint(StringRef Constraint) {
15379 if (Constraint.size() == 1) {
15380 switch (Constraint[0]) {
15381 default: break;
15382 case 'I':
15383 case 'J':
15384 case 'A':
15385 case 'B':
15386 case 'C':
15387 return true;
15388 }
15389 } else if (Constraint == "DA" ||
15390 Constraint == "DB") {
15391 return true;
15392 }
15393 return false;
15394}
15395
15396SITargetLowering::ConstraintType
15397SITargetLowering::getConstraintType(StringRef Constraint) const {
15398 if (Constraint.size() == 1) {
15399 switch (Constraint[0]) {
15400 default: break;
15401 case 's':
15402 case 'v':
15403 case 'a':
15404 return C_RegisterClass;
15405 }
15406 }
15407 if (isImmConstraint(Constraint)) {
15408 return C_Other;
15409 }
15410 return TargetLowering::getConstraintType(Constraint);
15411}
15412
15413static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15415 Val = Val & maskTrailingOnes<uint64_t>(Size);
15416 }
15417 return Val;
15418}
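// Worked example (illustrative): when the masking above applies to a 16-bit
// operand, a sign-extended immediate such as 0xFFFFFFFFFFFF8000 (-32768) is
// ANDed with maskTrailingOnes<uint64_t>(16) = 0xFFFF, leaving 0x8000, so only
// the bits the 16-bit encoding can actually hold are kept.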
15419
15420void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15421 StringRef Constraint,
15422 std::vector<SDValue> &Ops,
15423 SelectionDAG &DAG) const {
15424 if (isImmConstraint(Constraint)) {
15425 uint64_t Val;
15426 if (getAsmOperandConstVal(Op, Val) &&
15427 checkAsmConstraintVal(Op, Constraint, Val)) {
15428 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15429 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15430 }
15431 } else {
15432 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15433 }
15434}
15435
15436bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15437 unsigned Size = Op.getScalarValueSizeInBits();
15438 if (Size > 64)
15439 return false;
15440
15441 if (Size == 16 && !Subtarget->has16BitInsts())
15442 return false;
15443
15444 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15445 Val = C->getSExtValue();
15446 return true;
15447 }
15448 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15449 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15450 return true;
15451 }
15452 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15453 if (Size != 16 || Op.getNumOperands() != 2)
15454 return false;
15455 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15456 return false;
15457 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15458 Val = C->getSExtValue();
15459 return true;
15460 }
15461 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15462 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15463 return true;
15464 }
15465 }
15466
15467 return false;
15468}
15469
15470bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15471 uint64_t Val) const {
15472 if (Constraint.size() == 1) {
15473 switch (Constraint[0]) {
15474 case 'I':
15475 return AMDGPU::isInlinableIntLiteral(Val);
15476 case 'J':
15477 return isInt<16>(Val);
15478 case 'A':
15479 return checkAsmConstraintValA(Op, Val);
15480 case 'B':
15481 return isInt<32>(Val);
15482 case 'C':
15483 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15485 default:
15486 break;
15487 }
15488 } else if (Constraint.size() == 2) {
15489 if (Constraint == "DA") {
15490 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15491 int64_t LoBits = static_cast<int32_t>(Val);
15492 return checkAsmConstraintValA(Op, HiBits, 32) &&
15493 checkAsmConstraintValA(Op, LoBits, 32);
15494 }
15495 if (Constraint == "DB") {
15496 return true;
15497 }
15498 }
15499 llvm_unreachable("Invalid asm constraint");
15500}
15501
15502bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15503 unsigned MaxSize) const {
15504 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15505 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15506 if (Size == 16) {
15507 MVT VT = Op.getSimpleValueType();
15508 switch (VT.SimpleTy) {
15509 default:
15510 return false;
15511 case MVT::i16:
15512 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15513 case MVT::f16:
15514 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15515 case MVT::bf16:
15516 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15517 case MVT::v2i16:
15518 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15519 case MVT::v2f16:
15520 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15521 case MVT::v2bf16:
15522 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15523 }
15524 }
15525 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15526 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15527 return true;
15528 return false;
15529}
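// Illustrative examples (not part of this file), assuming a 32-bit operand:
//   Val = 0x3F800000 (the bit pattern of 1.0f) is an inline constant, so the
//   check above succeeds; Val = 0x12345678 is neither a small integer nor an
//   inlinable FP pattern, so it fails. For 16-bit operands the answer also
//   depends on the exact scalar type (i16, f16, bf16 or their 2-element
//   vectors), as handled by the switch above.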
15530
15531static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15532 switch (UnalignedClassID) {
15533 case AMDGPU::VReg_64RegClassID:
15534 return AMDGPU::VReg_64_Align2RegClassID;
15535 case AMDGPU::VReg_96RegClassID:
15536 return AMDGPU::VReg_96_Align2RegClassID;
15537 case AMDGPU::VReg_128RegClassID:
15538 return AMDGPU::VReg_128_Align2RegClassID;
15539 case AMDGPU::VReg_160RegClassID:
15540 return AMDGPU::VReg_160_Align2RegClassID;
15541 case AMDGPU::VReg_192RegClassID:
15542 return AMDGPU::VReg_192_Align2RegClassID;
15543 case AMDGPU::VReg_224RegClassID:
15544 return AMDGPU::VReg_224_Align2RegClassID;
15545 case AMDGPU::VReg_256RegClassID:
15546 return AMDGPU::VReg_256_Align2RegClassID;
15547 case AMDGPU::VReg_288RegClassID:
15548 return AMDGPU::VReg_288_Align2RegClassID;
15549 case AMDGPU::VReg_320RegClassID:
15550 return AMDGPU::VReg_320_Align2RegClassID;
15551 case AMDGPU::VReg_352RegClassID:
15552 return AMDGPU::VReg_352_Align2RegClassID;
15553 case AMDGPU::VReg_384RegClassID:
15554 return AMDGPU::VReg_384_Align2RegClassID;
15555 case AMDGPU::VReg_512RegClassID:
15556 return AMDGPU::VReg_512_Align2RegClassID;
15557 case AMDGPU::VReg_1024RegClassID:
15558 return AMDGPU::VReg_1024_Align2RegClassID;
15559 case AMDGPU::AReg_64RegClassID:
15560 return AMDGPU::AReg_64_Align2RegClassID;
15561 case AMDGPU::AReg_96RegClassID:
15562 return AMDGPU::AReg_96_Align2RegClassID;
15563 case AMDGPU::AReg_128RegClassID:
15564 return AMDGPU::AReg_128_Align2RegClassID;
15565 case AMDGPU::AReg_160RegClassID:
15566 return AMDGPU::AReg_160_Align2RegClassID;
15567 case AMDGPU::AReg_192RegClassID:
15568 return AMDGPU::AReg_192_Align2RegClassID;
15569 case AMDGPU::AReg_256RegClassID:
15570 return AMDGPU::AReg_256_Align2RegClassID;
15571 case AMDGPU::AReg_512RegClassID:
15572 return AMDGPU::AReg_512_Align2RegClassID;
15573 case AMDGPU::AReg_1024RegClassID:
15574 return AMDGPU::AReg_1024_Align2RegClassID;
15575 default:
15576 return -1;
15577 }
15578}
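// Example (illustrative): getAlignedAGPRClassID(AMDGPU::VReg_64RegClassID)
// yields AMDGPU::VReg_64_Align2RegClassID, the variant of the 64-bit VGPR
// tuple class restricted to even-aligned register pairs, which is what
// subtargets with needsAlignedVGPRs() (e.g. gfx90a) require.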
15579
15580// Figure out which registers should be reserved for stack access. Only after
15581// the function is legalized do we know all of the non-spill stack objects or if
15582// calls are present.
15583void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15584 MachineRegisterInfo &MRI = MF.getRegInfo();
15585 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15586 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15587 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15588 const SIInstrInfo *TII = ST.getInstrInfo();
15589
15590 if (Info->isEntryFunction()) {
15591 // Callable functions have fixed registers used for stack access.
15592 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15593 }
15594
15595 // TODO: Move this logic to getReservedRegs()
15596 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15597 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15598 Register SReg = ST.isWave32()
15599 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15600 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15601 &AMDGPU::SGPR_64RegClass);
15602 Info->setSGPRForEXECCopy(SReg);
15603
15604 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15605 Info->getStackPtrOffsetReg()));
15606 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15607 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15608
15609 // We need to worry about replacing the default register with itself in case
15610 // of MIR testcases missing the MFI.
15611 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15612 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15613
15614 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15615 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15616
15617 Info->limitOccupancy(MF);
15618
15619 if (ST.isWave32() && !MF.empty()) {
15620 for (auto &MBB : MF) {
15621 for (auto &MI : MBB) {
15622 TII->fixImplicitOperands(MI);
15623 }
15624 }
15625 }
15626
15627 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15628 // classes if required. Ideally the register class constraints would differ
15629 // per-subtarget, but there's no easy way to achieve that right now. This is
15630 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15631 // from using them as the register class for legal types.
15632 if (ST.needsAlignedVGPRs()) {
15633 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15634 const Register Reg = Register::index2VirtReg(I);
15635 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15636 if (!RC)
15637 continue;
15638 int NewClassID = getAlignedAGPRClassID(RC->getID());
15639 if (NewClassID != -1)
15640 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15641 }
15642 }
15643
15644 TargetLoweringBase::finalizeLowering(MF);
15645}
15646
15647void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15648 KnownBits &Known,
15649 const APInt &DemandedElts,
15650 const SelectionDAG &DAG,
15651 unsigned Depth) const {
15652 Known.resetAll();
15653 unsigned Opc = Op.getOpcode();
15654 switch (Opc) {
15655 case ISD::INTRINSIC_WO_CHAIN: {
15656 unsigned IID = Op.getConstantOperandVal(0);
15657 switch (IID) {
15658 case Intrinsic::amdgcn_mbcnt_lo:
15659 case Intrinsic::amdgcn_mbcnt_hi: {
15660 const GCNSubtarget &ST =
15661 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15662 // These return at most the (wavefront size - 1) + src1
15663 // As long as src1 is an immediate we can calc known bits
15664 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15665 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15666 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15667 // Cater for potential carry
15668 MaxActiveBits += Src1ValBits ? 1 : 0;
15669 unsigned Size = Op.getValueType().getSizeInBits();
15670 if (MaxActiveBits < Size)
15671 Known.Zero.setHighBits(Size - MaxActiveBits);
15672 return;
15673 }
15674 }
15675 break;
15676 }
15677 }
15678 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15679 Op, Known, DemandedElts, DAG, Depth);
15680}
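// Worked example (illustrative): for amdgcn.mbcnt.lo on a wave64 subtarget
// with a constant src1 of 0, Src1ValBits = 0 and getWavefrontSizeLog2() = 6,
// so MaxActiveBits stays at 6 (no carry bit is added when src1 contributes no
// bits). For the 32-bit result this marks the top 32 - 6 = 26 bits as known
// zero.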
15681
15682void SITargetLowering::computeKnownBitsForFrameIndex(
15683 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15684 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15685
15686 // Set the high bits to zero based on the maximum allowed scratch size per
15687 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15688 // calculation won't overflow, so assume the sign bit is never set.
15689 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15690}
15691
15692static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15693 KnownBits &Known, unsigned Dim) {
15694 unsigned MaxValue =
15695 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15696 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15697}
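// Worked example (illustrative): if the maximum workitem ID in the given
// dimension is 1023 (a workgroup size limit of 1024), countl_zero(1023u) over
// 32 bits is 22, so the top 22 bits of the workitem ID are known zero.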
15698
15699void SITargetLowering::computeKnownBitsForTargetInstr(
15700 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15701 const MachineRegisterInfo &MRI, unsigned Depth) const {
15702 const MachineInstr *MI = MRI.getVRegDef(R);
15703 switch (MI->getOpcode()) {
15704 case AMDGPU::G_INTRINSIC:
15705 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15706 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15707 case Intrinsic::amdgcn_workitem_id_x:
15708 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15709 break;
15710 case Intrinsic::amdgcn_workitem_id_y:
15711 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15712 break;
15713 case Intrinsic::amdgcn_workitem_id_z:
15714 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15715 break;
15716 case Intrinsic::amdgcn_mbcnt_lo:
15717 case Intrinsic::amdgcn_mbcnt_hi: {
15718 // These return at most the wavefront size - 1.
15719 unsigned Size = MRI.getType(R).getSizeInBits();
15720 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15721 break;
15722 }
15723 case Intrinsic::amdgcn_groupstaticsize: {
15724 // We can report everything over the maximum size as 0. We can't report
15725 // based on the actual size because we don't know if it's accurate or not
15726 // at any given point.
15727 Known.Zero.setHighBits(
15728 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15729 break;
15730 }
15731 }
15732 break;
15733 }
15734 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15735 Known.Zero.setHighBits(24);
15736 break;
15737 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15738 Known.Zero.setHighBits(16);
15739 break;
15740 case AMDGPU::G_AMDGPU_SMED3:
15741 case AMDGPU::G_AMDGPU_UMED3: {
15742 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15743
15744 KnownBits Known2;
15745 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15746 if (Known2.isUnknown())
15747 break;
15748
15749 KnownBits Known1;
15750 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15751 if (Known1.isUnknown())
15752 break;
15753
15754 KnownBits Known0;
15755 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15756 if (Known0.isUnknown())
15757 break;
15758
15759 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15760 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15761 Known.One = Known0.One & Known1.One & Known2.One;
15762 break;
15763 }
15764 }
15765}
15766
15767Align SITargetLowering::computeKnownAlignForTargetInstr(
15768 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15769 unsigned Depth) const {
15770 const MachineInstr *MI = MRI.getVRegDef(R);
15771 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15772 // FIXME: Can this move to generic code? What about the case where the call
15773 // site specifies a lower alignment?
15774 Intrinsic::ID IID = GI->getIntrinsicID();
15775 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15776 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15777 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15778 return *RetAlign;
15779 }
15780 return Align(1);
15781}
15782
15783Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15784 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15785 const Align CacheLineAlign = Align(64);
15786
15787 // Pre-GFX10 targets did not benefit from loop alignment
15788 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15789 getSubtarget()->hasInstFwdPrefetchBug())
15790 return PrefAlign;
15791
15792 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
15793 // By default the prefetcher keeps one cache line behind and reads two ahead.
15794 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15795 // behind and one ahead.
15796 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15797 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15798 // does not need alignment.
15799 // Otherwise, if the loop is at most 128 bytes, we do not need to modify the prefetch settings;
15800 // if it is at most 192 bytes, we need two lines behind.
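 // Worked example (illustrative): a loop body of 100 bytes is over 64 but
 // within 128 bytes, so it only gets the 64-byte alignment and the prefetch
 // mode is left untouched; a loop body of 160 bytes is within 192 bytes, so
 // besides the alignment the code below also inserts S_INST_PREFETCH to keep
 // two cache lines behind.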
15801
15802 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15803 const MachineBasicBlock *Header = ML->getHeader();
15804 if (Header->getAlignment() != PrefAlign)
15805 return Header->getAlignment(); // Already processed.
15806
15807 unsigned LoopSize = 0;
15808 for (const MachineBasicBlock *MBB : ML->blocks()) {
15809 // If an inner loop block is aligned, assume on average half of the alignment
15810 // size is added as nops.
15811 if (MBB != Header)
15812 LoopSize += MBB->getAlignment().value() / 2;
15813
15814 for (const MachineInstr &MI : *MBB) {
15815 LoopSize += TII->getInstSizeInBytes(MI);
15816 if (LoopSize > 192)
15817 return PrefAlign;
15818 }
15819 }
15820
15821 if (LoopSize <= 64)
15822 return PrefAlign;
15823
15824 if (LoopSize <= 128)
15825 return CacheLineAlign;
15826
15827 // If any of the parent loops is surrounded by prefetch instructions, do not
15828 // insert new ones for the inner loop, as that would reset the parent's settings.
15829 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15830 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15831 auto I = Exit->getFirstNonDebugInstr();
15832 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15833 return CacheLineAlign;
15834 }
15835 }
15836
15837 MachineBasicBlock *Pre = ML->getLoopPreheader();
15838 MachineBasicBlock *Exit = ML->getExitBlock();
15839
15840 if (Pre && Exit) {
15841 auto PreTerm = Pre->getFirstTerminator();
15842 if (PreTerm == Pre->begin() ||
15843 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15844 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15845 .addImm(1); // prefetch 2 lines behind PC
15846
15847 auto ExitHead = Exit->getFirstNonDebugInstr();
15848 if (ExitHead == Exit->end() ||
15849 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15850 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15851 .addImm(2); // prefetch 1 line behind PC
15852 }
15853
15854 return CacheLineAlign;
15855}
15856
15858static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15859 assert(N->getOpcode() == ISD::CopyFromReg);
15860 do {
15861 // Follow the chain until we find an INLINEASM node.
15862 N = N->getOperand(0).getNode();
15863 if (N->getOpcode() == ISD::INLINEASM ||
15864 N->getOpcode() == ISD::INLINEASM_BR)
15865 return true;
15866 } while (N->getOpcode() == ISD::CopyFromReg);
15867 return false;
15868}
15869
15870bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15871 FunctionLoweringInfo *FLI,
15872 UniformityInfo *UA) const {
15873 switch (N->getOpcode()) {
15874 case ISD::CopyFromReg: {
15875 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15876 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15877 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15878 Register Reg = R->getReg();
15879
15880 // FIXME: Why does this need to consider isLiveIn?
15881 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15882 return !TRI->isSGPRReg(MRI, Reg);
15883
15884 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15885 return UA->isDivergent(V);
15886
15887 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15888 return !TRI->isSGPRReg(MRI, Reg);
15889 }
15890 case ISD::LOAD: {
15891 const LoadSDNode *L = cast<LoadSDNode>(N);
15892 unsigned AS = L->getAddressSpace();
15893 // A flat load may access private memory.
15894 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15895 }
15896 case ISD::CALLSEQ_END:
15897 return true;
15898 case ISD::INTRINSIC_WO_CHAIN:
15899 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15900 case ISD::INTRINSIC_W_CHAIN:
15901 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15923 // Target-specific read-modify-write atomics are sources of divergence.
15924 return true;
15925 default:
15926 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
15927 // Generic read-modify-write atomics are sources of divergence.
15928 return A->readMem() && A->writeMem();
15929 }
15930 return false;
15931 }
15932}
15933
15934bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
15935 EVT VT) const {
15936 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
15937 case MVT::f32:
15938 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
15939 case MVT::f64:
15940 case MVT::f16:
15941 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
15942 default:
15943 return false;
15944 }
15945}
15946
15947bool SITargetLowering::denormalsEnabledForType(
15948 LLT Ty, const MachineFunction &MF) const {
15949 switch (Ty.getScalarSizeInBits()) {
15950 case 32:
15951 return !denormalModeIsFlushAllF32(MF);
15952 case 64:
15953 case 16:
15954 return !denormalModeIsFlushAllF64F16(MF);
15955 default:
15956 return false;
15957 }
15958}
15959
15960bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
15961 const SelectionDAG &DAG,
15962 bool SNaN,
15963 unsigned Depth) const {
15964 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
15965 const MachineFunction &MF = DAG.getMachineFunction();
15966 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15967
15968 if (Info->getMode().DX10Clamp)
15969 return true; // Clamped to 0.
15970 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
15971 }
15972
15973 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
15974 SNaN, Depth);
15975}
15976
15977// Global FP atomic instructions have a hardcoded FP mode and do not support
15978// FP32 denormals, and only support v2f16 denormals.
15979static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
15980 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
15981 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
15982 if (&Flt == &APFloat::IEEEsingle())
15983 return DenormMode == DenormalMode::getPreserveSign();
15984 return DenormMode == DenormalMode::getIEEE();
15985}
15986
15987// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
15988// floating point atomic instructions. May generate more efficient code,
15989// but may not respect rounding and denormal modes, and may give incorrect
15990// results for certain memory destinations.
15991bool unsafeFPAtomicsDisabled(Function *F) {
15992 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
15993 "true";
15994}
15995
15996TargetLowering::AtomicExpansionKind
15997SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
15998 unsigned AS = RMW->getPointerAddressSpace();
15999 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16000 return AtomicExpansionKind::NotAtomic;
16001
16002 auto SSID = RMW->getSyncScopeID();
16003
16004 auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
16005 OptimizationRemarkEmitter ORE(RMW->getFunction());
16006 LLVMContext &Ctx = RMW->getFunction()->getContext();
16007 SmallVector<StringRef> SSNs;
16008 Ctx.getSyncScopeNames(SSNs);
16009 auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
16010 ? "system"
16011 : SSNs[RMW->getSyncScopeID()];
16012 ORE.emit([&]() {
16013 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16014 << "Hardware instruction generated for atomic "
16015 << RMW->getOperationName(RMW->getOperation())
16016 << " operation at memory scope " << MemScope
16017 << " due to an unsafe request.";
16018 });
16019 return Kind;
16020 };
16021
16022 bool HasSystemScope =
16023 SSID == SyncScope::System ||
16024 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16025
16026 switch (RMW->getOperation()) {
16027 case AtomicRMWInst::FAdd: {
16028 Type *Ty = RMW->getType();
16029
16030 if (Ty->isHalfTy())
16032
16033 if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
16035
16038 Subtarget->hasAtomicFaddNoRtnInsts()) {
16039 if (Subtarget->hasGFX940Insts())
16041
16044
16045 // Always expand system scope fp atomics.
16046 if (HasSystemScope)
16048
16049 if ((AS == AMDGPUAS::GLOBAL_ADDRESS ||
16051 Ty->isFloatTy()) {
16052 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16053 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16054 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16055 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16056 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16057 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16058 }
16059
16060 // flat atomic fadd f32: gfx940, gfx11+.
16061 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16062 Subtarget->hasFlatAtomicFaddF32Inst())
16063 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16064
16065 // global and flat atomic fadd f64: gfx90a, gfx940.
16066 if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
16067 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16068
16069 // If it is in the flat address space and the type is float, we will try to
16070 // expand it if the target supports both global and LDS atomic fadd. The
16071 // reason is that the expansion emits a check of the address
16072 // space: if it is in the global address space, we emit the global atomic
16073 // fadd; if it is in the shared address space, we emit the LDS atomic fadd.
16074 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16075 Subtarget->hasLDSFPAtomicAdd()) {
16076 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16078 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16080 }
16081
16083 }
16084
16085 // DS FP atomics do respect the denormal mode, but the rounding mode is
16086 // fixed to round-to-nearest-even.
16087 // The only exception is DS_ADD_F64 which never flushes regardless of mode.
16088 if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
16089 if (!Ty->isDoubleTy())
16091
16094
16095 return RMW->getFunction()
16096 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16097 .getValueAsString() == "true"
16098 ? ReportUnsafeHWInst(AtomicExpansionKind::None)
16100 }
16101
16103 }
16106 case AtomicRMWInst::Min:
16107 case AtomicRMWInst::Max:
16109 case AtomicRMWInst::UMax: {
16112 if (RMW->getType()->isFloatTy() &&
16115
16116 // Always expand system scope min/max atomics.
16117 if (HasSystemScope)
16119 }
16120 break;
16121 }
16122 default:
16123 break;
16124 }
16125
16127}
16128
16134}
16135
16138 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16141}
16142
16148}
16149
16150const TargetRegisterClass *
16151SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16152 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16153 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16154 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16155 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16156 : &AMDGPU::SReg_32RegClass;
16157 if (!TRI->isSGPRClass(RC) && !isDivergent)
16158 return TRI->getEquivalentSGPRClass(RC);
16159 else if (TRI->isSGPRClass(RC) && isDivergent)
16160 return TRI->getEquivalentVGPRClass(RC);
16161
16162 return RC;
16163}
16164
16165// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16166// uniform values (as produced by the mask results of control flow intrinsics)
16167// used outside of divergent blocks. The phi users need to also be treated as
16168// always uniform.
16169//
16170// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16171static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16172 unsigned WaveSize) {
16173 // FIXME: We assume we never cast the mask results of a control flow
16174 // intrinsic.
16175 // Early exit if the type won't be consistent as a compile time hack.
16176 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16177 if (!IT || IT->getBitWidth() != WaveSize)
16178 return false;
16179
16180 if (!isa<Instruction>(V))
16181 return false;
16182 if (!Visited.insert(V).second)
16183 return false;
16184 bool Result = false;
16185 for (const auto *U : V->users()) {
16186 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16187 if (V == U->getOperand(1)) {
16188 switch (Intrinsic->getIntrinsicID()) {
16189 default:
16190 Result = false;
16191 break;
16192 case Intrinsic::amdgcn_if_break:
16193 case Intrinsic::amdgcn_if:
16194 case Intrinsic::amdgcn_else:
16195 Result = true;
16196 break;
16197 }
16198 }
16199 if (V == U->getOperand(0)) {
16200 switch (Intrinsic->getIntrinsicID()) {
16201 default:
16202 Result = false;
16203 break;
16204 case Intrinsic::amdgcn_end_cf:
16205 case Intrinsic::amdgcn_loop:
16206 Result = true;
16207 break;
16208 }
16209 }
16210 } else {
16211 Result = hasCFUser(U, Visited, WaveSize);
16212 }
16213 if (Result)
16214 break;
16215 }
16216 return Result;
16217}
16218
16219bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16220 const Value *V) const {
16221 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16222 if (CI->isInlineAsm()) {
16223 // FIXME: This cannot give a correct answer. This should only trigger in
16224 // the case where inline asm returns mixed SGPR and VGPR results, used
16225 // outside the defining block. We don't have a specific result to
16226 // consider, so this assumes if any value is SGPR, the overall register
16227 // also needs to be SGPR.
16228 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16229 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16230 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16231 for (auto &TC : TargetConstraints) {
16232 if (TC.Type == InlineAsm::isOutput) {
16233 ComputeConstraintToUse(TC, SDValue());
16234 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16235 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16236 if (RC && SIRI->isSGPRClass(RC))
16237 return true;
16238 }
16239 }
16240 }
16241 }
16242 SmallPtrSet<const Value *, 16> Visited;
16243 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16244}
16245
16246bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16247 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16248 for (; I != E; ++I) {
16249 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16250 if (getBasePtrIndex(M) == I.getOperandNo())
16251 return true;
16252 }
16253 }
16254 return false;
16255}
16256
16257bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16258 SDValue N1) const {
16259 if (!N0.hasOneUse())
16260 return false;
16261 // Take care of the opportunity to keep N0 uniform
16262 if (N0->isDivergent() || !N1->isDivergent())
16263 return true;
16264 // Check if we have a good chance to form the memory access pattern with the
16265 // base and offset
16266 return (DAG.isBaseWithConstantOffset(N0) &&
16267 hasMemSDNodeUser(*N0->use_begin()));
16268}
16269
16270bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16271 Register N0, Register N1) const {
16272 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16273}
16274
16277 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16279 if (I.getMetadata("amdgpu.noclobber"))
16280 Flags |= MONoClobber;
16281 if (I.getMetadata("amdgpu.last.use"))
16282 Flags |= MOLastUse;
16283 return Flags;
16284}
16285
16286bool SITargetLowering::checkForPhysRegDependency(
16287 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16288 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16289 if (User->getOpcode() != ISD::CopyToReg)
16290 return false;
16291 if (!Def->isMachineOpcode())
16292 return false;
16293 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16294 if (!MDef)
16295 return false;
16296
16297 unsigned ResNo = User->getOperand(Op).getResNo();
16298 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16299 return false;
16300 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16301 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16302 PhysReg = AMDGPU::SCC;
16303 const TargetRegisterClass *RC =
16304 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16305 Cost = RC->getCopyCost();
16306 return true;
16307 }
16308 return false;
16309}
16310
16311void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16312 assert(Subtarget->hasAtomicFaddInsts() &&
16313 "target should have atomic fadd instructions");
16314 assert(AI->getType()->isFloatTy() &&
16316 "generic atomicrmw expansion only supports FP32 operand in flat "
16317 "address space");
16319 "only fadd is supported for now");
16320
16321 // Given: atomicrmw fadd ptr %addr, float %val ordering
16322 //
16323 // With this expansion we produce the following code:
16324 // [...]
16325 // br label %atomicrmw.check.shared
16326 //
16327 // atomicrmw.check.shared:
16328 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16329 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16330 //
16331 // atomicrmw.shared:
16332 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16333 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16334 // float %val ordering
16335 // br label %atomicrmw.phi
16336 //
16337 // atomicrmw.check.private:
16338 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16339 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16340 //
16341 // atomicrmw.private:
16342 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16343 // %loaded.private = load float, ptr addrspace(5) %cast.private
16344 // %val.new = fadd float %loaded.private, %val
16345 // store float %val.new, ptr addrspace(5) %cast.private
16346 // br label %atomicrmw.phi
16347 //
16348 // atomicrmw.global:
16349 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16350 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16351 // float %val ordering
16352 // br label %atomicrmw.phi
16353 //
16354 // atomicrmw.phi:
16355 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16356 // [ %loaded.private, %atomicrmw.private ],
16357 // [ %loaded.global, %atomicrmw.global ]
16358 // br label %atomicrmw.end
16359 //
16360 // atomicrmw.end:
16361 // [...]
16362
16363 IRBuilder<> Builder(AI);
16364 LLVMContext &Ctx = Builder.getContext();
16365
16366 BasicBlock *BB = Builder.GetInsertBlock();
16367 Function *F = BB->getParent();
16368 BasicBlock *ExitBB =
16369 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16370 BasicBlock *CheckSharedBB =
16371 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16372 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16373 BasicBlock *CheckPrivateBB =
16374 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16375 BasicBlock *PrivateBB =
16376 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16377 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16378 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16379
16380 Value *Val = AI->getValOperand();
16381 Type *ValTy = Val->getType();
16382 Value *Addr = AI->getPointerOperand();
16383
16384 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16385 Value *Val) -> Value * {
16386 AtomicRMWInst *OldVal =
16387 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16388 AI->getOrdering(), AI->getSyncScopeID());
16389 SmallVector<std::pair<unsigned, MDNode *>> MDs;
16390 AI->getAllMetadata(MDs);
16391 for (auto &P : MDs)
16392 OldVal->setMetadata(P.first, P.second);
16393 return OldVal;
16394 };
16395
16396 std::prev(BB->end())->eraseFromParent();
16397 Builder.SetInsertPoint(BB);
16398 Builder.CreateBr(CheckSharedBB);
16399
16400 Builder.SetInsertPoint(CheckSharedBB);
16401 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16402 {Addr}, nullptr, "is.shared");
16403 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16404
16405 Builder.SetInsertPoint(SharedBB);
16406 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16408 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16409 Builder.CreateBr(PhiBB);
16410
16411 Builder.SetInsertPoint(CheckPrivateBB);
16412 CallInst *IsPrivate = Builder.CreateIntrinsic(
16413 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16414 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16415
16416 Builder.SetInsertPoint(PrivateBB);
16417 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16419 Value *LoadedPrivate =
16420 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16421 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16422 Builder.CreateStore(NewVal, CastToPrivate);
16423 Builder.CreateBr(PhiBB);
16424
16425 Builder.SetInsertPoint(GlobalBB);
16426 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16428 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16429 Builder.CreateBr(PhiBB);
16430
16431 Builder.SetInsertPoint(PhiBB);
16432 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16433 Loaded->addIncoming(LoadedShared, SharedBB);
16434 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16435 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16436 Builder.CreateBr(ExitBB);
16437
16438 AI->replaceAllUsesWith(Loaded);
16439 AI->eraseFromParent();
16440}
16441
16442LoadInst *
16443SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16444 IRBuilder<> Builder(AI);
16445 auto Order = AI->getOrdering();
16446
16447 // The optimization removes the store aspect of the atomicrmw. Therefore, the
16448 // cache must be flushed if the atomic ordering had release semantics. This is
16449 // not necessarily a fence; a release fence just happens to perform that flush.
16450 // Avoid replacing an atomicrmw that has release semantics.
16451 if (isReleaseOrStronger(Order))
16452 return nullptr;
16453
16454 LoadInst *LI = Builder.CreateAlignedLoad(
16455 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16456 LI->setAtomic(Order, AI->getSyncScopeID());
16457 LI->copyMetadata(*AI);
16458 LI->takeName(AI);
16459 AI->replaceAllUsesWith(LI);
16460 AI->eraseFromParent();
16461 return LI;
16462}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:203
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1174
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1171
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether the operand is known never to be any NaN; if SNaN is true, whether it is known never to be a signaling NaN.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:988
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
bool isNegative() const
Definition: APFloat.h:1295
APInt bitcastToAPInt() const
Definition: APFloat.h:1210
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
bool isInfinity() const
Definition: APFloat.h:1292
Class for arbitrary precision integers.
Definition: APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
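A small, self-contained illustration of the APInt helpers listed above (widths and values are arbitrary):

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintExamples() {
  // 32-bit value with only the top 16 bits set: 0xFFFF0000.
  APInt HighMask = APInt::getHighBitsSet(32, 16);

  // Bits [8, 12) set: 0x00000F00, so 8 trailing zero bits.
  APInt Block = APInt::getBitsSet(32, 8, 12);
  unsigned TZ = Block.countr_zero(); // 8

  // Force the top 4 bits of an existing value to 1.
  APInt V(32, 0x1234);
  V.setHighBits(4);

  // Signed vs. unsigned comparisons differ once the sign bit is set.
  bool SignedGE = HighMask.sge(Block);   // false: 0xFFFF0000 is negative as i32
  bool UnsignedGE = HighMask.uge(Block); // true
  (void)TZ; (void)SignedGE; (void)UnsignedGE;
}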
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:684
An instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v). minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v). maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
Value * getPointerOperand()
Definition: Instructions.h:910
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:918
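The AtomicRMWInst accessors above are what an IR-level expansion hook typically inspects when deciding how to handle an atomic. A hedged sketch; the policy (expand FP atomics only in a caller-supplied address space) is invented for illustration and is not the logic implemented in this file:

#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool wantsExpansion(const AtomicRMWInst &RMW, unsigned FlatAS) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // Only floating-point RMW operations in the given address space.
    return RMW.getPointerAddressSpace() == FlatAS;
  default:
    return false;
  }
}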
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:442
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:198
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:557
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:205
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
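A generic sketch of how CCState and CCValAssign are commonly driven during argument lowering. CC_Placeholder stands in for a TableGen-generated CCAssignFn, and ArgLocs is assumed to be the same vector the CCState was constructed with; none of this is the target's actual assignment logic:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Stub assignment function. A real CCAssignFn records a register or stack
// location in State before returning false ("handled").
static bool CC_Placeholder(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
  return false;
}

static void assignFormals(CCState &CCInfo,
                          const SmallVectorImpl<ISD::InputArg> &Ins,
                          const SmallVectorImpl<CCValAssign> &ArgLocs) {
  CCInfo.AnalyzeFormalArguments(Ins, CC_Placeholder);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc()) {
      Register Reg = VA.getLocReg();      // argument arrives in a register
      (void)Reg;
    } else if (VA.isMemLoc()) {
      int64_t Off = VA.getLocMemOffset(); // argument lives on the stack
      (void)Off;
    }
  }
}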
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1703
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1789
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1648
unsigned arg_size() const
Definition: InstrTypes.h:1646
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:965
@ ICMP_NE
not equal
Definition: InstrTypes.h:987
bool isSigned() const
Definition: InstrTypes.h:1226
bool isFPPredicate() const
Definition: InstrTypes.h:1083
bool isIntPredicate() const
Definition: InstrTypes.h:1084
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:204
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:200
iterator_range< arg_iterator > args()
Definition: Function.h:837
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:695
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:342
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:732
bool hasPrefetch() const
Definition: GCNSubtarget.h:887
bool hasD16Images() const
Definition: GCNSubtarget.h:682
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:462
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:453
bool hasDot7Insts() const
Definition: GCNSubtarget.h:781
bool hasApertureRegs() const
Definition: GCNSubtarget.h:582
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:610
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:751
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:396
bool hasMAIInsts() const
Definition: GCNSubtarget.h:801
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:662
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:512
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:570
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:251
bool hasDot1Insts() const
Definition: GCNSubtarget.h:757
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:823
Align getStackAlignment() const
Definition: GCNSubtarget.h:900
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:440
bool enableFlatScratch() const
Definition: GCNSubtarget.h:635
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:606
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:446
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:839
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:263
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:727
bool useDS128() const
Definition: GCNSubtarget.h:522
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:442
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:255
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:574
bool hasLDSFPAtomicAdd() const
Definition: GCNSubtarget.h:958
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:412
bool hasIntClamp() const
Definition: GCNSubtarget.h:342
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:988
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:362
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:586
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:614
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:913
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:716
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:321
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:867
bool hasFFBL() const
Definition: GCNSubtarget.h:400
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:930
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:544
bool hasMed3_16() const
Definition: GCNSubtarget.h:408
bool hasMovrel() const
Definition: GCNSubtarget.h:934
bool hasBFI() const
Definition: GCNSubtarget.h:388
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:562
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:329
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:507
bool hasFFBH() const
Definition: GCNSubtarget.h:404
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:819
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:825
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:948
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:532
bool hasDot8Insts() const
Definition: GCNSubtarget.h:785
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:527
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:516
Generation getGeneration() const
Definition: GCNSubtarget.h:302
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:714
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:718
bool hasAddr64() const
Definition: GCNSubtarget.h:366
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:416
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:710
bool hasFractBug() const
Definition: GCNSubtarget.h:380
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:384
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:697
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
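These subtarget predicates are how lowering and combining decisions are typically gated on hardware features; a trivial, invented example of such a guard:

#include "GCNSubtarget.h"
using namespace llvm;

// Illustrative policy only: prefer a packed/dot-product path when the
// relevant instructions exist, otherwise fall back to a scalar sequence.
static bool preferDotLowering(const GCNSubtarget &ST) {
  return ST.hasVOP3PInsts() && (ST.hasDot7Insts() || ST.hasDot8Insts());
}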
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:510
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1806
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1527
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2380
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1114
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1789
LLVMContext & getContext() const
Definition: IRBuilder.h:176
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1802
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1853
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1108
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2115
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
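A compact sketch of the IRBuilder calls listed above: read-modify-write a float through a pointer, first as separate instructions and then as a single atomicrmw. The alignment and ordering are placeholder choices, not values taken from this file:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void emitFAddExamples(IRBuilder<> &B, Value *Ptr, Value *Val) {
  Type *F32Ty = B.getFloatTy();

  // Non-atomic form: load, add, store.
  Value *Old = B.CreateAlignedLoad(F32Ty, Ptr, MaybeAlign(4), "old");
  Value *Sum = B.CreateFAdd(Old, Val, "sum");
  B.CreateStore(Sum, Ptr);

  // The same update expressed as one atomic instruction.
  B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(4),
                    AtomicOrdering::Monotonic);
}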
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:340
const BasicBlock * getParent() const
Definition: Instruction.h:151
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:84
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1633
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:376
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
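For reference, the LLT factories above in use (the address space and widths are arbitrary; header path as of this LLVM version):

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);                                    // 32-bit scalar
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64); // 64-bit pointer
  (void)S32.getSizeInBits();                                    // 32 bits
  (void)P1.getSizeInBits();                                     // 64 bits
}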
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
bool isCompare() const
Return true if this instruction is a comparison.
Definition: MCInstrDesc.h:341
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Definition: MCInstrDesc.cpp:32
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
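A couple of the MVT helpers above applied to concrete types (illustrative only):

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  MVT I16 = MVT::getIntegerVT(16);                 // MVT::i16
  bool Fits = I16.bitsLE(MVT::i32);                // true: 16 <= 32 bits
  MVT Elt = V4I32.getScalarType();                 // MVT::i32
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  (void)Fits; (void)Elt; (void)NumElts;
}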
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
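These operand helpers are normally reached through BuildMI; a hedged sketch in which the instruction description, registers, and immediate are all caller-supplied stand-ins:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Build "DstReg = <Desc> SrcReg, 0" immediately before the iterator I.
static MachineInstr *emitExample(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 const DebugLoc &DL, const MCInstrDesc &Desc,
                                 Register DstReg, Register SrcReg) {
  return BuildMI(MBB, I, DL, Desc, DstReg)
      .addReg(SrcReg)
      .addImm(0);
}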
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:554
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
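A sketch of constructing a MachineMemOperand carrying the flags above, using MachineFunction::getMachineMemOperand as declared earlier in this listing (the memory type and alignment are placeholders):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo) {
  MachineMemOperand::Flags F = MachineMemOperand::MOLoad |
                               MachineMemOperand::MODereferenceable |
                               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, F, LLT::scalar(32), Align(4));
}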
MachineOperand class - Representation of each machine instruction operand.
void setImplicit(bool Val=true)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
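A small sketch of the SDValue/SDNode accessors above, in the style of a DAG-combine pattern match (purely illustrative):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Match a single-use "(add x, x)" and return x, otherwise an empty SDValue.
static SDValue matchDoubledValue(SDValue V) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return SDValue();
  SDValue LHS = V.getOperand(0), RHS = V.getOperand(1);
  if (LHS != RHS)
    return SDValue();
  return LHS;
}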
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void AddIMGInit(MachineInstr &MI) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether the operand is known never to be any NaN; if SNaN is true, whether it is known never to be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
Definition: SelectionDAG.h:470
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
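As an illustration of the SetCC/Select helpers above, the hypothetical routine below (not taken from this file; the function name and the i1 condition type are assumptions) clamps a negative value to zero inside a DAG lowering callback:
// Hypothetical sketch: return Val, or zero when Val is negative. Assumes it is
// called from a DAG lowering routine that already has a live SelectionDAG.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
static SDValue clampNegativeToZero(SelectionDAG &DAG, const SDLoc &DL, SDValue Val) {
  EVT VT = Val.getValueType();
  // i1 condition type chosen for illustration; real targets query their CC result type.
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), 1);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  // Cond = (Val < 0), built from an ISD::CondCode via the getSetCC helper.
  SDValue Cond = DAG.getSetCC(DL, CCVT, Val, Zero, ISD::SETLT);
  // Result = Cond ? 0 : Val.
  return DAG.getSelect(DL, VT, Cond, Zero, Val);
}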
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
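A minimal sketch of the chained load/store helpers listed above; the pointer-info and alignment values are placeholders, not values taken from this file:
// Hypothetical sketch: load an i32 from SrcPtr and store it to DstPtr,
// threading the token chain through both memory operations.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
static SDValue copyOneWord(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                           SDValue SrcPtr, SDValue DstPtr) {
  // The load produces both a value (result 0) and an updated chain (result 1).
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, SrcPtr,
                             MachinePointerInfo(), Align(4));
  // Store the loaded value, consuming the load's output chain.
  return DAG.getStore(Load.getValue(1), DL, Load, DstPtr,
                      MachinePointerInfo(), Align(4));
}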
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:427
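A small usage sketch of SmallPtrSet::insert; the Visited name and the Value pointer element type are illustrative only:
// insert() returns {iterator, bool}; the bool is true only when the pointer
// was not already in the set.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"
using namespace llvm;
static bool markVisited(SmallPtrSetImpl<const Value *> &Visited, const Value *V) {
  return Visited.insert(V).second;
}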
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
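An illustrative use of the SmallVector operations listed above (push_back, append, resize); the element type and sizes are arbitrary:
#include "llvm/ADT/SmallVector.h"
#include <iterator>
using namespace llvm;
static SmallVector<int, 8> collectExample() {
  SmallVector<int, 8> Vals;                       // inline storage for 8 elements, heap beyond that
  Vals.push_back(1);
  Vals.push_back(2);
  int More[] = {3, 4, 5};
  Vals.append(std::begin(More), std::end(More));  // append a [first, last) range
  Vals.resize(8);                                 // value-initializes the new tail elements
  return Vals;
}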
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:849
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:271
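A sketch using the StringRef predicates above; the sample strings are illustrative, not tied to this file:
// Prefix/suffix checks operate on the stored pointer/length pair and do not
// require a null-terminated string.
#include "llvm/ADT/StringRef.h"
using namespace llvm;
static bool looksLikeAMDGCNTriple(StringRef S) {
  // e.g. "amdgcn-amd-amdhsa" -> true
  return S.starts_with("amdgcn") && !S.ends_with("-unknown");
}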
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
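An illustrative StringSwitch chain; the constraint letters and return codes below are generic examples, not the ones this file actually parses:
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
static unsigned classifyConstraintExample(StringRef C) {
  return StringSwitch<unsigned>(C)
      .Case("s", 1)   // scalar register
      .Case("v", 2)   // vector register
      .Case("a", 3)   // accumulator-style register
      .Default(0);    // unknown constraint
}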
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
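The configuration hooks above are normally wired up in a TargetLowering subclass constructor. The fragment below is a hypothetical sketch of that pattern; the chosen operations, types, and legalize actions are examples only, not the choices made by this file:
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;
class ExampleTargetLowering : public TargetLowering {
public:
  explicit ExampleTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
    setOperationAction(ISD::FSIN, MVT::f32, Expand);   // expand f32 fsin
    setTruncStoreAction(MVT::i64, MVT::i1, Expand);    // no i64 -> i1 truncating stores
    AddPromotedToType(ISD::LOAD, MVT::i1, MVT::i32);   // perform promoted i1 loads as i32
    setTargetDAGCombine(ISD::ADD);                     // request combine callbacks for ADD
  }
};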
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:370
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
constexpr bool isZero() const
Definition: TypeSize.h:156
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:268
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1122
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:998
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1269
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1271
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1241
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1272
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:979
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1031
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1254
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:913
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:480
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1267
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1268
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1400
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1274
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1188
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1047
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1221
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:988
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1077
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1270
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:507
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1237
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:208
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1016
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:993
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1265
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values,...
Definition: ISDOpcodes.h:978
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1211
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1248
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1273
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1041
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1097
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:922
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1279
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1263
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:984
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1264
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1182
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1208
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1262
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:944
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:414
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:907
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1094
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1070
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1278
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1523
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1503
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1017
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1522
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:219
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2082
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
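A self-contained sketch of several of the MathExtras/bit helpers listed nearby; the input values are arbitrary:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;
static void mathExtrasExamples() {
  assert(isPowerOf2_32(64) && Log2_32(64) == 6);
  assert(divideCeil(10, 4) == 3);                  // ceil(10 / 4)
  assert(PowerOf2Ceil(33) == 64);                  // next power of two >= 33
  assert(countr_zero(0x8u) == 3 && countl_zero(0x8u) == 28);
  uint64_t V = 0x1234567890abcdefULL;
  assert(Hi_32(V) == 0x12345678u && Lo_32(V) == 0x90abcdefu);
}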
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:212
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:428
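A short sketch of the alignment helpers above (alignTo, alignDown, commonAlignment); the sizes and offsets are arbitrary:
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;
static void alignmentExamples() {
  Align A(16);
  assert(alignTo(10, A) == 16);                        // round 10 up to a multiple of 16
  assert(alignDown(30, 16) == 16);                     // round 30 down to a multiple of 16
  assert(commonAlignment(Align(16), 8) == Align(8));   // alignment still valid at offset 8
}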
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
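A standalone KnownBits sketch using the queries above: an 8-bit value whose top nibble is proven zero. The concrete bit pattern is illustrative:
#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;
static void knownBitsExample() {
  KnownBits Known(8);
  assert(Known.isUnknown());                   // nothing known yet
  Known.Zero.setHighBits(4);                   // top four bits proven zero
  assert(Known.countMinLeadingZeros() == 4);
  assert(Known.countMaxActiveBits() == 4);     // at most the low four bits can be set
  Known.resetAll();                            // back to the fully unknown state
  assert(Known.isUnknown());
}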
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals