1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
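// Usage sketch (illustrative, not verbatim from this file; CCInfo assumed in
// scope): a helper like findFirstFreeSGPR is typically consumed when reserving
// a system SGPR during argument lowering, with the CCState tracking which
// registers the calling-convention analysis has already handed out.
#if 0
  unsigned FreeSGPR = findFirstFreeSGPR(CCInfo);
  CCInfo.AllocateReg(FreeSGPR); // mark it allocated so later scans skip it
#endif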
83
85 const GCNSubtarget &STI)
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations on these types are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
228 // TODO: Could make these legal
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237
239 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
240 }
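// For reference, a minimal sketch of the Promote/AddPromotedToType pairing
// that the loop inside the block above applies to each listed opcode; the
// opcode shown here is only an example. Legalization then extends the bf16
// operands to f32 and truncates the result back.
#if 0
  setOperationAction(ISD::FADD, MVT::bf16, Promote);
  AddPromotedToType(ISD::FADD, MVT::bf16, MVT::f32);
#endif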
241
242 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
243 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
248 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
253 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
258
259 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
260 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
261 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
264 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
265 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
266
267 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
268
272 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
273
274 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
275
277 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
278
280 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
281 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
282
284 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
285 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
286 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 Expand);
289 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
290 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
291 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
292 Expand);
293
295 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
296 MVT::v3i16, MVT::v4i16, MVT::Other},
297 Custom);
298
301 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
302
304
306
308 Expand);
309
310#if 0
312#endif
313
314 // We only support LOAD/STORE and vector manipulation ops for vectors
315 // with > 4 elements.
316 for (MVT VT :
317 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
318 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
319 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
320 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
321 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
322 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
323 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
324 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
325 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
326 switch (Op) {
327 case ISD::LOAD:
328 case ISD::STORE:
330 case ISD::BITCAST:
331 case ISD::UNDEF:
335 case ISD::IS_FPCLASS:
336 break;
341 break;
342 default:
344 break;
345 }
346 }
347 }
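// The switch above keeps a small allowlist of operations (load/store, bitcast,
// undef, the insert/extract family, ...) at their default action and marks
// everything else Expand for these wide vector types. A sketch of the default
// arm, following the pattern used throughout this constructor (a hedged
// reconstruction, not a verbatim copy):
#if 0
      default:
        setOperationAction(Op, VT, Expand);
        break;
#endif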
348
350
351 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
352 // is expanded to avoid having two separate loops in case the index is a VGPR.
353
354 // Most operations are naturally 32-bit vector operations. We only support
355 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
356 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
358 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
368 }
369
370 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
382 }
383
384 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
396 }
397
398 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
410 }
411
412 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
424 }
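// All five loops above follow the same recipe: a 64-bit-element vector is
// promoted to the 32-bit-element vector of equal total width, so only 32-bit
// vector operations survive to instruction selection. A quick sanity sketch of
// that size relationship (illustrative only):
#if 0
  MVT Wide = MVT::v2i64;
  MVT Promoted = MVT::v4i32;
  assert(Promoted.getSizeInBits() == Wide.getSizeInBits()); // both 128 bits
#endif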
425
427 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
428 Expand);
429
430 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
431 Custom);
432
433 // Avoid stack access for these.
434 // TODO: Generalize to more vector types.
436 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
437 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
438 Custom);
439
440 // Deal with vec3 vector operations when widened to vec4.
442 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
443
444 // Deal with vec5/6/7 vector operations when widened to vec8.
446 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
452 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
453 // and output demarshalling.
454 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
455
456 // We can't return success/failure, only the old value,
457 // let LLVM add the comparison
459 Expand);
460
461 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
462
463 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
464
465 // FIXME: This should be narrowed to i32, but that only happens if i64 is
466 // illegal.
467 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
468 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
469
470 // On SI this is s_memtime and s_memrealtime on VI.
472
473 if (Subtarget->hasSMemRealTime() ||
477
478 if (Subtarget->has16BitInsts()) {
481 } else {
483 }
484
485 if (Subtarget->hasMadMacF32Insts())
487
488 if (!Subtarget->hasBFI())
489 // fcopysign can be done in a single instruction with BFI.
490 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
491
492 if (!Subtarget->hasBCNT(32))
494
495 if (!Subtarget->hasBCNT(64))
497
498 if (Subtarget->hasFFBH())
500
501 if (Subtarget->hasFFBL())
503
504 // We only really have 32-bit BFE instructions (and 16-bit on VI).
505 //
506 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
507 // effort to match them now. We want this to be false for i64 cases when the
508 // extraction isn't restricted to the upper or lower half. Ideally we would
509 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
510 // span the midpoint are probably relatively rare, so don't worry about them
511 // for now.
512 if (Subtarget->hasBFE())
514
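// For reference, the 32-bit BFE instructions mentioned above perform a
// bitfield extract: shift right by an offset, then mask to a width (both taken
// from the low bits of their operands, with width 0 yielding 0). A
// self-contained sketch of the unsigned form (illustrative, not hardware
// documentation):
#if 0
  auto BFE_U32 = [](uint32_t Src, uint32_t Offset, uint32_t Width) -> uint32_t {
    Offset &= 31;
    Width &= 31;
    return Width == 0 ? 0 : (Src >> Offset) & ((1u << Width) - 1);
  };
  // e.g. BFE_U32(0x12345678, 8, 8) == 0x56
#endif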
515 // Clamp modifier on add/sub
516 if (Subtarget->hasIntClamp())
518
519 if (Subtarget->hasAddNoCarry())
520 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
521 Legal);
522
523 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
524 Custom);
525
526 // These are really only legal for ieee_mode functions. We should be avoiding
527 // them for functions that don't have ieee_mode enabled, so just say they are
528 // legal.
530 {MVT::f32, MVT::f64}, Legal);
531
532 if (Subtarget->haveRoundOpsF64())
534 Legal);
535 else
537 MVT::f64, Custom);
538
540 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
541 Legal);
542 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
543
546
547 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
548 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
549
550 // Custom lower these because we can't specify a rule based on an illegal
551 // source bf16.
554
555 if (Subtarget->has16BitInsts()) {
558 MVT::i16, Legal);
559
560 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
561
563 MVT::i16, Expand);
564
568 ISD::CTPOP},
569 MVT::i16, Promote);
570
572
573 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
574
576 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
578 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
579
583
585
586 // F16 - Constant Actions.
589
590 // F16 - Load/Store Actions.
592 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
594 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
595
596 // BF16 - Load/Store Actions.
598 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
600 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
601
602 // F16 - VOP1 Actions.
605 MVT::f16, Custom);
606
609
610 // F16 - VOP2 Actions.
611 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
612 Expand);
616
617 // F16 - VOP3 Actions.
619 if (STI.hasMadF16())
621
622 for (MVT VT :
623 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
624 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
625 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
626 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
627 switch (Op) {
628 case ISD::LOAD:
629 case ISD::STORE:
631 case ISD::BITCAST:
632 case ISD::UNDEF:
638 case ISD::IS_FPCLASS:
639 break;
642 break;
643 default:
645 break;
646 }
647 }
648 }
649
650 // v_perm_b32 can handle either of these.
651 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
653
654 // XXX - Do these do anything? Vector constants turn into build_vector.
655 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
656
657 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
658 Legal);
659
661 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
664
666 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
668 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
669
670 setOperationAction(ISD::AND, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
672 setOperationAction(ISD::OR, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
675 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
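// Because the v2i16 bitwise operations are promoted to i32 just above, an
// IR-level 'and/or/xor <2 x i16>' becomes a single 32-bit operation on the
// packed register; bitwise ops never carry across the 16-bit lane boundary.
// A scalar sketch of why that is safe (illustrative):
#if 0
  auto PackedAnd = [](uint16_t ALo, uint16_t AHi, uint16_t BLo, uint16_t BHi) {
    uint32_t A = uint32_t(ALo) | (uint32_t(AHi) << 16);
    uint32_t B = uint32_t(BLo) | (uint32_t(BHi) << 16);
    return A & B; // lane 0 in bits [15:0], lane 1 in bits [31:16]
  };
#endif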
676
678 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
680 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
681 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
682 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
683
685 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
687 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
689 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
690
692 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
694 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
695 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
701 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
702
704 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
706 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
708 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
709
710 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
712 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
714 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
720 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
721 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
723
724 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
726 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
728 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
730
732 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
734 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
735 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
737
739 MVT::v2i32, Expand);
741
743 MVT::v4i32, Expand);
744
746 MVT::v8i32, Expand);
747
748 if (!Subtarget->hasVOP3PInsts())
750 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
751
752 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
753 // This isn't really legal, but this avoids the legalizer unrolling it (and
754 // allows matching fneg (fabs x) patterns)
755 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
756
759
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Custom);
763
765 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
766 Expand);
767
768 for (MVT Vec16 :
769 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
770 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
773 Vec16, Custom);
775 }
776 }
777
778 if (Subtarget->hasVOP3PInsts()) {
782 MVT::v2i16, Legal);
783
786 MVT::v2f16, Legal);
787
788 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
789 Custom);
790
792 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
793 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
794 Custom);
795
796 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
797 // Split vector operations.
802 VT, Custom);
803
804 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
805 // Split vector operations.
807 VT, Custom);
808
809 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
810 Custom);
811
812 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
813 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
814 Custom);
815
816 if (Subtarget->hasPackedFP32Ops()) {
818 MVT::v2f32, Legal);
820 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
821 Custom);
822 }
823 }
824
826
827 if (Subtarget->has16BitInsts()) {
829 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
831 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
832 } else {
833 // Legalization hack.
834 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
835
837 }
838
840 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
841 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
842 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
843 MVT::v32f16, MVT::v32bf16},
844 Custom);
845
847
848 if (Subtarget->hasScalarSMulU64())
850
851 if (Subtarget->hasMad64_32())
853
854 if (Subtarget->hasPrefetch())
856
857 if (Subtarget->hasIEEEMinMax())
859 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
868 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
869 MVT::i16, MVT::i8, MVT::i128},
870 Custom);
871
873 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
874 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
875 MVT::i8, MVT::i128},
876 Custom);
877
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
886
888
891 ISD::SUB,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
924
925 // All memory operations. Some folding on the pointer operand is done to help
926 // matching the constant offsets in the addressing modes.
949
950 // FIXME: In other contexts we pretend this is a per-function property.
952
954}
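// Usage note (hedged sketch, not code from this file): once the constructor
// has run, the tables populated above are observable through the generic
// TargetLowering query interface, e.g.:
#if 0
  // BITREVERSE on i32/i64 was declared Legal unconditionally above.
  // TLI: a SITargetLowering instance, assumed to be in scope.
  assert(TLI.getOperationAction(ISD::BITREVERSE, MVT::i32) ==
         TargetLowering::Legal);
#endif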
955
957 return Subtarget;
958}
959
960//===----------------------------------------------------------------------===//
961// TargetLowering queries
962//===----------------------------------------------------------------------===//
963
964// v_mad_mix* support a conversion from f16 to f32.
965//
966 // There is only one special case, when denormals are enabled, that we don't
967 // currently handle where this would also be OK to use.
968bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
969 EVT DestVT, EVT SrcVT) const {
970 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
971 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
972 DestVT.getScalarType() == MVT::f32 &&
973 SrcVT.getScalarType() == MVT::f16 &&
974 // TODO: This probably only requires no input flushing?
976}
977
979 LLT DestTy, LLT SrcTy) const {
980 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
982 DestTy.getScalarSizeInBits() == 32 &&
983 SrcTy.getScalarSizeInBits() == 16 &&
984 // TODO: This probably only requires no input flushing?
986}
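// Worked example (illustrative): with fma-mix instructions available and
// denormals flushed, the combiner can ask whether the f16->f32 extension of an
// operand may be folded into an f32 FMA, which is what selects v_fma_mix_f32
// instead of explicit conversions. A hedged sketch of that query (DAG and TLI
// assumed to be in scope):
#if 0
  bool Foldable = TLI.isFPExtFoldable(DAG, ISD::FMA, MVT::f32, MVT::f16);
#endif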
987
989 // SI has some legal vector types, but no legal vector operations. Say no
990 // shuffles are legal in order to prefer scalarizing some vector operations.
991 return false;
992}
993
996 EVT VT) const {
999
1000 if (VT.isVector()) {
1001 EVT ScalarVT = VT.getScalarType();
1002 unsigned Size = ScalarVT.getSizeInBits();
1003 if (Size == 16) {
1004 if (Subtarget->has16BitInsts()) {
1005 if (VT.isInteger())
1006 return MVT::v2i16;
1007 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1008 }
1009 return VT.isInteger() ? MVT::i32 : MVT::f32;
1010 }
1011
1012 if (Size < 16)
1013 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1014 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1015 }
1016
1017 if (VT.getSizeInBits() > 32)
1018 return MVT::i32;
1019
1021}
1022
1025 EVT VT) const {
1028
1029 if (VT.isVector()) {
1030 unsigned NumElts = VT.getVectorNumElements();
1031 EVT ScalarVT = VT.getScalarType();
1032 unsigned Size = ScalarVT.getSizeInBits();
1033
1034 // FIXME: Should probably promote 8-bit vectors to i16.
1035 if (Size == 16 && Subtarget->has16BitInsts())
1036 return (NumElts + 1) / 2;
1037
1038 if (Size <= 32)
1039 return NumElts;
1040
1041 if (Size > 32)
1042 return NumElts * ((Size + 31) / 32);
1043 } else if (VT.getSizeInBits() > 32)
1044 return (VT.getSizeInBits() + 31) / 32;
1045
1047}
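// Worked examples of the count above (illustrative, for a non-kernel calling
// convention on a subtarget with 16-bit instructions): v3f16 packs two halves
// per 32-bit register, so (3 + 1) / 2 = 2 registers; v3i64 splits into 32-bit
// pieces, so 3 * ((64 + 31) / 32) = 6 registers.
#if 0
  // Sketch; Ctx and CC assumed to be in scope.
  assert(TLI.getNumRegistersForCallingConv(Ctx, CC, MVT::v3f16) == 2);
#endif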
1048
1050 LLVMContext &Context, CallingConv::ID CC,
1051 EVT VT, EVT &IntermediateVT,
1052 unsigned &NumIntermediates, MVT &RegisterVT) const {
1053 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1054 unsigned NumElts = VT.getVectorNumElements();
1055 EVT ScalarVT = VT.getScalarType();
1056 unsigned Size = ScalarVT.getSizeInBits();
1057 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1058 // support, but unless we can properly handle 3-vectors, it will still be
1059 // inconsistent.
1060 if (Size == 16 && Subtarget->has16BitInsts()) {
1061 if (ScalarVT == MVT::bf16) {
1062 RegisterVT = MVT::i32;
1063 IntermediateVT = MVT::v2bf16;
1064 } else {
1065 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1066 IntermediateVT = RegisterVT;
1067 }
1068 NumIntermediates = (NumElts + 1) / 2;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size == 32) {
1073 RegisterVT = ScalarVT.getSimpleVT();
1074 IntermediateVT = RegisterVT;
1075 NumIntermediates = NumElts;
1076 return NumIntermediates;
1077 }
1078
1079 if (Size < 16 && Subtarget->has16BitInsts()) {
1080 // FIXME: Should probably form v2i16 pieces
1081 RegisterVT = MVT::i16;
1082 IntermediateVT = ScalarVT;
1083 NumIntermediates = NumElts;
1084 return NumIntermediates;
1085 }
1086
1087
1088 if (Size != 16 && Size <= 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = ScalarVT;
1091 NumIntermediates = NumElts;
1092 return NumIntermediates;
1093 }
1094
1095 if (Size > 32) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = RegisterVT;
1098 NumIntermediates = NumElts * ((Size + 31) / 32);
1099 return NumIntermediates;
1100 }
1101 }
1102
1104 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1105}
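// Worked example (illustrative, same assumptions as above): for v5f16 the
// 16-bit branch reports RegisterVT = v2f16, IntermediateVT = v2f16 and
// NumIntermediates = (5 + 1) / 2 = 3, i.e. the value travels in three packed
// 32-bit registers, with one lane of the last register left undefined.
#if 0
  // Sketch; Ctx and CC assumed to be in scope.
  MVT RegVT;
  EVT IntermediateVT;
  unsigned NumIntermediates = 0;
  TLI.getVectorTypeBreakdownForCallingConv(Ctx, CC, MVT::v5f16, IntermediateVT,
                                           NumIntermediates, RegVT);
#endif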
1106
1107static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1108 assert(MaxNumLanes != 0);
1109
1110 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1111 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1112 return EVT::getVectorVT(Ty->getContext(),
1113 EVT::getEVT(VT->getElementType()),
1114 NumElts);
1115 }
1116
1117 return EVT::getEVT(Ty);
1118}
1119
1120// Peek through TFE struct returns to only use the data size.
1121static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1122 auto *ST = dyn_cast<StructType>(Ty);
1123 if (!ST)
1124 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1125
1126 // TFE intrinsics return an aggregate type.
1127 assert(ST->getNumContainedTypes() == 2 &&
1128 ST->getContainedType(1)->isIntegerTy(32));
1129 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1130}
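// Worked example (illustrative): a TFE image load declared as returning
// { <4 x float>, i32 } peels off the i32 status word here, and if the dmask
// only enables three components the reported memory type shrinks accordingly:
#if 0
  unsigned Lanes = llvm::popcount(0b1011u); // dmask 0b1011 -> 3 enabled lanes
#endif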
1131
1132/// Map address space 7 to MVT::v5i32 because that's its in-memory
1133/// representation. This return value is vector-typed because there is no
1134/// MVT::i160 and it is not clear if one can be added. While this could
1135/// cause issues during codegen, these address space 7 pointers will be
1136/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1137/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1138/// modeling, to work.
1140 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1141 return MVT::v5i32;
1143 DL.getPointerSizeInBits(AS) == 192)
1144 return MVT::v6i32;
1146}
1147/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1148/// v8i32 when padding is added.
1149/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1150/// also v8i32 with padding.
1152 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1153 DL.getPointerSizeInBits(AS) == 160) ||
1155 DL.getPointerSizeInBits(AS) == 192))
1156 return MVT::v8i32;
1158}
1159
1161 const CallInst &CI,
1162 MachineFunction &MF,
1163 unsigned IntrID) const {
1165 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1167
1168 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1171 (Intrinsic::ID)IntrID);
1172 MemoryEffects ME = Attr.getMemoryEffects();
1173 if (ME.doesNotAccessMemory())
1174 return false;
1175
1176 // TODO: Should images get their own address space?
1177 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1178
1179 if (RsrcIntr->IsImage)
1180 Info.align.reset();
1181
1182 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1183 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1184 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1185 // We conservatively set the memory operand of a buffer intrinsic to the
1186 // base resource pointer, so that we can access alias information about
1187 // those pointers. Cases like "this points at the same value
1188 // but with a different offset" are handled in
1189 // areMemAccessesTriviallyDisjoint.
1190 Info.ptrVal = RsrcArg;
1191 }
1192
1193 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1194 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1197 if (ME.onlyReadsMemory()) {
1198 unsigned MaxNumLanes = 4;
1199
1200 if (RsrcIntr->IsImage) {
1203 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1205
1206 if (!BaseOpcode->Gather4) {
1207 // If this isn't a gather, we may have excess loaded elements in the
1208 // IR type. Check the dmask for the real number of elements loaded.
1209 unsigned DMask
1210 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1211 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1212 }
1213 }
1214
1215 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1216
1217 // FIXME: What does alignment mean for an image?
1220 } else if (ME.onlyWritesMemory()) {
1222
1223 Type *DataTy = CI.getArgOperand(0)->getType();
1224 if (RsrcIntr->IsImage) {
1225 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1226 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1228 } else
1229 Info.memVT = EVT::getEVT(DataTy);
1230
1232 } else {
1233 // Atomic
1234 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1236 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1240
1241 switch (IntrID) {
1242 default:
1243 // XXX - Should this be volatile without known ordering?
1245 break;
1246 case Intrinsic::amdgcn_raw_buffer_load_lds:
1247 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1248 case Intrinsic::amdgcn_struct_buffer_load_lds:
1249 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1250 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1251 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1252 Info.ptrVal = CI.getArgOperand(1);
1253 return true;
1254 }
1255 }
1256 }
1257 return true;
1258 }
1259
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_ds_ordered_add:
1262 case Intrinsic::amdgcn_ds_ordered_swap:
1263 case Intrinsic::amdgcn_ds_fadd:
1264 case Intrinsic::amdgcn_ds_fmin:
1265 case Intrinsic::amdgcn_ds_fmax: {
1267 Info.memVT = MVT::getVT(CI.getType());
1268 Info.ptrVal = CI.getOperand(0);
1269 Info.align.reset();
1271
1272 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1273 if (!Vol->isZero())
1275
1276 return true;
1277 }
1278 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1280 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1281 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1282 Info.align.reset();
1284
1285 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1286 if (!Vol || !Vol->isZero())
1288
1289 return true;
1290 }
1291 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1292 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1294 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1295 Info.ptrVal = nullptr;
1296 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1298 return true;
1299 }
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1303 Info.memVT = MVT::getVT(CI.getType());
1304 Info.ptrVal = CI.getOperand(0);
1305 Info.align.reset();
1307
1308 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1309 if (!Vol->isZero())
1311
1312 return true;
1313 }
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1316 Info.memVT = MVT::getVT(CI.getType());
1317 Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1322 return true;
1323 }
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1326 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1327
1328 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1329 Info.align.reset();
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1349 Info.memVT = MVT::getVT(CI.getType());
1350 Info.ptrVal = CI.getOperand(0);
1351 Info.align.reset();
1356 return true;
1357 }
1358 case Intrinsic::amdgcn_global_load_tr_b64:
1359 case Intrinsic::amdgcn_global_load_tr_b128: {
1361 Info.memVT = MVT::getVT(CI.getType());
1362 Info.ptrVal = CI.getOperand(0);
1363 Info.align.reset();
1365 return true;
1366 }
1367 case Intrinsic::amdgcn_ds_gws_init:
1368 case Intrinsic::amdgcn_ds_gws_barrier:
1369 case Intrinsic::amdgcn_ds_gws_sema_v:
1370 case Intrinsic::amdgcn_ds_gws_sema_br:
1371 case Intrinsic::amdgcn_ds_gws_sema_p:
1372 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1374
1375 const GCNTargetMachine &TM =
1376 static_cast<const GCNTargetMachine &>(getTargetMachine());
1377
1379 Info.ptrVal = MFI->getGWSPSV(TM);
1380
1381 // This is an abstract access, but we need to specify a type and size.
1382 Info.memVT = MVT::i32;
1383 Info.size = 4;
1384 Info.align = Align(4);
1385
1386 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1388 else
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_global_load_lds: {
1394 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1395 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1396 Info.ptrVal = CI.getArgOperand(1);
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1402
1403 const GCNTargetMachine &TM =
1404 static_cast<const GCNTargetMachine &>(getTargetMachine());
1405
1407 Info.ptrVal = MFI->getGWSPSV(TM);
1408
1409 // This is an abstract access, but we need to specify a type and size.
1410 Info.memVT = MVT::i32;
1411 Info.size = 4;
1412 Info.align = Align(4);
1413
1415 return true;
1416 }
1417 default:
1418 return false;
1419 }
1420}
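// Worked example (illustrative) of what the hook above reports: for
// @llvm.amdgcn.ds.ordered.add returning i32, the code sets Info.memVT = i32,
// points Info.ptrVal at the intrinsic's pointer operand, resets the alignment,
// and marks the access volatile only when the "volatile" immediate operand is
// non-zero, so the rest of codegen can treat the intrinsic like an ordinary
// memory access.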
1421
1423 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1424 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1425 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1426 // The DAG's ValueType loses the addrspaces.
1427 // Add them as 2 extra Constant operands "from" and "to".
1428 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1429 unsigned DstAS = I.getType()->getPointerAddressSpace();
1430 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1431 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1432 break;
1433 }
1434 default:
1435 break;
1436 }
1437}
1438
1441 Type *&AccessTy) const {
1442 Value *Ptr = nullptr;
1443 switch (II->getIntrinsicID()) {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 Ptr = II->getArgOperand(0);
1469 break;
1470 case Intrinsic::amdgcn_global_load_lds:
1471 Ptr = II->getArgOperand(1);
1472 break;
1473 default:
1474 return false;
1475 }
1476 AccessTy = II->getType();
1477 Ops.push_back(Ptr);
1478 return true;
1479}
1480
1481bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1482 unsigned AddrSpace,
1483 uint64_t FlatVariant) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 return AM.Scale == 0 &&
1491 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1492 AM.BaseOffs, AddrSpace, FlatVariant));
1493}
1494
1496 if (Subtarget->hasFlatGlobalInsts())
1497 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1499
1500 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1501 // Assume that we will use FLAT for all global memory accesses
1502 // on VI.
1503 // FIXME: This assumption is currently wrong. On VI we still use
1504 // MUBUF instructions for the r + i addressing mode. As currently
1505 // implemented, the MUBUF instructions only work on buffer < 4GB.
1506 // It may be possible to support > 4GB buffers with MUBUF instructions,
1507 // by setting the stride value in the resource descriptor which would
1508 // increase the size limit to (stride * 4GB). However, this is risky,
1509 // because it has never been validated.
1510 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1512 }
1513
1514 return isLegalMUBUFAddressingMode(AM);
1515}
1516
1517bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1518 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1519 // additionally can do r + r + i with addr64. 32-bit has more addressing
1520 // mode options. Depending on the resource constant, it can also do
1521 // (i64 r0) + (i32 r1) * (i14 i).
1522 //
1523 // Private arrays end up using a scratch buffer most of the time, so also
1524 // assume those use MUBUF instructions. Scratch loads / stores are currently
1525 // implemented as mubuf instructions with offen bit set, so slightly
1526 // different than the normal addr64.
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1529 return false;
1530
1531 // FIXME: Since we can split immediate into soffset and immediate offset,
1532 // would it make sense to allow any immediate?
1533
1534 switch (AM.Scale) {
1535 case 0: // r + i or just i, depending on HasBaseReg.
1536 return true;
1537 case 1:
1538 return true; // We have r + r or r + i.
1539 case 2:
1540 if (AM.HasBaseReg) {
1541 // Reject 2 * r + r.
1542 return false;
1543 }
1544
1545 // Allow 2 * r as r + r
1546 // Or 2 * r + i is allowed as r + r + i.
1547 return true;
1548 default: // Don't allow n * r
1549 return false;
1550 }
1551}
1552
1554 const AddrMode &AM, Type *Ty,
1555 unsigned AS, Instruction *I) const {
1556 // No global is ever allowed as a base.
1557 if (AM.BaseGV)
1558 return false;
1559
1560 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1561 return isLegalGlobalAddressingMode(AM);
1562
1563 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1567 // If the offset isn't a multiple of 4, it probably isn't going to be
1568 // correctly aligned.
1569 // FIXME: Can we get the real alignment here?
1570 if (AM.BaseOffs % 4 != 0)
1571 return isLegalMUBUFAddressingMode(AM);
1572
1573 if (!Subtarget->hasScalarSubwordLoads()) {
1574 // There are no SMRD extloads, so if we have to do a small type access we
1575 // will use a MUBUF load.
1576 // FIXME?: We also need to do this if unaligned, but we don't know the
1577 // alignment here.
1578 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1579 return isLegalGlobalAddressingMode(AM);
1580 }
1581
1583 // SMRD instructions have an 8-bit, dword offset on SI.
1584 if (!isUInt<8>(AM.BaseOffs / 4))
1585 return false;
1586 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1587 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1588 // in 8-bits, it can use a smaller encoding.
1589 if (!isUInt<32>(AM.BaseOffs / 4))
1590 return false;
1591 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1592 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1593 if (!isUInt<20>(AM.BaseOffs))
1594 return false;
1595 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1596 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1597 // for S_BUFFER_* instructions).
1598 if (!isInt<21>(AM.BaseOffs))
1599 return false;
1600 } else {
1601 // On GFX12, all offsets are signed 24-bit in bytes.
1602 if (!isInt<24>(AM.BaseOffs))
1603 return false;
1604 }
1605
1606 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1607 return true;
1608
1609 if (AM.Scale == 1 && AM.HasBaseReg)
1610 return true;
1611
1612 return false;
1613 }
1614
1615 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1616 return Subtarget->enableFlatScratch()
1617 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1619 : isLegalMUBUFAddressingMode(AM);
1620
1621 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1622 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1623 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1624 // field.
1625 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1626 // an 8-bit dword offset but we don't know the alignment here.
1627 if (!isUInt<16>(AM.BaseOffs))
1628 return false;
1629
1630 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1631 return true;
1632
1633 if (AM.Scale == 1 && AM.HasBaseReg)
1634 return true;
1635
1636 return false;
1637 }
1638
1640 // For an unknown address space, this usually means that this is for some
1641 // reason being used for pure arithmetic, and not based on some addressing
1642 // computation. We don't have instructions that compute pointers with any
1643 // addressing modes, so treat them as having no offset like flat
1644 // instructions.
1645 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1647 }
1648
1649 // Assume a user alias of global for unknown address spaces.
1650 return isLegalGlobalAddressingMode(AM);
1651}
1652
1654 const MachineFunction &MF) const {
1656 return (MemVT.getSizeInBits() <= 4 * 32);
1657 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1658 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1659 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1660 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1661 return (MemVT.getSizeInBits() <= 2 * 32);
1662 }
1663 return true;
1664}
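// Worked example (illustrative): the LDS/region branch above caps merged
// stores at 2 x 32 bits, so two adjacent i32 stores may become one v2i32
// store while a v4i32 merge is rejected. A hedged sketch of the resulting
// behaviour (TLI and MF assumed to be in scope):
#if 0
  assert(TLI.canMergeStoresTo(AMDGPUAS::LOCAL_ADDRESS, MVT::v2i32, MF));
  assert(!TLI.canMergeStoresTo(AMDGPUAS::LOCAL_ADDRESS, MVT::v4i32, MF));
#endif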
1665
1667 unsigned Size, unsigned AddrSpace, Align Alignment,
1668 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1669 if (IsFast)
1670 *IsFast = 0;
1671
1672 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1673 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1674 // Check if alignment requirements for ds_read/write instructions are
1675 // disabled.
1676 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1677 return false;
1678
1679 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1680 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1681 Alignment < RequiredAlignment)
1682 return false;
1683
1684 // Either, the alignment requirements are "enabled", or there is an
1685 // unaligned LDS access related hardware bug even though alignment requirements
1686 // are "disabled". In either case, we need to check for proper alignment
1687 // requirements.
1688 //
1689 switch (Size) {
1690 case 64:
1691 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1692 // address is negative, then the instruction is incorrectly treated as
1693 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1694 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1695 // load later in the SILoadStoreOptimizer.
1696 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1697 return false;
1698
1699 // Accessing 8 bytes via ds_read/write_b64 requires 8-byte alignment, but we
1700 // can do a 4 byte aligned, 8 byte access in a single operation using
1701 // ds_read2/write2_b32 with adjacent offsets.
1702 RequiredAlignment = Align(4);
1703
1704 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1705 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1706 // ds_write2_b32 depending on the alignment. In either case with either
1707 // alignment there is no faster way of doing this.
1708
1709 // The numbers returned here and below are not additive; they form a 'speed
1710 // rank'. They are just meant to be compared to decide if a certain way
1711 // of lowering an operation is faster than another. For that purpose a
1712 // naturally aligned operation gets its bitsize to indicate that "it
1713 // operates with a speed comparable to an N-bit wide load". With the full
1714 // alignment ds128 is slower than ds96 for example. If underaligned it
1715 // is comparable to a speed of a single dword access, which would then
1716 // mean 32 < 128 and it is faster to issue a wide load regardless.
1717 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1718 // wider load which will no longer be aligned, the latter is slower.
1719 if (IsFast)
1720 *IsFast = (Alignment >= RequiredAlignment) ? 64
1721 : (Alignment < Align(4)) ? 32
1722 : 1;
1723 return true;
1724 }
1725
1726 break;
1727 case 96:
1728 if (!Subtarget->hasDS96AndDS128())
1729 return false;
1730
1731 // Accessing 12 bytes via ds_read/write_b96 requires 16-byte alignment on
1732 // gfx8 and older.
1733
1734 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1735 // Naturally aligned access is fastest. However, also report it is Fast
1736 // if memory is aligned less than DWORD. A narrow load or store will be
1737 // just as slow as a single ds_read_b96/ds_write_b96, but there will
1738 // be more of them, so overall we will pay less penalty issuing a single
1739 // instruction.
1740
1741 // See comment on the values above.
1742 if (IsFast)
1743 *IsFast = (Alignment >= RequiredAlignment) ? 96
1744 : (Alignment < Align(4)) ? 32
1745 : 1;
1746 return true;
1747 }
1748
1749 break;
1750 case 128:
1751 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1752 return false;
1753
1754 // Accessing 16 bytes via ds_read/write_b128 requires 16-byte alignment on
1755 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1756 // single operation using ds_read2/write2_b64.
1757 RequiredAlignment = Align(8);
1758
1759 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1760 // Naturally aligned access is fastest. However, also report it is Fast
1761 // if memory is aligned less than DWORD. A narrow load or store will be
1762 // just as slow as a single ds_read_b128/ds_write_b128, but there
1763 // will be more of them, so overall we will pay less penalty issuing a
1764 // single instruction.
1765
1766 // See comment on the values above.
1767 if (IsFast)
1768 *IsFast = (Alignment >= RequiredAlignment) ? 128
1769 : (Alignment < Align(4)) ? 32
1770 : 1;
1771 return true;
1772 }
1773
1774 break;
1775 default:
1776 if (Size > 32)
1777 return false;
1778
1779 break;
1780 }
1781
1782 // See comment on the values above.
1783 // Note that we have a single-dword or sub-dword here, so if underaligned
1784 // it is the slowest possible access, hence the returned value is 0.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1787
1788 return Alignment >= RequiredAlignment ||
1789 Subtarget->hasUnalignedDSAccessEnabled();
1790 }
1791
1792 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1793 bool AlignedBy4 = Alignment >= Align(4);
1794 if (IsFast)
1795 *IsFast = AlignedBy4;
1796
1797 return AlignedBy4 ||
1798 Subtarget->enableFlatScratch() ||
1799 Subtarget->hasUnalignedScratchAccess();
1800 }
1801
1802 // FIXME: We have to be conservative here and assume that flat operations
1803 // will access scratch. If we had access to the IR function, then we
1804 // could determine if any private memory was used in the function.
1805 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1806 !Subtarget->hasUnalignedScratchAccess()) {
1807 bool AlignedBy4 = Alignment >= Align(4);
1808 if (IsFast)
1809 *IsFast = AlignedBy4;
1810
1811 return AlignedBy4;
1812 }
1813
1814 // So long as they are correct, wide global memory operations perform better
1815 // than multiple smaller memory ops -- even when misaligned
1816 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1817 if (IsFast)
1818 *IsFast = Size;
1819
1820 return Alignment >= Align(4) ||
1822 }
1823
1824 // Smaller than dword value must be aligned.
1825 if (Size < 32)
1826 return false;
1827
1828 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1829 // byte-address are ignored, thus forcing Dword alignment.
1830 // This applies to private, global, and constant memory.
1831 if (IsFast)
1832 *IsFast = 1;
1833
1834 return Size >= 32 && Alignment >= Align(4);
1835}
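// Usage note (hedged sketch): callers treat *IsFast as a relative speed rank
// rather than a boolean, e.g. comparing the rank of one wide LDS access
// against the rank of several narrower ones before picking a lowering. Names
// below are assumed to be in scope.
#if 0
  unsigned FastWide = 0;
  bool Allowed = TLI.allowsMisalignedMemoryAccessesImpl(
      96, AMDGPUAS::LOCAL_ADDRESS, Align(4), MachineMemOperand::MONone,
      &FastWide);
  // Prefer the wide access only if Allowed and FastWide beats the alternative.
#endif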
1836
1838 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1839 unsigned *IsFast) const {
1841 Alignment, Flags, IsFast);
1842}
1843
1845 const MemOp &Op, const AttributeList &FuncAttributes) const {
1846 // FIXME: Should account for address space here.
1847
1848 // The default fallback uses the private pointer size as a guess for a type to
1849 // use. Make sure we switch these to 64-bit accesses.
1850
1851 if (Op.size() >= 16 &&
1852 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1853 return MVT::v4i32;
1854
1855 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1856 return MVT::v2i32;
1857
1858 // Use the default.
1859 return MVT::Other;
1860}
1861
1863 const MemSDNode *MemNode = cast<MemSDNode>(N);
1864 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1865}
1866
1868 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1870}
1871
1873 unsigned DestAS) const {
1874 // Flat -> private/local is a simple truncate.
1875 // Flat -> global is no-op
1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1877 return true;
1878
1879 const GCNTargetMachine &TM =
1880 static_cast<const GCNTargetMachine &>(getTargetMachine());
1881 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1882}
1883
1885 const MemSDNode *MemNode = cast<MemSDNode>(N);
1886
1888}
1889
1892 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1893 VT.getScalarType().bitsLE(MVT::i16))
1896}
1897
1899 Type *Ty) const {
1900 // FIXME: Could be smarter if called for vector constants.
1901 return true;
1902}
1903
1905 unsigned Index) const {
1907 return false;
1908
1909 // TODO: Add more cases that are cheap.
1910 return Index == 0;
1911}
1912
1914 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1915 switch (Op) {
1916 case ISD::LOAD:
1917 case ISD::STORE:
1918
1919 // These operations are done with 32-bit instructions anyway.
1920 case ISD::AND:
1921 case ISD::OR:
1922 case ISD::XOR:
1923 case ISD::SELECT:
1924 // TODO: Extensions?
1925 return true;
1926 default:
1927 return false;
1928 }
1929 }
1930
1931 // SimplifySetCC uses this function to determine whether or not it should
1932 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1933 if (VT == MVT::i1 && Op == ISD::SETCC)
1934 return false;
1935
1937}
1938
1939SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1940 const SDLoc &SL,
1941 SDValue Chain,
1942 uint64_t Offset) const {
1943 const DataLayout &DL = DAG.getDataLayout();
1946
1947 const ArgDescriptor *InputPtrReg;
1948 const TargetRegisterClass *RC;
1949 LLT ArgTy;
1951
1952 std::tie(InputPtrReg, RC, ArgTy) =
1954
1955 // We may not have the kernarg segment argument if we have no kernel
1956 // arguments.
1957 if (!InputPtrReg)
1958 return DAG.getConstant(Offset, SL, PtrVT);
1959
1961 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1962 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1963
1964 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1965}
1966
1967SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1968 const SDLoc &SL) const {
1971 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1972}
1973
1974SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1975 const SDLoc &SL) const {
1976
1978 std::optional<uint32_t> KnownSize =
1980 if (KnownSize.has_value())
1981 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1982 return SDValue();
1983}
1984
1985SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1986 const SDLoc &SL, SDValue Val,
1987 bool Signed,
1988 const ISD::InputArg *Arg) const {
1989 // First, if it is a widened vector, narrow it.
1990 if (VT.isVector() &&
1992 EVT NarrowedVT =
1995 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1996 DAG.getConstant(0, SL, MVT::i32));
1997 }
1998
1999 // Then convert the vector elements or scalar value.
2000 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2001 VT.bitsLT(MemVT)) {
2002 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2003 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2004 }
2005
2006 if (MemVT.isFloatingPoint())
2007 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2008 else if (Signed)
2009 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2010 else
2011 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2012
2013 return Val;
2014}
2015
2016SDValue SITargetLowering::lowerKernargMemParameter(
2017 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2018 uint64_t Offset, Align Alignment, bool Signed,
2019 const ISD::InputArg *Arg) const {
2021
2022 // Try to avoid using an extload by loading earlier than the argument address,
2023 // and extracting the relevant bits. The load should hopefully be merged with
2024 // the previous argument.
2025 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2026 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2027 int64_t AlignDownOffset = alignDown(Offset, 4);
2028 int64_t OffsetDiff = Offset - AlignDownOffset;
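 // Worked example: a 2-byte argument at Offset 6 gives AlignDownOffset = 4 and
 // OffsetDiff = 2, so the value sits in bits [31:16] of the dword loaded at
 // offset 4 and is recovered by the SRL/TRUNCATE below.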
2029
2030 EVT IntVT = MemVT.changeTypeToInteger();
2031
2032 // TODO: If we passed in the base kernel offset we could have a better
2033 // alignment than 4, but we don't really need it.
2034 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2035 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2038
2039 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2040 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2041
2042 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2043 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2044 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2045
2046
2047 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2048 }
2049
2050 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2051 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2054
2055 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2056 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2057}
2058
2059SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2060 const SDLoc &SL, SDValue Chain,
2061 const ISD::InputArg &Arg) const {
2063 MachineFrameInfo &MFI = MF.getFrameInfo();
2064
2065 if (Arg.Flags.isByVal()) {
2066 unsigned Size = Arg.Flags.getByValSize();
2067 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2068 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2069 }
2070
2071 unsigned ArgOffset = VA.getLocMemOffset();
2072 unsigned ArgSize = VA.getValVT().getStoreSize();
2073
2074 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2075
2076 // Create load nodes to retrieve arguments from the stack.
2077 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2078 SDValue ArgValue;
2079
2080 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2082 MVT MemVT = VA.getValVT();
2083
2084 switch (VA.getLocInfo()) {
2085 default:
2086 break;
2087 case CCValAssign::BCvt:
2088 MemVT = VA.getLocVT();
2089 break;
2090 case CCValAssign::SExt:
2091 ExtType = ISD::SEXTLOAD;
2092 break;
2093 case CCValAssign::ZExt:
2094 ExtType = ISD::ZEXTLOAD;
2095 break;
2096 case CCValAssign::AExt:
2097 ExtType = ISD::EXTLOAD;
2098 break;
2099 }
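 // e.g. an i8 argument passed on the stack with SExt lowering becomes a
 // sign-extending i8 load that directly produces the 32-bit LocVT value.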
2100
2101 ArgValue = DAG.getExtLoad(
2102 ExtType, SL, VA.getLocVT(), Chain, FIN,
2104 MemVT);
2105 return ArgValue;
2106}
2107
2108SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2109 const SIMachineFunctionInfo &MFI,
2110 EVT VT,
2112 const ArgDescriptor *Reg = nullptr;
2113 const TargetRegisterClass *RC;
2114 LLT Ty;
2115
2117 const ArgDescriptor WorkGroupIDX =
2118 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2119 // If GridZ is not programmed in an entry function then the hardware will set
2120 // it to all zeros, so there is no need to mask the GridY value in the low
2121 // order bits.
2122 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2123 AMDGPU::TTMP7,
2124 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2125 const ArgDescriptor WorkGroupIDZ =
2126 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
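 // With architected SGPRs the hardware places workgroup ID X in TTMP9, and
 // packs Y into bits [15:0] and Z into bits [31:16] of TTMP7, which is what
 // the masks above encode.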
2127 if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
2128 switch (PVID) {
2130 Reg = &WorkGroupIDX;
2131 RC = &AMDGPU::SReg_32RegClass;
2132 Ty = LLT::scalar(32);
2133 break;
2135 Reg = &WorkGroupIDY;
2136 RC = &AMDGPU::SReg_32RegClass;
2137 Ty = LLT::scalar(32);
2138 break;
2140 Reg = &WorkGroupIDZ;
2141 RC = &AMDGPU::SReg_32RegClass;
2142 Ty = LLT::scalar(32);
2143 break;
2144 default:
2145 break;
2146 }
2147 }
2148
2149 if (!Reg)
2150 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2151 if (!Reg) {
2153 // It's possible for a kernarg intrinsic call to appear in a kernel with
2154 // no allocated segment, in which case we do not add the user sgpr
2155 // argument, so just return null.
2156 return DAG.getConstant(0, SDLoc(), VT);
2157 }
2158
2159 // It's undefined behavior if a function marked with the amdgpu-no-*
2160 // attributes uses the corresponding intrinsic.
2161 return DAG.getUNDEF(VT);
2162 }
2163
2164 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2165}
2166
2168 CallingConv::ID CallConv,
2169 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2170 FunctionType *FType,
2171 SIMachineFunctionInfo *Info) {
2172 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2173 const ISD::InputArg *Arg = &Ins[I];
2174
2175 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2176 "vector type argument should have been split");
2177
2178 // First check if it's a PS input addr.
2179 if (CallConv == CallingConv::AMDGPU_PS &&
2180 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2181 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2182
2183 // Inconveniently only the first part of the split is marked as isSplit,
2184 // so skip to the end. We only want to increment PSInputNum once for the
2185 // entire split argument.
2186 if (Arg->Flags.isSplit()) {
2187 while (!Arg->Flags.isSplitEnd()) {
2188 assert((!Arg->VT.isVector() ||
2189 Arg->VT.getScalarSizeInBits() == 16) &&
2190 "unexpected vector split in ps argument type");
2191 if (!SkipArg)
2192 Splits.push_back(*Arg);
2193 Arg = &Ins[++I];
2194 }
2195 }
2196
2197 if (SkipArg) {
2198 // We can safely skip PS inputs.
2199 Skipped.set(Arg->getOrigArgIndex());
2200 ++PSInputNum;
2201 continue;
2202 }
2203
2204 Info->markPSInputAllocated(PSInputNum);
2205 if (Arg->Used)
2206 Info->markPSInputEnabled(PSInputNum);
2207
2208 ++PSInputNum;
2209 }
2210
2211 Splits.push_back(*Arg);
2212 }
2213}
2214
2215// Allocate special inputs passed in VGPRs.
2217 MachineFunction &MF,
2218 const SIRegisterInfo &TRI,
2219 SIMachineFunctionInfo &Info) const {
2220 const LLT S32 = LLT::scalar(32);
2222
2223 if (Info.hasWorkItemIDX()) {
2224 Register Reg = AMDGPU::VGPR0;
2225 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2226
2227 CCInfo.AllocateReg(Reg);
2228 unsigned Mask = (Subtarget->hasPackedTID() &&
2229 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
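 // With packed TID a single VGPR carries all three workitem IDs: X in bits
 // [9:0], Y in bits [19:10] and Z in bits [29:20], hence the 0x3ff mask here
 // and the shifted masks for Y and Z below.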
2230 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2231 }
2232
2233 if (Info.hasWorkItemIDY()) {
2234 assert(Info.hasWorkItemIDX());
2235 if (Subtarget->hasPackedTID()) {
2236 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2237 0x3ff << 10));
2238 } else {
2239 unsigned Reg = AMDGPU::VGPR1;
2240 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2241
2242 CCInfo.AllocateReg(Reg);
2243 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2244 }
2245 }
2246
2247 if (Info.hasWorkItemIDZ()) {
2248 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2249 if (Subtarget->hasPackedTID()) {
2250 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2251 0x3ff << 20));
2252 } else {
2253 unsigned Reg = AMDGPU::VGPR2;
2254 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2255
2256 CCInfo.AllocateReg(Reg);
2257 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2258 }
2259 }
2260}
2261
2262 // Try to allocate a VGPR at the end of the argument list, or if no argument
2263 // VGPRs are left, allocate a stack slot instead.
2264 // If \p Mask is given, it indicates the bitfield position in the register.
2265 // If \p Arg is given, reuse it with the new \p Mask instead of allocating anew.
2266static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2267 ArgDescriptor Arg = ArgDescriptor()) {
2268 if (Arg.isSet())
2269 return ArgDescriptor::createArg(Arg, Mask);
2270
2271 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2272 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2273 if (RegIdx == ArgVGPRs.size()) {
2274 // Spill to stack required.
2275 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2276
2277 return ArgDescriptor::createStack(Offset, Mask);
2278 }
2279
2280 unsigned Reg = ArgVGPRs[RegIdx];
2281 Reg = CCInfo.AllocateReg(Reg);
2282 assert(Reg != AMDGPU::NoRegister);
2283
2284 MachineFunction &MF = CCInfo.getMachineFunction();
2285 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2286 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2287 return ArgDescriptor::createRegister(Reg, Mask);
2288}
2289
2291 const TargetRegisterClass *RC,
2292 unsigned NumArgRegs) {
2293 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2294 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2295 if (RegIdx == ArgSGPRs.size())
2296 report_fatal_error("ran out of SGPRs for arguments");
2297
2298 unsigned Reg = ArgSGPRs[RegIdx];
2299 Reg = CCInfo.AllocateReg(Reg);
2300 assert(Reg != AMDGPU::NoRegister);
2301
2302 MachineFunction &MF = CCInfo.getMachineFunction();
2303 MF.addLiveIn(Reg, RC);
2305}
2306
2307// If this has a fixed position, we still should allocate the register in the
2308// CCInfo state. Technically we could get away with this for values passed
2309// outside of the normal argument range.
2311 const TargetRegisterClass *RC,
2312 MCRegister Reg) {
2313 Reg = CCInfo.AllocateReg(Reg);
2314 assert(Reg != AMDGPU::NoRegister);
2315 MachineFunction &MF = CCInfo.getMachineFunction();
2316 MF.addLiveIn(Reg, RC);
2317}
2318
2319static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2320 if (Arg) {
2321 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2322 Arg.getRegister());
2323 } else
2324 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2325}
2326
2327static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2328 if (Arg) {
2329 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2330 Arg.getRegister());
2331 } else
2332 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2333}
2334
2335/// Allocate implicit function VGPR arguments at the end of allocated user
2336/// arguments.
2338 CCState &CCInfo, MachineFunction &MF,
2339 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2340 const unsigned Mask = 0x3ff;
2341 ArgDescriptor Arg;
2342
2343 if (Info.hasWorkItemIDX()) {
2344 Arg = allocateVGPR32Input(CCInfo, Mask);
2345 Info.setWorkItemIDX(Arg);
2346 }
2347
2348 if (Info.hasWorkItemIDY()) {
2349 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2350 Info.setWorkItemIDY(Arg);
2351 }
2352
2353 if (Info.hasWorkItemIDZ())
2354 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2355}
2356
2357/// Allocate implicit function VGPR arguments in fixed registers.
2359 CCState &CCInfo, MachineFunction &MF,
2360 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2361 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2362 if (!Reg)
2363 report_fatal_error("failed to allocate VGPR for implicit arguments");
2364
2365 const unsigned Mask = 0x3ff;
2366 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2367 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2368 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2369}
2370
2372 CCState &CCInfo,
2373 MachineFunction &MF,
2374 const SIRegisterInfo &TRI,
2375 SIMachineFunctionInfo &Info) const {
2376 auto &ArgInfo = Info.getArgInfo();
2377 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2378
2379 // TODO: Unify handling with private memory pointers.
2380 if (UserSGPRInfo.hasDispatchPtr())
2381 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2382
2383 const Module *M = MF.getFunction().getParent();
2384 if (UserSGPRInfo.hasQueuePtr() &&
2386 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2387
2388 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2389 // constant offset from the kernarg segment.
2390 if (Info.hasImplicitArgPtr())
2391 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2392
2393 if (UserSGPRInfo.hasDispatchID())
2394 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2395
2396 // flat_scratch_init is not applicable for non-kernel functions.
2397
2398 if (Info.hasWorkGroupIDX())
2399 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2400
2401 if (Info.hasWorkGroupIDY())
2402 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2403
2404 if (Info.hasWorkGroupIDZ())
2405 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2406
2407 if (Info.hasLDSKernelId())
2408 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2409}
2410
2411// Allocate special inputs passed in user SGPRs.
2413 MachineFunction &MF,
2414 const SIRegisterInfo &TRI,
2415 SIMachineFunctionInfo &Info) const {
2416 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2417 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2418 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2419 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2420 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2421 }
2422
2423 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2424 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2425 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2426 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2427 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2428 }
2429
2430 if (UserSGPRInfo.hasDispatchPtr()) {
2431 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2432 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2433 CCInfo.AllocateReg(DispatchPtrReg);
2434 }
2435
2436 const Module *M = MF.getFunction().getParent();
2437 if (UserSGPRInfo.hasQueuePtr() &&
2439 Register QueuePtrReg = Info.addQueuePtr(TRI);
2440 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2441 CCInfo.AllocateReg(QueuePtrReg);
2442 }
2443
2444 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2446 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2447 CCInfo.AllocateReg(InputPtrReg);
2448
2449 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2450 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2451 }
2452
2453 if (UserSGPRInfo.hasDispatchID()) {
2454 Register DispatchIDReg = Info.addDispatchID(TRI);
2455 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2456 CCInfo.AllocateReg(DispatchIDReg);
2457 }
2458
2459 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2460 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2461 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2462 CCInfo.AllocateReg(FlatScratchInitReg);
2463 }
2464
2465 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2466 // these from the dispatch pointer.
2467}
2468
2469 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2470 // sequential, starting from the first argument.
2472 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2474 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2475 Function &F = MF.getFunction();
2476 unsigned LastExplicitArgOffset =
2477 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2478 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2479 bool InPreloadSequence = true;
2480 unsigned InIdx = 0;
2481 for (auto &Arg : F.args()) {
2482 if (!InPreloadSequence || !Arg.hasInRegAttr())
2483 break;
2484
2485 int ArgIdx = Arg.getArgNo();
2486 // Don't preload non-original args or parts not in the current preload
2487 // sequence.
2488 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2489 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2490 break;
2491
2492 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2493 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2494 InIdx++) {
2495 assert(ArgLocs[ArgIdx].isMemLoc());
2496 auto &ArgLoc = ArgLocs[InIdx];
2497 const Align KernelArgBaseAlign = Align(16);
2498 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2499 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2500 unsigned NumAllocSGPRs =
2501 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2502
2503 // Arg is preloaded into the previous SGPR.
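 // Illustrative case: two i16 arguments at offsets 0 and 2 share one dword,
 // so the second one reuses the SGPR already allocated for the first instead
 // of consuming another user SGPR.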
2504 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2505 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2506 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2507 continue;
2508 }
2509
2510 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2511 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
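 // e.g. if the previous argument ended at offset 4 and this one starts at
 // offset 16, Padding is 12 bytes and three padding SGPRs are accounted for
 // before the SGPR holding this argument.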
2512 // Check for free user SGPRs for preloading.
2513 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2514 SGPRInfo.getNumFreeUserSGPRs()) {
2515 InPreloadSequence = false;
2516 break;
2517 }
2518
2519 // Preload this argument.
2520 const TargetRegisterClass *RC =
2521 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2522 SmallVectorImpl<MCRegister> *PreloadRegs =
2523 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2524
2525 if (PreloadRegs->size() > 1)
2526 RC = &AMDGPU::SGPR_32RegClass;
2527 for (auto &Reg : *PreloadRegs) {
2528 assert(Reg);
2529 MF.addLiveIn(Reg, RC);
2530 CCInfo.AllocateReg(Reg);
2531 }
2532
2533 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2534 }
2535 }
2536}
2537
2539 const SIRegisterInfo &TRI,
2540 SIMachineFunctionInfo &Info) const {
2541 // Always allocate this last since it is a synthetic preload.
2542 if (Info.hasLDSKernelId()) {
2543 Register Reg = Info.addLDSKernelId();
2544 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2545 CCInfo.AllocateReg(Reg);
2546 }
2547}
2548
2549// Allocate special input registers that are initialized per-wave.
2551 MachineFunction &MF,
2553 CallingConv::ID CallConv,
2554 bool IsShader) const {
2555 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2556 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2557 // Note: user SGPRs are handled by the front-end for graphics shaders
2558 // Pad up the used user SGPRs with dead inputs.
2559
2560 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2561 // before enabling architected SGPRs for workgroup IDs.
2562 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2563
2564 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2565 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2566 // rely on it to reach 16 since if we end up having no stack usage, it will
2567 // not really be added.
2568 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2569 Info.hasWorkGroupIDY() +
2570 Info.hasWorkGroupIDZ() +
2571 Info.hasWorkGroupInfo();
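 // e.g. with 6 user SGPRs in use and 3 system SGPRs required, the loop below
 // reserves 7 dead SGPRs so that at least 16 are initialized.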
2572 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2573 Register Reg = Info.addReservedUserSGPR();
2574 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2575 CCInfo.AllocateReg(Reg);
2576 }
2577 }
2578
2579 if (!HasArchitectedSGPRs) {
2580 if (Info.hasWorkGroupIDX()) {
2581 Register Reg = Info.addWorkGroupIDX();
2582 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2583 CCInfo.AllocateReg(Reg);
2584 }
2585
2586 if (Info.hasWorkGroupIDY()) {
2587 Register Reg = Info.addWorkGroupIDY();
2588 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2589 CCInfo.AllocateReg(Reg);
2590 }
2591
2592 if (Info.hasWorkGroupIDZ()) {
2593 Register Reg = Info.addWorkGroupIDZ();
2594 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2595 CCInfo.AllocateReg(Reg);
2596 }
2597 }
2598
2599 if (Info.hasWorkGroupInfo()) {
2600 Register Reg = Info.addWorkGroupInfo();
2601 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2602 CCInfo.AllocateReg(Reg);
2603 }
2604
2605 if (Info.hasPrivateSegmentWaveByteOffset()) {
2606 // Scratch wave offset passed in system SGPR.
2607 unsigned PrivateSegmentWaveByteOffsetReg;
2608
2609 if (IsShader) {
2610 PrivateSegmentWaveByteOffsetReg =
2611 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2612
2613 // This is true if the scratch wave byte offset doesn't have a fixed
2614 // location.
2615 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2616 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2617 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2618 }
2619 } else
2620 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2621
2622 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2624 }
2625
2626 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2627 Info.getNumPreloadedSGPRs() >= 16);
2628}
2629
2631 MachineFunction &MF,
2632 const SIRegisterInfo &TRI,
2633 SIMachineFunctionInfo &Info) {
2634 // Now that we've figured out where the scratch register inputs are, see if we
2635 // should reserve the arguments and use them directly.
2636 MachineFrameInfo &MFI = MF.getFrameInfo();
2637 bool HasStackObjects = MFI.hasStackObjects();
2638 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2639
2640 // Record that we know we have non-spill stack objects so we don't need to
2641 // check all stack objects later.
2642 if (HasStackObjects)
2643 Info.setHasNonSpillStackObjects(true);
2644
2645 // Everything live out of a block is spilled with fast regalloc, so it's
2646 // almost certain that spilling will be required.
2647 if (TM.getOptLevel() == CodeGenOptLevel::None)
2648 HasStackObjects = true;
2649
2650 // For now assume stack access is needed in any callee functions, so we need
2651 // the scratch registers to pass in.
2652 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2653
2654 if (!ST.enableFlatScratch()) {
2655 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2656 // If we have stack objects, we unquestionably need the private buffer
2657 // resource. For the Code Object V2 ABI, this will be the first 4 user
2658 // SGPR inputs. We can reserve those and use them directly.
2659
2660 Register PrivateSegmentBufferReg =
2662 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2663 } else {
2664 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2665 // We tentatively reserve the highest available registers (skipping those
2666 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2667 // we'll replace these with the ones immediately after those which were
2668 // really allocated. In the prologue copies will be inserted from the
2669 // argument to these reserved registers.
2670
2671 // Without HSA, relocations are used for the scratch pointer and the
2672 // buffer resource setup is always inserted in the prologue. Scratch wave
2673 // offset is still in an input SGPR.
2674 Info.setScratchRSrcReg(ReservedBufferReg);
2675 }
2676 }
2677
2679
2680 // For entry functions we have to set up the stack pointer if we use it,
2681 // whereas non-entry functions get this "for free". This means there is no
2682 // intrinsic advantage to using S32 over S34 in cases where we do not have
2683 // calls but do need a frame pointer (i.e. if we are requested to have one
2684 // because frame pointer elimination is disabled). To keep things simple we
2685 // only ever use S32 as the call ABI stack pointer, and so using it does not
2686 // imply we need a separate frame pointer.
2687 //
2688 // Try to use s32 as the SP, but move it if it would interfere with input
2689 // arguments. This won't work with calls though.
2690 //
2691 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2692 // registers.
2693 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2694 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2695 } else {
2697
2698 if (MFI.hasCalls())
2699 report_fatal_error("call in graphics shader with too many input SGPRs");
2700
2701 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2702 if (!MRI.isLiveIn(Reg)) {
2703 Info.setStackPtrOffsetReg(Reg);
2704 break;
2705 }
2706 }
2707
2708 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2709 report_fatal_error("failed to find register for SP");
2710 }
2711
2712 // hasFP should be accurate for entry functions even before the frame is
2713 // finalized, because it does not rely on the known stack size, only
2714 // properties like whether variable sized objects are present.
2715 if (ST.getFrameLowering()->hasFP(MF)) {
2716 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2717 }
2718}
2719
2722 return !Info->isEntryFunction();
2723}
2724
2726
2727}
2728
2730 MachineBasicBlock *Entry,
2731 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2733
2734 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2735 if (!IStart)
2736 return;
2737
2738 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2739 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2740 MachineBasicBlock::iterator MBBI = Entry->begin();
2741 for (const MCPhysReg *I = IStart; *I; ++I) {
2742 const TargetRegisterClass *RC = nullptr;
2743 if (AMDGPU::SReg_64RegClass.contains(*I))
2744 RC = &AMDGPU::SGPR_64RegClass;
2745 else if (AMDGPU::SReg_32RegClass.contains(*I))
2746 RC = &AMDGPU::SGPR_32RegClass;
2747 else
2748 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2749
2750 Register NewVR = MRI->createVirtualRegister(RC);
2751 // Create copy from CSR to a virtual register.
2752 Entry->addLiveIn(*I);
2753 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2754 .addReg(*I);
2755
2756 // Insert the copy-back instructions right before the terminator.
2757 for (auto *Exit : Exits)
2758 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2759 TII->get(TargetOpcode::COPY), *I)
2760 .addReg(NewVR);
2761 }
2762}
2763
2765 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2766 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2767 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2769
2771 const Function &Fn = MF.getFunction();
2774
2775 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2776 DiagnosticInfoUnsupported NoGraphicsHSA(
2777 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2778 DAG.getContext()->diagnose(NoGraphicsHSA);
2779 return DAG.getEntryNode();
2780 }
2781
2784 BitVector Skipped(Ins.size());
2785 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2786 *DAG.getContext());
2787
2788 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2789 bool IsKernel = AMDGPU::isKernel(CallConv);
2790 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2791
2792 if (IsGraphics) {
2793 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2794 assert(!UserSGPRInfo.hasDispatchPtr() &&
2795 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2796 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2797 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2798 (void)UserSGPRInfo;
2799 if (!Subtarget->enableFlatScratch())
2800 assert(!UserSGPRInfo.hasFlatScratchInit());
2801 if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
2802 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2803 !Info->hasWorkGroupIDZ());
2804 }
2805
2806 if (CallConv == CallingConv::AMDGPU_PS) {
2807 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2808
2809 // At least one interpolation mode must be enabled or else the GPU will
2810 // hang.
2811 //
2812 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2813 // set PSInputAddr, the user wants to enable some bits after the compilation
2814 // based on run-time states. Since we can't know what the final PSInputEna
2815 // will look like, we shouldn't do anything here, and the user should take
2816 // responsibility for the correct programming.
2817 //
2818 // Otherwise, the following restrictions apply:
2819 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2820 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2821 // enabled too.
2822 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2823 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2824 CCInfo.AllocateReg(AMDGPU::VGPR0);
2825 CCInfo.AllocateReg(AMDGPU::VGPR1);
2826 Info->markPSInputAllocated(0);
2827 Info->markPSInputEnabled(0);
2828 }
2829 if (Subtarget->isAmdPalOS()) {
2830 // For isAmdPalOS, the user does not enable some bits after compilation
2831 // based on run-time states; the register values being generated here are
2832 // the final ones set in hardware. Therefore we need to apply the
2833 // workaround to PSInputAddr and PSInputEnable together. (The case where
2834 // a bit is set in PSInputAddr but not PSInputEnable is where the
2835 // frontend set up an input arg for a particular interpolation mode, but
2836 // nothing uses that input arg. Really we should have an earlier pass
2837 // that removes such an arg.)
2838 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2839 if ((PsInputBits & 0x7F) == 0 ||
2840 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2841 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2842 }
2843 } else if (IsKernel) {
2844 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2845 } else {
2846 Splits.append(Ins.begin(), Ins.end());
2847 }
2848
2849 if (IsKernel)
2850 analyzeFormalArgumentsCompute(CCInfo, Ins);
2851
2852 if (IsEntryFunc) {
2853 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2854 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2855 if (IsKernel && Subtarget->hasKernargPreload())
2856 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2857
2858 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2859 } else if (!IsGraphics) {
2860 // For the fixed ABI, pass workitem IDs in the last argument register.
2861 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2862
2863 // FIXME: Sink this into allocateSpecialInputSGPRs
2864 if (!Subtarget->enableFlatScratch())
2865 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2866
2867 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2868 }
2869
2870 if (!IsKernel) {
2871 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2872 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2873 }
2874
2876
2877 // FIXME: This is the minimum kernel argument alignment. We should improve
2878 // this to the maximum alignment of the arguments.
2879 //
2880 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2881 // kern arg offset.
2882 const Align KernelArgBaseAlign = Align(16);
2883
2884 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2885 const ISD::InputArg &Arg = Ins[i];
2886 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2887 InVals.push_back(DAG.getUNDEF(Arg.VT));
2888 continue;
2889 }
2890
2891 CCValAssign &VA = ArgLocs[ArgIdx++];
2892 MVT VT = VA.getLocVT();
2893
2894 if (IsEntryFunc && VA.isMemLoc()) {
2895 VT = Ins[i].VT;
2896 EVT MemVT = VA.getLocVT();
2897
2898 const uint64_t Offset = VA.getLocMemOffset();
2899 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
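 // commonAlignment folds the 16-byte kernarg base alignment with the argument
 // offset, e.g. Offset 0 or 16 gives Align(16), Offset 8 gives Align(8) and
 // Offset 4 gives Align(4).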
2900
2901 if (Arg.Flags.isByRef()) {
2902 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2903
2904 const GCNTargetMachine &TM =
2905 static_cast<const GCNTargetMachine &>(getTargetMachine());
2906 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2907 Arg.Flags.getPointerAddrSpace())) {
2910 }
2911
2912 InVals.push_back(Ptr);
2913 continue;
2914 }
2915
2916 SDValue NewArg;
2917 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2918 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2919 // In this case the argument is packed into the previous preload SGPR.
2920 int64_t AlignDownOffset = alignDown(Offset, 4);
2921 int64_t OffsetDiff = Offset - AlignDownOffset;
2922 EVT IntVT = MemVT.changeTypeToInteger();
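 // Recover the value from the 32-bit SGPR it was preloaded into: copy the
 // register, shift right by OffsetDiff * 8, then truncate, mirroring the
 // memory path in lowerKernargMemParameter.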
2923
2927 Register Reg =
2928 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2929
2930 assert(Reg);
2931 Register VReg = MRI.getLiveInVirtReg(Reg);
2932 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2933
2934 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2935 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2936
2937 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2938 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2939 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2940 Ins[i].Flags.isSExt(), &Ins[i]);
2941
2942 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2943 } else {
2947 const SmallVectorImpl<MCRegister> &PreloadRegs =
2948 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2949
2950 SDValue Copy;
2951 if (PreloadRegs.size() == 1) {
2952 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2953 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2954 NewArg = DAG.getCopyFromReg(
2955 Chain, DL, VReg,
2957 TRI->getRegSizeInBits(*RC)));
2958
2959 } else {
2960 // If the kernarg alignment does not match the alignment of the SGPR
2961 // tuple RC that can accommodate this argument, it will be built up
2962 // via copies from the individual SGPRs that the argument was
2963 // preloaded to.
2965 for (auto Reg : PreloadRegs) {
2966 Register VReg = MRI.getLiveInVirtReg(Reg);
2967 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2968 Elts.push_back(Copy);
2969 }
2970 NewArg =
2971 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2972 PreloadRegs.size()),
2973 DL, Elts);
2974 }
2975
2976 SDValue CMemVT;
2977 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2978 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2979 else
2980 CMemVT = DAG.getBitcast(MemVT, NewArg);
2981 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2982 Ins[i].Flags.isSExt(), &Ins[i]);
2983 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2984 }
2985 } else {
2986 NewArg =
2987 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2988 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2989 }
2990 Chains.push_back(NewArg.getValue(1));
2991
2992 auto *ParamTy =
2993 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2995 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2996 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2997 // On SI, local pointers are just offsets into LDS, so they always fit in
2998 // 16 bits. On CI and newer they could potentially be
2999 // real pointers, so we can't guarantee their size.
3000 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3001 DAG.getValueType(MVT::i16));
3002 }
3003
3004 InVals.push_back(NewArg);
3005 continue;
3006 } else if (!IsEntryFunc && VA.isMemLoc()) {
3007 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3008 InVals.push_back(Val);
3009 if (!Arg.Flags.isByVal())
3010 Chains.push_back(Val.getValue(1));
3011 continue;
3012 }
3013
3014 assert(VA.isRegLoc() && "Parameter must be in a register!");
3015
3016 Register Reg = VA.getLocReg();
3017 const TargetRegisterClass *RC = nullptr;
3018 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3019 RC = &AMDGPU::VGPR_32RegClass;
3020 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3021 RC = &AMDGPU::SGPR_32RegClass;
3022 else
3023 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3024 EVT ValVT = VA.getValVT();
3025
3026 Reg = MF.addLiveIn(Reg, RC);
3027 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3028
3029 if (Arg.Flags.isSRet()) {
3030 // The return object should be reasonably addressable.
3031
3032 // FIXME: This helps when the return is a real sret. If it is an
3033 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3034 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3035 unsigned NumBits
3037 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3038 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3039 }
3040
3041 // If this is an 8 or 16-bit value, it is really passed promoted
3042 // to 32 bits. Insert an assert[sz]ext to capture this, then
3043 // truncate to the right size.
3044 switch (VA.getLocInfo()) {
3045 case CCValAssign::Full:
3046 break;
3047 case CCValAssign::BCvt:
3048 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3049 break;
3050 case CCValAssign::SExt:
3051 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3052 DAG.getValueType(ValVT));
3053 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3054 break;
3055 case CCValAssign::ZExt:
3056 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3057 DAG.getValueType(ValVT));
3058 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3059 break;
3060 case CCValAssign::AExt:
3061 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3062 break;
3063 default:
3064 llvm_unreachable("Unknown loc info!");
3065 }
3066
3067 InVals.push_back(Val);
3068 }
3069
3070 // Start adding system SGPRs.
3071 if (IsEntryFunc)
3072 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3073
3074 auto &ArgUsageInfo =
3076 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3077
3078 unsigned StackArgSize = CCInfo.getStackSize();
3079 Info->setBytesInStackArgArea(StackArgSize);
3080
3081 return Chains.empty() ? Chain :
3082 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3083}
3084
3085// TODO: If return values can't fit in registers, we should return as many as
3086// possible in registers before passing on stack.
3088 CallingConv::ID CallConv,
3089 MachineFunction &MF, bool IsVarArg,
3091 LLVMContext &Context) const {
3092 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3093 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3094 // for shaders. Vector types should be explicitly handled by CC.
3095 if (AMDGPU::isEntryFunctionCC(CallConv))
3096 return true;
3097
3099 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3100 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3101 return false;
3102
3103 // We must use the stack if return would require unavailable registers.
3104 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3105 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3106 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3107 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3108 return false;
3109
3110 return true;
3111}
3112
3113SDValue
3115 bool isVarArg,
3117 const SmallVectorImpl<SDValue> &OutVals,
3118 const SDLoc &DL, SelectionDAG &DAG) const {
3121
3122 if (AMDGPU::isKernel(CallConv)) {
3123 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3124 OutVals, DL, DAG);
3125 }
3126
3127 bool IsShader = AMDGPU::isShader(CallConv);
3128
3129 Info->setIfReturnsVoid(Outs.empty());
3130 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3131
3132 // CCValAssign - represent the assignment of the return value to a location.
3135
3136 // CCState - Info about the registers and stack slots.
3137 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3138 *DAG.getContext());
3139
3140 // Analyze outgoing return values.
3141 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3142
3143 SDValue Glue;
3145 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3146
3147 // Copy the result values into the output registers.
3148 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3149 ++I, ++RealRVLocIdx) {
3150 CCValAssign &VA = RVLocs[I];
3151 assert(VA.isRegLoc() && "Can only return in registers!");
3152 // TODO: Partially return in registers if return values don't fit.
3153 SDValue Arg = OutVals[RealRVLocIdx];
3154
3155 // Copied from other backends.
3156 switch (VA.getLocInfo()) {
3157 case CCValAssign::Full:
3158 break;
3159 case CCValAssign::BCvt:
3160 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3161 break;
3162 case CCValAssign::SExt:
3163 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3164 break;
3165 case CCValAssign::ZExt:
3166 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3167 break;
3168 case CCValAssign::AExt:
3169 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3170 break;
3171 default:
3172 llvm_unreachable("Unknown loc info!");
3173 }
3174
3175 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3176 Glue = Chain.getValue(1);
3177 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3178 }
3179
3180 // FIXME: Does sret work properly?
3181 if (!Info->isEntryFunction()) {
3182 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3183 const MCPhysReg *I =
3184 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3185 if (I) {
3186 for (; *I; ++I) {
3187 if (AMDGPU::SReg_64RegClass.contains(*I))
3188 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3189 else if (AMDGPU::SReg_32RegClass.contains(*I))
3190 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3191 else
3192 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3193 }
3194 }
3195 }
3196
3197 // Update chain and glue.
3198 RetOps[0] = Chain;
3199 if (Glue.getNode())
3200 RetOps.push_back(Glue);
3201
3202 unsigned Opc = AMDGPUISD::ENDPGM;
3203 if (!IsWaveEnd)
3205 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3206}
3207
3209 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3210 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3211 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3212 SDValue ThisVal) const {
3213 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3214
3215 // Assign locations to each value returned by this call.
3217 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3218 *DAG.getContext());
3219 CCInfo.AnalyzeCallResult(Ins, RetCC);
3220
3221 // Copy all of the result registers out of their specified physreg.
3222 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3223 CCValAssign VA = RVLocs[i];
3224 SDValue Val;
3225
3226 if (VA.isRegLoc()) {
3227 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3228 Chain = Val.getValue(1);
3229 InGlue = Val.getValue(2);
3230 } else if (VA.isMemLoc()) {
3231 report_fatal_error("TODO: return values in memory");
3232 } else
3233 llvm_unreachable("unknown argument location type");
3234
3235 switch (VA.getLocInfo()) {
3236 case CCValAssign::Full:
3237 break;
3238 case CCValAssign::BCvt:
3239 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3240 break;
3241 case CCValAssign::ZExt:
3242 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3243 DAG.getValueType(VA.getValVT()));
3244 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3245 break;
3246 case CCValAssign::SExt:
3247 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3248 DAG.getValueType(VA.getValVT()));
3249 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3250 break;
3251 case CCValAssign::AExt:
3252 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3253 break;
3254 default:
3255 llvm_unreachable("Unknown loc info!");
3256 }
3257
3258 InVals.push_back(Val);
3259 }
3260
3261 return Chain;
3262}
3263
3264// Add code to pass special inputs required depending on used features separate
3265// from the explicit user arguments present in the IR.
3267 CallLoweringInfo &CLI,
3268 CCState &CCInfo,
3269 const SIMachineFunctionInfo &Info,
3270 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3271 SmallVectorImpl<SDValue> &MemOpChains,
3272 SDValue Chain) const {
3273 // If we don't have a call site, this was a call inserted by
3274 // legalization. These can never use special inputs.
3275 if (!CLI.CB)
3276 return;
3277
3278 SelectionDAG &DAG = CLI.DAG;
3279 const SDLoc &DL = CLI.DL;
3280 const Function &F = DAG.getMachineFunction().getFunction();
3281
3282 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3283 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3284
3285 const AMDGPUFunctionArgInfo *CalleeArgInfo
3287 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3288 auto &ArgUsageInfo =
3290 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3291 }
3292
3293 // TODO: Unify with private memory register handling. This is complicated by
3294 // the fact that at least in kernels, the input argument is not necessarily
3295 // in the same location as the input.
3296 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3298 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3299 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3300 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3301 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3302 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3303 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3304 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3305 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3306 };
3307
3308 for (auto Attr : ImplicitAttrs) {
3309 const ArgDescriptor *OutgoingArg;
3310 const TargetRegisterClass *ArgRC;
3311 LLT ArgTy;
3312
3313 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3314
3315 // If the callee does not use the attribute value, skip copying the value.
3316 if (CLI.CB->hasFnAttr(Attr.second))
3317 continue;
3318
3319 std::tie(OutgoingArg, ArgRC, ArgTy) =
3320 CalleeArgInfo->getPreloadedValue(InputID);
3321 if (!OutgoingArg)
3322 continue;
3323
3324 const ArgDescriptor *IncomingArg;
3325 const TargetRegisterClass *IncomingArgRC;
3326 LLT Ty;
3327 std::tie(IncomingArg, IncomingArgRC, Ty) =
3328 CallerArgInfo.getPreloadedValue(InputID);
3329 assert(IncomingArgRC == ArgRC);
3330
3331 // All special arguments are ints for now.
3332 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
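 // 64-bit inputs are the pointer-like values (dispatch ptr, queue ptr,
 // implicit arg ptr) and the dispatch ID; the workgroup IDs and LDS kernel ID
 // are 32-bit.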
3333 SDValue InputReg;
3334
3335 if (IncomingArg) {
3336 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3337 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3338 // The implicit arg ptr is special because it doesn't have a corresponding
3339 // input for kernels, and is computed from the kernarg segment pointer.
3340 InputReg = getImplicitArgPtr(DAG, DL);
3341 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3342 std::optional<uint32_t> Id =
3344 if (Id.has_value()) {
3345 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3346 } else {
3347 InputReg = DAG.getUNDEF(ArgVT);
3348 }
3349 } else {
3350 // We may have proven the input wasn't needed, although the ABI still
3351 // requires it. We just need to allocate the register appropriately.
3352 InputReg = DAG.getUNDEF(ArgVT);
3353 }
3354
3355 if (OutgoingArg->isRegister()) {
3356 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3357 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3358 report_fatal_error("failed to allocate implicit input argument");
3359 } else {
3360 unsigned SpecialArgOffset =
3361 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3362 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3363 SpecialArgOffset);
3364 MemOpChains.push_back(ArgStore);
3365 }
3366 }
3367
3368 // Pack workitem IDs into a single register or pass it as is if already
3369 // packed.
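 // The packed layout matches the incoming convention: X occupies bits [9:0],
 // Y bits [19:10] and Z bits [29:20], so Y and Z are shifted left by 10 and 20
 // below before being ORed into the X value.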
3370 const ArgDescriptor *OutgoingArg;
3371 const TargetRegisterClass *ArgRC;
3372 LLT Ty;
3373
3374 std::tie(OutgoingArg, ArgRC, Ty) =
3376 if (!OutgoingArg)
3377 std::tie(OutgoingArg, ArgRC, Ty) =
3379 if (!OutgoingArg)
3380 std::tie(OutgoingArg, ArgRC, Ty) =
3382 if (!OutgoingArg)
3383 return;
3384
3385 const ArgDescriptor *IncomingArgX = std::get<0>(
3387 const ArgDescriptor *IncomingArgY = std::get<0>(
3389 const ArgDescriptor *IncomingArgZ = std::get<0>(
3391
3392 SDValue InputReg;
3393 SDLoc SL;
3394
3395 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3396 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3397 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3398
3399 // If incoming ids are not packed we need to pack them.
3400 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3401 NeedWorkItemIDX) {
3402 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3403 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3404 } else {
3405 InputReg = DAG.getConstant(0, DL, MVT::i32);
3406 }
3407 }
3408
3409 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3410 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3411 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3412 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3413 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3414 InputReg = InputReg.getNode() ?
3415 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3416 }
3417
3418 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3419 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3420 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3421 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3422 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3423 InputReg = InputReg.getNode() ?
3424 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3425 }
3426
3427 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3428 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3429 // We're in a situation where the outgoing function requires the workitem
3430 // ID, but the calling function does not have it (e.g. a graphics function
3431 // calling a C calling convention function). This is illegal, but we need
3432 // to produce something.
3433 InputReg = DAG.getUNDEF(MVT::i32);
3434 } else {
3435 // Workitem IDs are already packed; any of the present incoming arguments
3436 // will carry all required fields.
3438 IncomingArgX ? *IncomingArgX :
3439 IncomingArgY ? *IncomingArgY :
3440 *IncomingArgZ, ~0u);
3441 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3442 }
3443 }
3444
3445 if (OutgoingArg->isRegister()) {
3446 if (InputReg)
3447 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3448
3449 CCInfo.AllocateReg(OutgoingArg->getRegister());
3450 } else {
3451 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3452 if (InputReg) {
3453 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3454 SpecialArgOffset);
3455 MemOpChains.push_back(ArgStore);
3456 }
3457 }
3458}
3459
3461 return CC == CallingConv::Fast;
3462}
3463
3464/// Return true if we might ever do TCO for calls with this calling convention.
3466 switch (CC) {
3467 case CallingConv::C:
3469 return true;
3470 default:
3471 return canGuaranteeTCO(CC);
3472 }
3473}
3474
3476 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3478 const SmallVectorImpl<SDValue> &OutVals,
3479 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3480 if (AMDGPU::isChainCC(CalleeCC))
3481 return true;
3482
3483 if (!mayTailCallThisCC(CalleeCC))
3484 return false;
3485
3486 // For a divergent call target, we need to do a waterfall loop over the
3487 // possible callees which precludes us from using a simple jump.
3488 if (Callee->isDivergent())
3489 return false;
3490
3492 const Function &CallerF = MF.getFunction();
3493 CallingConv::ID CallerCC = CallerF.getCallingConv();
3495 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3496
3497 // Kernels aren't callable, and don't have a live in return address so it
3498 // doesn't make sense to do a tail call with entry functions.
3499 if (!CallerPreserved)
3500 return false;
3501
3502 bool CCMatch = CallerCC == CalleeCC;
3503
3505 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3506 return true;
3507 return false;
3508 }
3509
3510 // TODO: Can we handle var args?
3511 if (IsVarArg)
3512 return false;
3513
3514 for (const Argument &Arg : CallerF.args()) {
3515 if (Arg.hasByValAttr())
3516 return false;
3517 }
3518
3519 LLVMContext &Ctx = *DAG.getContext();
3520
3521 // Check that the call results are passed in the same way.
3522 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3523 CCAssignFnForCall(CalleeCC, IsVarArg),
3524 CCAssignFnForCall(CallerCC, IsVarArg)))
3525 return false;
3526
3527 // The callee has to preserve all registers the caller needs to preserve.
3528 if (!CCMatch) {
3529 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3530 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3531 return false;
3532 }
3533
3534 // Nothing more to check if the callee is taking no arguments.
3535 if (Outs.empty())
3536 return true;
3537
3539 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3540
3541 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3542
3543 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3544 // If the stack arguments for this call do not fit into our own save area then
3545 // the call cannot be made tail.
3546 // TODO: Is this really necessary?
3547 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3548 return false;
3549
3550 const MachineRegisterInfo &MRI = MF.getRegInfo();
3551 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3552}
3553
3555 if (!CI->isTailCall())
3556 return false;
3557
3558 const Function *ParentFn = CI->getParent()->getParent();
3560 return false;
3561 return true;
3562}
3563
3564// The wave scratch offset register is used as the global base pointer.
3566 SmallVectorImpl<SDValue> &InVals) const {
3567 CallingConv::ID CallConv = CLI.CallConv;
3568 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3569
3570 SelectionDAG &DAG = CLI.DAG;
3571
3572 TargetLowering::ArgListEntry RequestedExec;
3573 if (IsChainCallConv) {
3574 // The last argument should be the value that we need to put in EXEC.
3575 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3576 // don't treat it like the rest of the arguments.
3577 RequestedExec = CLI.Args.back();
3578 assert(RequestedExec.Node && "No node for EXEC");
3579
3580 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3581 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3582
3583 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3584 CLI.Outs.pop_back();
3585 CLI.OutVals.pop_back();
3586
3587 if (RequestedExec.Ty->isIntegerTy(64)) {
3588 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3589 CLI.Outs.pop_back();
3590 CLI.OutVals.pop_back();
3591 }
3592
3593 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3594 "Haven't popped all the pieces of the EXEC mask");
3595 }
3596
3597 const SDLoc &DL = CLI.DL;
3599 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3601 SDValue Chain = CLI.Chain;
3602 SDValue Callee = CLI.Callee;
3603 bool &IsTailCall = CLI.IsTailCall;
3604 bool IsVarArg = CLI.IsVarArg;
3605 bool IsSibCall = false;
3607
3608 if (Callee.isUndef() || isNullConstant(Callee)) {
3609 if (!CLI.IsTailCall) {
3610 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3611 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3612 }
3613
3614 return Chain;
3615 }
3616
3617 if (IsVarArg) {
3618 return lowerUnhandledCall(CLI, InVals,
3619 "unsupported call to variadic function ");
3620 }
3621
3622 if (!CLI.CB)
3623 report_fatal_error("unsupported libcall legalization");
3624
3625 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3626 return lowerUnhandledCall(CLI, InVals,
3627 "unsupported required tail call to function ");
3628 }
3629
3630 if (IsTailCall) {
3632 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3633 if (!IsTailCall &&
3634 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3635 report_fatal_error("failed to perform tail call elimination on a call "
3636 "site marked musttail or on llvm.amdgcn.cs.chain");
3637 }
3638
3639 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3640
3641 // A sibling call is one where we're under the usual C ABI and not planning
3642 // to change that but can still do a tail call:
3643 if (!TailCallOpt && IsTailCall)
3644 IsSibCall = true;
3645
3646 if (IsTailCall)
3647 ++NumTailCalls;
3648 }
3649
3652 SmallVector<SDValue, 8> MemOpChains;
3653
3654 // Analyze operands of the call, assigning locations to each operand.
3656 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3657 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3658
3659 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3660 // With a fixed ABI, allocate fixed registers before user arguments.
3661 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3662 }
3663
3664 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3665
3666 // Get a count of how many bytes are to be pushed on the stack.
3667 unsigned NumBytes = CCInfo.getStackSize();
3668
3669 if (IsSibCall) {
3670 // Since we're not changing the ABI to make this a tail call, the memory
3671 // operands are already available in the caller's incoming argument space.
3672 NumBytes = 0;
3673 }
3674
3675 // FPDiff is the byte offset of the call's argument area from the callee's.
3676 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3677 // by this amount for a tail call. In a sibling call it must be 0 because the
3678 // caller will deallocate the entire stack and the callee still expects its
3679 // arguments to begin at SP+0. Completely unused for non-tail calls.
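// Illustrative note (an observation about the code below, not from the
// original comment): FPDiff stays 0 here because required tail calls under
// GuaranteedTailCallOpt are rejected above, so any tail call that reaches
// this point is treated as a sibling call; the value is still attached to
// the TC_RETURN node further down so emitEpilogue can consume it.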
3680 int32_t FPDiff = 0;
3681 MachineFrameInfo &MFI = MF.getFrameInfo();
3682
3683 // Adjust the stack pointer for the new arguments...
3684 // These operations are automatically eliminated by the prolog/epilog pass
3685 if (!IsSibCall)
3686 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3687
3688 if (!IsSibCall || IsChainCallConv) {
3689 if (!Subtarget->enableFlatScratch()) {
3690 SmallVector<SDValue, 4> CopyFromChains;
3691
3692 // In the HSA case, this should be an identity copy.
3693 SDValue ScratchRSrcReg
3694 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3695 RegsToPass.emplace_back(IsChainCallConv
3696 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3697 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3698 ScratchRSrcReg);
3699 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3700 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3701 }
3702 }
3703
3704 MVT PtrVT = MVT::i32;
3705
3706 // Walk the register/memloc assignments, inserting copies/loads.
3707 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3708 CCValAssign &VA = ArgLocs[i];
3709 SDValue Arg = OutVals[i];
3710
3711 // Promote the value if needed.
3712 switch (VA.getLocInfo()) {
3713 case CCValAssign::Full:
3714 break;
3715 case CCValAssign::BCvt:
3716 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3717 break;
3718 case CCValAssign::ZExt:
3719 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3720 break;
3721 case CCValAssign::SExt:
3722 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3723 break;
3724 case CCValAssign::AExt:
3725 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3726 break;
3727 case CCValAssign::FPExt:
3728 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3729 break;
3730 default:
3731 llvm_unreachable("Unknown loc info!");
3732 }
3733
3734 if (VA.isRegLoc()) {
3735 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3736 } else {
3737 assert(VA.isMemLoc());
3738
3739 SDValue DstAddr;
3740 MachinePointerInfo DstInfo;
3741
3742 unsigned LocMemOffset = VA.getLocMemOffset();
3743 int32_t Offset = LocMemOffset;
3744
3745 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3746 MaybeAlign Alignment;
3747
3748 if (IsTailCall) {
3749 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3750 unsigned OpSize = Flags.isByVal() ?
3751 Flags.getByValSize() : VA.getValVT().getStoreSize();
3752
 3753 // FIXME: We can do better than the minimum required byval alignment.
3754 Alignment =
3755 Flags.isByVal()
3756 ? Flags.getNonZeroByValAlign()
3757 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3758
3759 Offset = Offset + FPDiff;
3760 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3761
3762 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3763 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3764
3765 // Make sure any stack arguments overlapping with where we're storing
3766 // are loaded before this eventual operation. Otherwise they'll be
3767 // clobbered.
3768
3769 // FIXME: Why is this really necessary? This seems to just result in a
 3770 // lot of code to copy the stack arguments and write them back to the same
3771 // locations, which are supposed to be immutable?
3772 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3773 } else {
3774 // Stores to the argument stack area are relative to the stack pointer.
3775 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3776 MVT::i32);
3777 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3778 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3779 Alignment =
3780 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3781 }
3782
3783 if (Outs[i].Flags.isByVal()) {
3784 SDValue SizeNode =
3785 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3786 SDValue Cpy =
3787 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3788 Outs[i].Flags.getNonZeroByValAlign(),
3789 /*isVol = */ false, /*AlwaysInline = */ true,
3790 /*isTailCall = */ false, DstInfo,
3792
3793 MemOpChains.push_back(Cpy);
3794 } else {
3795 SDValue Store =
3796 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3797 MemOpChains.push_back(Store);
3798 }
3799 }
3800 }
3801
3802 if (!MemOpChains.empty())
3803 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3804
3805 // Build a sequence of copy-to-reg nodes chained together with token chain
3806 // and flag operands which copy the outgoing args into the appropriate regs.
3807 SDValue InGlue;
3808 for (auto &RegToPass : RegsToPass) {
3809 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3810 RegToPass.second, InGlue);
3811 InGlue = Chain.getValue(1);
3812 }
3813
3814
3815 // We don't usually want to end the call-sequence here because we would tidy
 3816 // the frame up *after* the call. However, in the ABI-changing tail-call case
 3817 // we've carefully laid out the parameters so that when SP is reset they'll be
3818 // in the correct location.
3819 if (IsTailCall && !IsSibCall) {
3820 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3821 InGlue = Chain.getValue(1);
3822 }
3823
3824 std::vector<SDValue> Ops;
3825 Ops.push_back(Chain);
3826 Ops.push_back(Callee);
3827 // Add a redundant copy of the callee global which will not be legalized, as
3828 // we need direct access to the callee later.
3829 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3830 const GlobalValue *GV = GSD->getGlobal();
3831 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3832 } else {
3833 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3834 }
3835
3836 if (IsTailCall) {
3837 // Each tail call may have to adjust the stack by a different amount, so
3838 // this information must travel along with the operation for eventual
3839 // consumption by emitEpilogue.
3840 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3841 }
3842
3843 if (IsChainCallConv)
3844 Ops.push_back(RequestedExec.Node);
3845
3846 // Add argument registers to the end of the list so that they are known live
3847 // into the call.
3848 for (auto &RegToPass : RegsToPass) {
3849 Ops.push_back(DAG.getRegister(RegToPass.first,
3850 RegToPass.second.getValueType()));
3851 }
3852
3853 // Add a register mask operand representing the call-preserved registers.
3854 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3855 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3856 assert(Mask && "Missing call preserved mask for calling convention");
3857 Ops.push_back(DAG.getRegisterMask(Mask));
3858
3859 if (InGlue.getNode())
3860 Ops.push_back(InGlue);
3861
3862 // NOTE: This potentially results in *two* glue operands, and the wrong one
3863 // might possibly show up where the other was intended. In particular,
3864 // Emitter::EmitMachineNode() expects only the glued convergence token if it
3865 // exists. Similarly, the selection of the call expects to match only the
3866 // InGlue operand if it exists.
3867 if (SDValue Token = CLI.ConvergenceControlToken) {
3868 Ops.push_back(SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE,
3869 DL, MVT::Glue, Token),
3870 0));
3871 }
3872
3873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3874
 3875 // If we're doing a tail call, use a TC_RETURN here rather than an
3876 // actual call instruction.
3877 if (IsTailCall) {
3878 MFI.setHasTailCall();
3879 unsigned OPC = AMDGPUISD::TC_RETURN;
3880 switch (CallConv) {
3883 break;
3887 break;
3888 }
3889
3890 return DAG.getNode(OPC, DL, NodeTys, Ops);
3891 }
3892
3893 // Returns a chain and a flag for retval copy to use.
3894 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3895 Chain = Call.getValue(0);
3896 InGlue = Call.getValue(1);
3897
3898 uint64_t CalleePopBytes = NumBytes;
3899 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3900 if (!Ins.empty())
3901 InGlue = Chain.getValue(1);
3902
3903 // Handle result values, copying them out of physregs into vregs that we
3904 // return.
3905 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3906 InVals, /*IsThisReturn=*/false, SDValue());
3907}
3908
3909// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3910// except for applying the wave size scale to the increment amount.
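// Worked example (illustrative, not from the original source): the increment
// below is shifted left by the wavefront size log2, so on a wave64 subtarget
// (WavefrontSizeLog2 == 6) a per-lane allocation of 16 bytes grows the scratch
// stack pointer by 16 << 6 = 1024 bytes, one copy of the object per lane.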
3912 SDValue Op, SelectionDAG &DAG) const {
3913 const MachineFunction &MF = DAG.getMachineFunction();
3915
3916 SDLoc dl(Op);
3917 EVT VT = Op.getValueType();
3918 SDValue Tmp1 = Op;
3919 SDValue Tmp2 = Op.getValue(1);
3920 SDValue Tmp3 = Op.getOperand(2);
3921 SDValue Chain = Tmp1.getOperand(0);
3922
3923 Register SPReg = Info->getStackPtrOffsetReg();
3924
3925 // Chain the dynamic stack allocation so that it doesn't modify the stack
3926 // pointer when other instructions are using the stack.
3927 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3928
3929 SDValue Size = Tmp2.getOperand(1);
3930 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3931 Chain = SP.getValue(1);
3932 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3933 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3934 unsigned Opc =
3937
3938 SDValue ScaledSize = DAG.getNode(
3939 ISD::SHL, dl, VT, Size,
3940 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3941
3942 Align StackAlign = TFL->getStackAlign();
3943 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3944 if (Alignment && *Alignment > StackAlign) {
3945 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3946 DAG.getConstant(-(uint64_t)Alignment->value()
3947 << Subtarget->getWavefrontSizeLog2(),
3948 dl, VT));
3949 }
3950
3951 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3952 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3953
3954 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3955}
3956
3958 SelectionDAG &DAG) const {
3959 // We only handle constant sizes here to allow non-entry block, static sized
3960 // allocas. A truly dynamic value is more difficult to support because we
3961 // don't know if the size value is uniform or not. If the size isn't uniform,
3962 // we would need to do a wave reduction to get the maximum size to know how
3963 // much to increment the uniform stack pointer.
3964 SDValue Size = Op.getOperand(1);
3965 if (isa<ConstantSDNode>(Size))
3966 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3967
3969}
3970
3972 if (Op.getValueType() != MVT::i32)
3973 return Op; // Defer to cannot select error.
3974
3976 SDLoc SL(Op);
3977
3978 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3979
3980 // Convert from wave uniform to swizzled vector address. This should protect
3981 // from any edge cases where the stacksave result isn't directly used with
3982 // stackrestore.
3983 SDValue VectorAddress =
3984 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3985 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3986}
3987
3989 SelectionDAG &DAG) const {
3990 SDLoc SL(Op);
3991 assert(Op.getValueType() == MVT::i32);
3992
3993 uint32_t BothRoundHwReg =
3995 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3996
3997 SDValue IntrinID =
3998 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
3999 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4000 Op.getOperand(0), IntrinID, GetRoundBothImm);
4001
4002 // There are two rounding modes, one for f32 and one for f64/f16. We only
4003 // report in the standard value range if both are the same.
4004 //
4005 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4006 // ties away from zero is not supported, and the other values are rotated by
4007 // 1.
4008 //
4009 // If the two rounding modes are not the same, report a target defined value.
4010
4011 // Mode register rounding mode fields:
4012 //
4013 // [1:0] Single-precision round mode.
4014 // [3:2] Double/Half-precision round mode.
4015 //
 4016 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4017 //
 4018 //                Hardware   Spec
 4019 // Toward-0           3        0
 4020 // Nearest Even       0        1
 4021 // +Inf               1        2
 4022 // -Inf               2        3
 4023 // NearestAway0      N/A       4
4024 //
4025 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4026 // table we can index by the raw hardware mode.
4027 //
4028 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
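//
// Worked example (illustrative): if MODE.fp_round is 0 (both fields round to
// nearest even), the shift amount below is 0 and the low nibble of the table
// is expected to hold 1, the standard "to nearest" value from the Spec column
// above; any entry >= 4 is treated as an extended value further down.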
4029
4030 SDValue BitTable =
4032
4033 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4034 SDValue RoundModeTimesNumBits =
4035 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4036
4037 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4038 // knew only one mode was demanded.
4039 SDValue TableValue =
4040 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4041 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4042
4043 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4044 SDValue TableEntry =
4045 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4046
 4047 // There's a gap between the 4-bit encoded table values and the actual enum
 4048 // values, so add an offset if it's an extended value.
4049 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4050 SDValue IsStandardValue =
4051 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4052 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4053 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4054 TableEntry, EnumOffset);
4055
4056 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4057}
4058
4060 if (Op->isDivergent())
4061 return SDValue();
4062
4063 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4068 break;
4069 default:
4070 return SDValue();
4071 }
4072
4073 return Op;
4074}
4075
4076// Work around DAG legality rules only based on the result type.
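// Sketch of the expansion performed below (illustrative): an (f32 fp_extend
// (bf16 x)) becomes (bf16_to_fp (bitcast x to i16)), i.e. the bf16 source is
// reinterpreted as its integer bits before the custom BF16_TO_FP node.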
4078 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4079 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4080 EVT SrcVT = Src.getValueType();
4081
4082 if (SrcVT.getScalarType() != MVT::bf16)
4083 return Op;
4084
4085 SDLoc SL(Op);
4086 SDValue BitCast =
4087 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4088
4089 EVT DstVT = Op.getValueType();
4090 if (IsStrict)
4091 llvm_unreachable("Need STRICT_BF16_TO_FP");
4092
4093 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4094}
4095
4097 SDLoc SL(Op);
4098 if (Op.getValueType() != MVT::i64)
4099 return Op;
4100
4101 uint32_t ModeHwReg =
4103 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4104 uint32_t TrapHwReg =
4106 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4107
4108 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4109 SDValue IntrinID =
4110 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4111 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4112 Op.getOperand(0), IntrinID, ModeHwRegImm);
4113 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4114 Op.getOperand(0), IntrinID, TrapHwRegImm);
4115 SDValue TokenReg =
4116 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4117 GetTrapReg.getValue(1));
4118
4119 SDValue CvtPtr =
4120 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4121 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4122
4123 return DAG.getMergeValues({Result, TokenReg}, SL);
4124}
4125
4127 SDLoc SL(Op);
4128 if (Op.getOperand(1).getValueType() != MVT::i64)
4129 return Op;
4130
4131 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4132 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4133 DAG.getConstant(0, SL, MVT::i32));
4134 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4135 DAG.getConstant(1, SL, MVT::i32));
4136
4137 SDValue ReadFirstLaneID =
4138 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4139 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4140 ReadFirstLaneID, NewModeReg);
4141 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4142 ReadFirstLaneID, NewTrapReg);
4143
4144 unsigned ModeHwReg =
4146 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4147 unsigned TrapHwReg =
4149 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4150
4151 SDValue IntrinID =
4152 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4153 SDValue SetModeReg =
4154 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4155 IntrinID, ModeHwRegImm, NewModeReg);
4156 SDValue SetTrapReg =
4157 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4158 IntrinID, TrapHwRegImm, NewTrapReg);
4159 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4160}
4161
4163 const MachineFunction &MF) const {
4165 .Case("m0", AMDGPU::M0)
4166 .Case("exec", AMDGPU::EXEC)
4167 .Case("exec_lo", AMDGPU::EXEC_LO)
4168 .Case("exec_hi", AMDGPU::EXEC_HI)
4169 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4170 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4171 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4172 .Default(Register());
4173
4174 if (Reg == AMDGPU::NoRegister) {
4175 report_fatal_error(Twine("invalid register name \""
4176 + StringRef(RegName) + "\"."));
4177
4178 }
4179
4180 if (!Subtarget->hasFlatScrRegister() &&
4181 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4182 report_fatal_error(Twine("invalid register \""
4183 + StringRef(RegName) + "\" for subtarget."));
4184 }
4185
4186 switch (Reg) {
4187 case AMDGPU::M0:
4188 case AMDGPU::EXEC_LO:
4189 case AMDGPU::EXEC_HI:
4190 case AMDGPU::FLAT_SCR_LO:
4191 case AMDGPU::FLAT_SCR_HI:
4192 if (VT.getSizeInBits() == 32)
4193 return Reg;
4194 break;
4195 case AMDGPU::EXEC:
4196 case AMDGPU::FLAT_SCR:
4197 if (VT.getSizeInBits() == 64)
4198 return Reg;
4199 break;
4200 default:
4201 llvm_unreachable("missing register type checking");
4202 }
4203
4204 report_fatal_error(Twine("invalid type for register \""
4205 + StringRef(RegName) + "\"."));
4206}
4207
4208// If kill is not the last instruction, split the block so kill is always a
4209// proper terminator.
4212 MachineBasicBlock *BB) const {
4213 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4215 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4216 return SplitBB;
4217}
4218
 4219 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4220// \p MI will be the only instruction in the loop body block. Otherwise, it will
4221// be the first instruction in the remainder block.
4222//
4223/// \returns { LoopBody, Remainder }
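//
// Resulting CFG (illustrative): MBB branches to LoopBody, LoopBody has itself
// (the back edge) and Remainder as successors, and Remainder takes over MBB's
// original successors.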
4224static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4228
4229 // To insert the loop we need to split the block. Move everything after this
4230 // point to a new block, and insert a new empty block between the two.
4232 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4234 ++MBBI;
4235
4236 MF->insert(MBBI, LoopBB);
4237 MF->insert(MBBI, RemainderBB);
4238
4239 LoopBB->addSuccessor(LoopBB);
4240 LoopBB->addSuccessor(RemainderBB);
4241
4242 // Move the rest of the block into a new block.
4243 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4244
4245 if (InstInLoop) {
4246 auto Next = std::next(I);
4247
4248 // Move instruction to loop body.
4249 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4250
4251 // Move the rest of the block.
4252 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4253 } else {
4254 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4255 }
4256
4257 MBB.addSuccessor(LoopBB);
4258
4259 return std::pair(LoopBB, RemainderBB);
4260}
4261
4262/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4264 MachineBasicBlock *MBB = MI.getParent();
4266 auto I = MI.getIterator();
4267 auto E = std::next(I);
4268
4269 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4270 .addImm(0);
4271
4272 MIBundleBuilder Bundler(*MBB, I, E);
4273 finalizeBundle(*MBB, Bundler.begin());
4274}
4275
4278 MachineBasicBlock *BB) const {
4279 const DebugLoc &DL = MI.getDebugLoc();
4280
4282
4283 MachineBasicBlock *LoopBB;
4284 MachineBasicBlock *RemainderBB;
4286
4287 // Apparently kill flags are only valid if the def is in the same block?
4288 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4289 Src->setIsKill(false);
4290
4291 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4292
4293 MachineBasicBlock::iterator I = LoopBB->end();
4294
4295 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4297
4298 // Clear TRAP_STS.MEM_VIOL
4299 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4300 .addImm(0)
4301 .addImm(EncodedReg);
4302
4304
4305 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4306
4307 // Load and check TRAP_STS.MEM_VIOL
4308 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4309 .addImm(EncodedReg);
4310
4311 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4312 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4313 .addReg(Reg, RegState::Kill)
4314 .addImm(0);
4315 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4316 .addMBB(LoopBB);
4317
4318 return RemainderBB;
4319}
4320
4321// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4322// wavefront. If the value is uniform and just happens to be in a VGPR, this
4323// will only do one iteration. In the worst case, this will loop 64 times.
4324//
4325// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
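//
// Loop body emitted below, in outline (illustrative): v_readfirstlane picks
// one lane's index, v_cmp_eq plus s_and_saveexec restricts EXEC to the lanes
// sharing that index, the index is moved into M0 (or an SGPR in GPR-index
// mode), and s_xor_*_term clears those lanes from EXEC before
// s_cbranch_execnz loops back while any lanes remain.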
4328 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4329 const DebugLoc &DL, const MachineOperand &Idx,
4330 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4331 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4332 Register &SGPRIdxReg) {
4333
4334 MachineFunction *MF = OrigBB.getParent();
4335 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4336 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4338
4339 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4340 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4341 Register NewExec = MRI.createVirtualRegister(BoolRC);
4342 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4343 Register CondReg = MRI.createVirtualRegister(BoolRC);
4344
4345 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4346 .addReg(InitReg)
4347 .addMBB(&OrigBB)
4348 .addReg(ResultReg)
4349 .addMBB(&LoopBB);
4350
4351 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4352 .addReg(InitSaveExecReg)
4353 .addMBB(&OrigBB)
4354 .addReg(NewExec)
4355 .addMBB(&LoopBB);
4356
4357 // Read the next variant <- also loop target.
4358 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4359 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4360
4361 // Compare the just read M0 value to all possible Idx values.
4362 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4363 .addReg(CurrentIdxReg)
4364 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4365
4366 // Update EXEC, save the original EXEC value to VCC.
4367 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4368 : AMDGPU::S_AND_SAVEEXEC_B64),
4369 NewExec)
4370 .addReg(CondReg, RegState::Kill);
4371
4372 MRI.setSimpleHint(NewExec, CondReg);
4373
4374 if (UseGPRIdxMode) {
4375 if (Offset == 0) {
4376 SGPRIdxReg = CurrentIdxReg;
4377 } else {
4378 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4379 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4380 .addReg(CurrentIdxReg, RegState::Kill)
4381 .addImm(Offset);
4382 }
4383 } else {
4384 // Move index from VCC into M0
4385 if (Offset == 0) {
4386 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4387 .addReg(CurrentIdxReg, RegState::Kill);
4388 } else {
4389 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4390 .addReg(CurrentIdxReg, RegState::Kill)
4391 .addImm(Offset);
4392 }
4393 }
4394
4395 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4396 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4397 MachineInstr *InsertPt =
4398 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4399 : AMDGPU::S_XOR_B64_term), Exec)
4400 .addReg(Exec)
4401 .addReg(NewExec);
4402
4403 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4404 // s_cbranch_scc0?
4405
4406 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4407 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4408 .addMBB(&LoopBB);
4409
4410 return InsertPt->getIterator();
4411}
4412
4413// This has slightly sub-optimal regalloc when the source vector is killed by
4414// the read. The register allocator does not understand that the kill is
 4415 // per-workitem, so the vector is kept alive for the whole loop and we end up
 4416 // not reusing a subregister from it, using 1 more VGPR than necessary. That
 4417 // VGPR was saved back when this was expanded after register allocation.
4420 unsigned InitResultReg, unsigned PhiReg, int Offset,
4421 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4423 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4424 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4426 const DebugLoc &DL = MI.getDebugLoc();
4428
4429 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4430 Register DstReg = MI.getOperand(0).getReg();
4431 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4432 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4433 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4434 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4435
4436 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4437
4438 // Save the EXEC mask
4439 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4440 .addReg(Exec);
4441
4442 MachineBasicBlock *LoopBB;
4443 MachineBasicBlock *RemainderBB;
4444 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4445
4446 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4447
4448 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4449 InitResultReg, DstReg, PhiReg, TmpExec,
4450 Offset, UseGPRIdxMode, SGPRIdxReg);
4451
4452 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4454 ++MBBI;
4455 MF->insert(MBBI, LandingPad);
4456 LoopBB->removeSuccessor(RemainderBB);
4457 LandingPad->addSuccessor(RemainderBB);
4458 LoopBB->addSuccessor(LandingPad);
4459 MachineBasicBlock::iterator First = LandingPad->begin();
4460 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4461 .addReg(SaveExec);
4462
4463 return InsPt;
4464}
4465
4466// Returns subreg index, offset
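// Worked example (illustrative): for a 128-bit super-register (4 x 32-bit
// elements) a constant Offset of 2 folds into {sub2, 0}, while an
// out-of-bounds Offset of 5 is left as {sub0, 5} for the dynamic path.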
4467static std::pair<unsigned, int>
4469 const TargetRegisterClass *SuperRC,
4470 unsigned VecReg,
4471 int Offset) {
4472 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4473
4474 // Skip out of bounds offsets, or else we would end up using an undefined
4475 // register.
4476 if (Offset >= NumElts || Offset < 0)
4477 return std::pair(AMDGPU::sub0, Offset);
4478
4479 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4480}
4481
4484 int Offset) {
4485 MachineBasicBlock *MBB = MI.getParent();
4486 const DebugLoc &DL = MI.getDebugLoc();
4488
4489 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4490
4491 assert(Idx->getReg() != AMDGPU::NoRegister);
4492
4493 if (Offset == 0) {
4494 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4495 } else {
4496 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4497 .add(*Idx)
4498 .addImm(Offset);
4499 }
4500}
4501
4504 int Offset) {
4505 MachineBasicBlock *MBB = MI.getParent();
4506 const DebugLoc &DL = MI.getDebugLoc();
4508
4509 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4510
4511 if (Offset == 0)
4512 return Idx->getReg();
4513
4514 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4515 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4516 .add(*Idx)
4517 .addImm(Offset);
4518 return Tmp;
4519}
4520
4523 const GCNSubtarget &ST) {
4524 const SIInstrInfo *TII = ST.getInstrInfo();
4525 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4528
4529 Register Dst = MI.getOperand(0).getReg();
4530 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4531 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4532 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4533
4534 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4535 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4536
4537 unsigned SubReg;
4538 std::tie(SubReg, Offset)
4539 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4540
4541 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4542
4543 // Check for a SGPR index.
4544 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4546 const DebugLoc &DL = MI.getDebugLoc();
4547
4548 if (UseGPRIdxMode) {
4549 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4550 // to avoid interfering with other uses, so probably requires a new
4551 // optimization pass.
4553
4554 const MCInstrDesc &GPRIDXDesc =
4555 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4556 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4557 .addReg(SrcReg)
4558 .addReg(Idx)
4559 .addImm(SubReg);
4560 } else {
4562
4563 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4564 .addReg(SrcReg, 0, SubReg)
4565 .addReg(SrcReg, RegState::Implicit);
4566 }
4567
4568 MI.eraseFromParent();
4569
4570 return &MBB;
4571 }
4572
4573 // Control flow needs to be inserted if indexing with a VGPR.
4574 const DebugLoc &DL = MI.getDebugLoc();
4576
4577 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4578 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4579
4580 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4581
4582 Register SGPRIdxReg;
4583 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4584 UseGPRIdxMode, SGPRIdxReg);
4585
4586 MachineBasicBlock *LoopBB = InsPt->getParent();
4587
4588 if (UseGPRIdxMode) {
4589 const MCInstrDesc &GPRIDXDesc =
4590 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4591
4592 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4593 .addReg(SrcReg)
4594 .addReg(SGPRIdxReg)
4595 .addImm(SubReg);
4596 } else {
4597 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4598 .addReg(SrcReg, 0, SubReg)
4599 .addReg(SrcReg, RegState::Implicit);
4600 }
4601
4602 MI.eraseFromParent();
4603
4604 return LoopBB;
4605}
4606
4609 const GCNSubtarget &ST) {
4610 const SIInstrInfo *TII = ST.getInstrInfo();
4611 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4614
4615 Register Dst = MI.getOperand(0).getReg();
4616 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4617 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4618 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4619 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4620 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4621 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4622
4623 // This can be an immediate, but will be folded later.
4624 assert(Val->getReg());
4625
4626 unsigned SubReg;
4627 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4628 SrcVec->getReg(),
4629 Offset);
4630 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4631
4632 if (Idx->getReg() == AMDGPU::NoRegister) {
4634 const DebugLoc &DL = MI.getDebugLoc();
4635
4636 assert(Offset == 0);
4637
4638 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4639 .add(*SrcVec)
4640 .add(*Val)
4641 .addImm(SubReg);
4642
4643 MI.eraseFromParent();
4644 return &MBB;
4645 }
4646
4647 // Check for a SGPR index.
4648 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4650 const DebugLoc &DL = MI.getDebugLoc();
4651
4652 if (UseGPRIdxMode) {
4654
4655 const MCInstrDesc &GPRIDXDesc =
4656 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4657 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4658 .addReg(SrcVec->getReg())
4659 .add(*Val)
4660 .addReg(Idx)
4661 .addImm(SubReg);
4662 } else {
4664
4665 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4666 TRI.getRegSizeInBits(*VecRC), 32, false);
4667 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4668 .addReg(SrcVec->getReg())
4669 .add(*Val)
4670 .addImm(SubReg);
4671 }
4672 MI.eraseFromParent();
4673 return &MBB;
4674 }
4675
4676 // Control flow needs to be inserted if indexing with a VGPR.
4677 if (Val->isReg())
4678 MRI.clearKillFlags(Val->getReg());
4679
4680 const DebugLoc &DL = MI.getDebugLoc();
4681
4682 Register PhiReg = MRI.createVirtualRegister(VecRC);
4683
4684 Register SGPRIdxReg;
4685 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4686 UseGPRIdxMode, SGPRIdxReg);
4687 MachineBasicBlock *LoopBB = InsPt->getParent();
4688
4689 if (UseGPRIdxMode) {
4690 const MCInstrDesc &GPRIDXDesc =
4691 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4692
4693 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4694 .addReg(PhiReg)
4695 .add(*Val)
4696 .addReg(SGPRIdxReg)
4697 .addImm(AMDGPU::sub0);
4698 } else {
4699 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4700 TRI.getRegSizeInBits(*VecRC), 32, false);
4701 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4702 .addReg(PhiReg)
4703 .add(*Val)
4704 .addImm(AMDGPU::sub0);
4705 }
4706
4707 MI.eraseFromParent();
4708 return LoopBB;
4709}
4710
4713 const GCNSubtarget &ST,
4714 unsigned Opc) {
4716 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4717 const DebugLoc &DL = MI.getDebugLoc();
4718 const SIInstrInfo *TII = ST.getInstrInfo();
4719
4720 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4721 Register SrcReg = MI.getOperand(1).getReg();
4722 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4723 Register DstReg = MI.getOperand(0).getReg();
4724 MachineBasicBlock *RetBB = nullptr;
4725 if (isSGPR) {
 4726 // These operations are idempotent on a uniform value (i.e. an SGPR);
 4727 // the reduced value will be the same as the given SGPR.
4728 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4729 RetBB = &BB;
4730 } else {
 4731 // TODO: Implement the DPP strategy and switch based on the immediate
 4732 // strategy operand. For now, for all the cases (default, Iterative and
 4733 // DPP) we use the iterative approach by default.
4734
 4735 // To reduce the VGPR with the iterative approach, we need to iterate
 4736 // over all the active lanes. The lowering consists of a ComputeLoop,
 4737 // which iterates over only the active lanes. We use a copy of the EXEC
 4738 // register as the induction variable, and every active lane clears its
 4739 // bit with bitset0 so that the next iteration picks up the next active lane.
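//
// Per-iteration outline (illustrative): s_ff1 finds the lowest set bit of the
// remaining-lanes mask, v_readlane reads that lane's value from the source
// VGPR, the scalar min/max (Opc) folds it into the accumulator, and s_bitset0
// clears the bit; s_cmp_lg plus s_cbranch_scc1 loop while any bits remain.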
4741 Register SrcReg = MI.getOperand(1).getReg();
4742
 4743 // Create control flow for the loop:
 4744 // split MI's machine basic block into a compute loop and a remainder block.
4745 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4746
4747 // Create virtual registers required for lowering.
4748 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4749 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4750 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4751 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4752
4753 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4754 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4755 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4756
4757 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4758 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4759
4760 bool IsWave32 = ST.isWave32();
4761 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4762 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4763
 4764 // Create the initial values of the induction variable (a copy of EXEC) and
 4765 // the accumulator, and insert a branch to the newly created ComputeLoop block.
4766 uint32_t InitalValue =
4767 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4768 auto TmpSReg =
4769 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4770 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4771 .addImm(InitalValue);
4772 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4773
4774 // Start constructing ComputeLoop
4775 I = ComputeLoop->end();
4776 auto Accumulator =
4777 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4778 .addReg(InitalValReg)
4779 .addMBB(&BB);
4780 auto ActiveBits =
4781 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4782 .addReg(TmpSReg->getOperand(0).getReg())
4783 .addMBB(&BB);
4784
4785 // Perform the computations
4786 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4787 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4788 .addReg(ActiveBits->getOperand(0).getReg());
4789 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4790 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4791 .addReg(SrcReg)
4792 .addReg(FF1->getOperand(0).getReg());
4793 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4794 .addReg(Accumulator->getOperand(0).getReg())
4795 .addReg(LaneValue->getOperand(0).getReg());
4796
4797 // Manipulate the iterator to get the next active lane
4798 unsigned BITSETOpc =
4799 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4800 auto NewActiveBits =
4801 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4802 .addReg(FF1->getOperand(0).getReg())
4803 .addReg(ActiveBits->getOperand(0).getReg());
4804
 4805 // Add the loop-carried values to the PHI nodes
4806 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4807 .addMBB(ComputeLoop);
4808 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4809 .addMBB(ComputeLoop);
4810
 4811 // Create the conditional branch back to ComputeLoop
4812 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4813 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4814 .addReg(NewActiveBits->getOperand(0).getReg())
4815 .addImm(0);
4816 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4817 .addMBB(ComputeLoop);
4818
4819 RetBB = ComputeEnd;
4820 }
4821 MI.eraseFromParent();
4822 return RetBB;
4823}
4824
4826 MachineInstr &MI, MachineBasicBlock *BB) const {
4827
4829 MachineFunction *MF = BB->getParent();
4831
4832 switch (MI.getOpcode()) {
4833 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4834 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4835 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4836 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4837 case AMDGPU::S_UADDO_PSEUDO:
4838 case AMDGPU::S_USUBO_PSEUDO: {
4839 const DebugLoc &DL = MI.getDebugLoc();
4840 MachineOperand &Dest0 = MI.getOperand(0);
4841 MachineOperand &Dest1 = MI.getOperand(1);
4842 MachineOperand &Src0 = MI.getOperand(2);
4843 MachineOperand &Src1 = MI.getOperand(3);
4844
4845 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4846 ? AMDGPU::S_ADD_I32
4847 : AMDGPU::S_SUB_I32;
4848 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4849
4850 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4851 .addImm(1)
4852 .addImm(0);
4853
4854 MI.eraseFromParent();
4855 return BB;
4856 }
4857 case AMDGPU::S_ADD_U64_PSEUDO:
4858 case AMDGPU::S_SUB_U64_PSEUDO: {
4859 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4860 // For GFX12, we emit s_add_u64 and s_sub_u64.
4861 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4863 const DebugLoc &DL = MI.getDebugLoc();
4864 MachineOperand &Dest = MI.getOperand(0);
4865 MachineOperand &Src0 = MI.getOperand(1);
4866 MachineOperand &Src1 = MI.getOperand(2);
4867 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4868 if (Subtarget->hasScalarAddSub64()) {
4869 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4870 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4871 .add(Src0)
4872 .add(Src1);
4873 } else {
4874 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4875 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4876
4877 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4878 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4879
4880 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4881 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4882 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4883 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4884
4885 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4886 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4887 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4888 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4889
4890 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4891 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4892 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4893 .add(Src0Sub0)
4894 .add(Src1Sub0);
4895 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4896 .add(Src0Sub1)
4897 .add(Src1Sub1);
4898 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4899 .addReg(DestSub0)
4900 .addImm(AMDGPU::sub0)
4901 .addReg(DestSub1)
4902 .addImm(AMDGPU::sub1);
4903 }
4904 MI.eraseFromParent();
4905 return BB;
4906 }
4907 case AMDGPU::V_ADD_U64_PSEUDO:
4908 case AMDGPU::V_SUB_U64_PSEUDO: {
4910 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4911 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4912 const DebugLoc &DL = MI.getDebugLoc();
4913
4914 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4915
4916 MachineOperand &Dest = MI.getOperand(0);
4917 MachineOperand &Src0 = MI.getOperand(1);
4918 MachineOperand &Src1 = MI.getOperand(2);
4919
4920 if (IsAdd && ST.hasLshlAddB64()) {
4921 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4922 Dest.getReg())
4923 .add(Src0)
4924 .addImm(0)
4925 .add(Src1);
4926 TII->legalizeOperands(*Add);
4927 MI.eraseFromParent();
4928 return BB;
4929 }
4930
4931 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4932
4933 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4934 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4935
4936 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4937 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4938
4939 const TargetRegisterClass *Src0RC = Src0.isReg()
4940 ? MRI.getRegClass(Src0.getReg())
4941 : &AMDGPU::VReg_64RegClass;
4942 const TargetRegisterClass *Src1RC = Src1.isReg()
4943 ? MRI.getRegClass(Src1.getReg())
4944 : &AMDGPU::VReg_64RegClass;
4945
4946 const TargetRegisterClass *Src0SubRC =
4947 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4948 const TargetRegisterClass *Src1SubRC =
4949 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4950
4951 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4952 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4953 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4954 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4955
4956 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4957 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4958 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4959 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4960
4961 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4962 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4963 .addReg(CarryReg, RegState::Define)
4964 .add(SrcReg0Sub0)
4965 .add(SrcReg1Sub0)
4966 .addImm(0); // clamp bit
4967
4968 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4969 MachineInstr *HiHalf =
4970 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4971 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4972 .add(SrcReg0Sub1)
4973 .add(SrcReg1Sub1)
4974 .addReg(CarryReg, RegState::Kill)
4975 .addImm(0); // clamp bit
4976
4977 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4978 .addReg(DestSub0)
4979 .addImm(AMDGPU::sub0)
4980 .addReg(DestSub1)
4981 .addImm(AMDGPU::sub1);
4982 TII->legalizeOperands(*LoHalf);
4983 TII->legalizeOperands(*HiHalf);
4984 MI.eraseFromParent();
4985 return BB;
4986 }
4987 case AMDGPU::S_ADD_CO_PSEUDO:
4988 case AMDGPU::S_SUB_CO_PSEUDO: {
 4989 // This pseudo can only be selected
 4990 // from a uniform add/subcarry node, so all the VGPR operands
 4991 // are assumed to be splat vectors.
4993 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4994 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4996 const DebugLoc &DL = MI.getDebugLoc();
4997 MachineOperand &Dest = MI.getOperand(0);
4998 MachineOperand &CarryDest = MI.getOperand(1);
4999 MachineOperand &Src0 = MI.getOperand(2);
5000 MachineOperand &Src1 = MI.getOperand(3);
5001 MachineOperand &Src2 = MI.getOperand(4);
5002 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5003 ? AMDGPU::S_ADDC_U32
5004 : AMDGPU::S_SUBB_U32;
5005 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5006 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5007 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5008 .addReg(Src0.getReg());
5009 Src0.setReg(RegOp0);
5010 }
5011 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5012 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5013 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5014 .addReg(Src1.getReg());
5015 Src1.setReg(RegOp1);
5016 }
5017 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5018 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5019 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5020 .addReg(Src2.getReg());
5021 Src2.setReg(RegOp2);
5022 }
5023
5024 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5025 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5026 assert(WaveSize == 64 || WaveSize == 32);
5027
5028 if (WaveSize == 64) {
5029 if (ST.hasScalarCompareEq64()) {
5030 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5031 .addReg(Src2.getReg())
5032 .addImm(0);
5033 } else {
5034 const TargetRegisterClass *SubRC =
5035 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5036 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5037 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5038 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5039 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5040 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5041
5042 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5043 .add(Src2Sub0)
5044 .add(Src2Sub1);
5045
5046 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5047 .addReg(Src2_32, RegState::Kill)
5048 .addImm(0);
5049 }
5050 } else {
5051 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5052 .addReg(Src2.getReg())
5053 .addImm(0);
5054 }
5055
5056 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5057
5058 unsigned SelOpc =
5059 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5060
5061 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5062 .addImm(-1)
5063 .addImm(0);
5064
5065 MI.eraseFromParent();
5066 return BB;
5067 }
5068 case AMDGPU::SI_INIT_M0: {
5069 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5070 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5071 .add(MI.getOperand(0));
5072 MI.eraseFromParent();
5073 return BB;
5074 }
5075 case AMDGPU::GET_GROUPSTATICSIZE: {
5076 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5077 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5078 DebugLoc DL = MI.getDebugLoc();
5079 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5080 .add(MI.getOperand(0))
5081 .addImm(MFI->getLDSSize());
5082 MI.eraseFromParent();
5083 return BB;
5084 }
5085 case AMDGPU::GET_SHADERCYCLESHILO: {
5088 const DebugLoc &DL = MI.getDebugLoc();
5089 // The algorithm is:
5090 //
5091 // hi1 = getreg(SHADER_CYCLES_HI)
5092 // lo1 = getreg(SHADER_CYCLES_LO)
5093 // hi2 = getreg(SHADER_CYCLES_HI)
5094 //
5095 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5096 // Otherwise there was overflow and the result is hi2:0. In both cases the
5097 // result should represent the actual time at some point during the sequence
5098 // of three getregs.
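//
// Illustrative case: if the low counter wraps between the two reads of
// SHADER_CYCLES_HI, hi1 != hi2 and the s_cselect below forces the low half to
// 0, so the reported value hi2:0 still falls within the window spanned by the
// three getregs.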
5099 using namespace AMDGPU::Hwreg;
5100 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5101 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5102 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5103 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5104 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5105 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5106 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5107 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5108 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5109 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5110 .addReg(RegHi1)
5111 .addReg(RegHi2);
5112 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5113 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5114 .addReg(RegLo1)
5115 .addImm(0);
5116 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5117 .add(MI.getOperand(0))
5118 .addReg(RegLo)
5119 .addImm(AMDGPU::sub0)
5120 .addReg(RegHi2)
5121 .addImm(AMDGPU::sub1);
5122 MI.eraseFromParent();
5123 return BB;
5124 }
5125 case AMDGPU::SI_INDIRECT_SRC_V1:
5126 case AMDGPU::SI_INDIRECT_SRC_V2:
5127 case AMDGPU::SI_INDIRECT_SRC_V4:
5128 case AMDGPU::SI_INDIRECT_SRC_V8:
5129 case AMDGPU::SI_INDIRECT_SRC_V9:
5130 case AMDGPU::SI_INDIRECT_SRC_V10:
5131 case AMDGPU::SI_INDIRECT_SRC_V11:
5132 case AMDGPU::SI_INDIRECT_SRC_V12:
5133 case AMDGPU::SI_INDIRECT_SRC_V16:
5134 case AMDGPU::SI_INDIRECT_SRC_V32:
5135 return emitIndirectSrc(MI, *BB, *getSubtarget());
5136 case AMDGPU::SI_INDIRECT_DST_V1:
5137 case AMDGPU::SI_INDIRECT_DST_V2:
5138 case AMDGPU::SI_INDIRECT_DST_V4:
5139 case AMDGPU::SI_INDIRECT_DST_V8:
5140 case AMDGPU::SI_INDIRECT_DST_V9:
5141 case AMDGPU::SI_INDIRECT_DST_V10:
5142 case AMDGPU::SI_INDIRECT_DST_V11:
5143 case AMDGPU::SI_INDIRECT_DST_V12:
5144 case AMDGPU::SI_INDIRECT_DST_V16:
5145 case AMDGPU::SI_INDIRECT_DST_V32:
5146 return emitIndirectDst(MI, *BB, *getSubtarget());
5147 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5148 case AMDGPU::SI_KILL_I1_PSEUDO:
5149 return splitKillBlock(MI, BB);
5150 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5152 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5153 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5154
5155 Register Dst = MI.getOperand(0).getReg();
5156 const MachineOperand &Src0 = MI.getOperand(1);
5157 const MachineOperand &Src1 = MI.getOperand(2);
5158 const DebugLoc &DL = MI.getDebugLoc();
5159 Register SrcCond = MI.getOperand(3).getReg();
5160
5161 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5162 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5163 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5164 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5165
5166 const TargetRegisterClass *Src0RC = Src0.isReg()
5167 ? MRI.getRegClass(Src0.getReg())
5168 : &AMDGPU::VReg_64RegClass;
5169 const TargetRegisterClass *Src1RC = Src1.isReg()
5170 ? MRI.getRegClass(Src1.getReg())
5171 : &AMDGPU::VReg_64RegClass;
5172
5173 const TargetRegisterClass *Src0SubRC =
5174 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5175 const TargetRegisterClass *Src1SubRC =
5176 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5177
5178 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5179 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5180 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5181 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5182
5183 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5184 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5185 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5186 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5187
5188 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5189 .addReg(SrcCond);
5190 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5191 .addImm(0)
5192 .add(Src0Sub0)
5193 .addImm(0)
5194 .add(Src1Sub0)
5195 .addReg(SrcCondCopy);
5196 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5197 .addImm(0)
5198 .add(Src0Sub1)
5199 .addImm(0)
5200 .add(Src1Sub1)
5201 .addReg(SrcCondCopy);
5202
5203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5204 .addReg(DstLo)
5205 .addImm(AMDGPU::sub0)
5206 .addReg(DstHi)
5207 .addImm(AMDGPU::sub1);
5208 MI.eraseFromParent();
5209 return BB;
5210 }
5211 case AMDGPU::SI_BR_UNDEF: {
5213 const DebugLoc &DL = MI.getDebugLoc();
5214 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5215 .add(MI.getOperand(0));
5216 Br->getOperand(1).setIsUndef(); // read undef SCC
5217 MI.eraseFromParent();
5218 return BB;
5219 }
5220 case AMDGPU::ADJCALLSTACKUP:
5221 case AMDGPU::ADJCALLSTACKDOWN: {
5223 MachineInstrBuilder MIB(*MF, &MI);
5224 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5225 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5226 return BB;
5227 }
5228 case AMDGPU::SI_CALL_ISEL: {
5230 const DebugLoc &DL = MI.getDebugLoc();
5231
5232 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5233
5235 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5236
5237 for (const MachineOperand &MO : MI.operands())
5238 MIB.add(MO);
5239
5240 MIB.cloneMemRefs(MI);
5241 MI.eraseFromParent();
5242 return BB;
5243 }
5244 case AMDGPU::V_ADD_CO_U32_e32:
5245 case AMDGPU::V_SUB_CO_U32_e32:
5246 case AMDGPU::V_SUBREV_CO_U32_e32: {
5247 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5248 const DebugLoc &DL = MI.getDebugLoc();
5249 unsigned Opc = MI.getOpcode();
5250
5251 bool NeedClampOperand = false;
5252 if (TII->pseudoToMCOpcode(Opc) == -1) {
5253 Opc = AMDGPU::getVOPe64(Opc);
5254 NeedClampOperand = true;
5255 }
5256
5257 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5258 if (TII->isVOP3(*I)) {
5259 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5260 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5261 I.addReg(TRI->getVCC(), RegState::Define);
5262 }
5263 I.add(MI.getOperand(1))
5264 .add(MI.getOperand(2));
5265 if (NeedClampOperand)
5266 I.addImm(0); // clamp bit for e64 encoding
5267
5268 TII->legalizeOperands(*I);
5269
5270 MI.eraseFromParent();
5271 return BB;
5272 }
5273 case AMDGPU::V_ADDC_U32_e32:
5274 case AMDGPU::V_SUBB_U32_e32:
5275 case AMDGPU::V_SUBBREV_U32_e32:
5276 // These instructions have an implicit use of vcc which counts towards the
5277 // constant bus limit.
5278 TII->legalizeOperands(MI);
5279 return BB;
5280 case AMDGPU::DS_GWS_INIT:
5281 case AMDGPU::DS_GWS_SEMA_BR:
5282 case AMDGPU::DS_GWS_BARRIER:
5283 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5284 [[fallthrough]];
5285 case AMDGPU::DS_GWS_SEMA_V:
5286 case AMDGPU::DS_GWS_SEMA_P:
5287 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5288 // An s_waitcnt 0 is required to be the instruction immediately following.
5289 if (getSubtarget()->hasGWSAutoReplay()) {
5290 bundleInstWithWaitcnt(MI);
5291 return BB;
5292 }
5293
5294 return emitGWSMemViolTestLoop(MI, BB);
5295 case AMDGPU::S_SETREG_B32: {
5296 // Try to optimize cases that only set the denormal mode or rounding mode.
5297 //
5298 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5299 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5300 // instead.
5301 //
5302 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5303 // allow a no-side-effect instruction in the output of a
5304 // side-effecting pattern.
5305 auto [ID, Offset, Width] =
5306 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5307 if (ID != AMDGPU::Hwreg::ID_MODE)
5308 return BB;
5309
5310 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5311 const unsigned SetMask = WidthMask << Offset;
5312
5313 if (getSubtarget()->hasDenormModeInst()) {
5314 unsigned SetDenormOp = 0;
5315 unsigned SetRoundOp = 0;
5316
5317 // The dedicated instructions can only set the whole denorm or round mode
5318 // at once, not a subset of bits in either.
5319 if (SetMask ==
5320 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5321 // If this fully sets both the round and denorm mode, emit the two
5322 // dedicated instructions for these.
5323 SetRoundOp = AMDGPU::S_ROUND_MODE;
5324 SetDenormOp = AMDGPU::S_DENORM_MODE;
5325 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5326 SetRoundOp = AMDGPU::S_ROUND_MODE;
5327 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5328 SetDenormOp = AMDGPU::S_DENORM_MODE;
5329 }
5330
5331 if (SetRoundOp || SetDenormOp) {
5332 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5333 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5334 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5335 unsigned ImmVal = Def->getOperand(1).getImm();
5336 if (SetRoundOp) {
5337 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5338 .addImm(ImmVal & 0xf);
5339
5340 // If we also have the denorm mode, get just the denorm mode bits.
5341 ImmVal >>= 4;
5342 }
5343
5344 if (SetDenormOp) {
5345 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5346 .addImm(ImmVal & 0xf);
5347 }
5348
5349 MI.eraseFromParent();
5350 return BB;
5351 }
5352 }
5353 }
5354
5355 // If only FP bits are touched, use the no-side-effects pseudo.
5356 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5357 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5358 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5359
5360 return BB;
5361 }
5362 case AMDGPU::S_INVERSE_BALLOT_U32:
5363 case AMDGPU::S_INVERSE_BALLOT_U64: {
5364 MachineRegisterInfo &MRI = MF->getRegInfo();
5365 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5366 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5367 const DebugLoc &DL = MI.getDebugLoc();
5368 const Register DstReg = MI.getOperand(0).getReg();
5369 Register MaskReg = MI.getOperand(1).getReg();
5370
5371 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5372
5373 if (IsVALU) {
5374 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5375 }
5376
5377 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5378 MI.eraseFromParent();
5379 return BB;
5380 }
5381 case AMDGPU::ENDPGM_TRAP: {
5382 const DebugLoc &DL = MI.getDebugLoc();
5383 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5384 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5385 MI.addOperand(MachineOperand::CreateImm(0));
5386 return BB;
5387 }
5388
5389 // We need a block split to make the real endpgm a terminator. We also don't
5390 // want to break phis in successor blocks, so we can't just delete to the
5391 // end of the block.
5392
5393 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5394 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5395 MF->push_back(TrapBB);
5396 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5397 .addImm(0);
5398 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5399 .addMBB(TrapBB);
5400
5401 BB->addSuccessor(TrapBB);
5402 MI.eraseFromParent();
5403 return SplitBB;
5404 }
5405 default:
5406 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5407 if (!MI.mayStore())
5408 AddMemOpInit(MI);
5409 return BB;
5410 }
5411 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5412 }
5413}
5414
5415 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5416 // This currently forces unfolding various combinations of fsub into fma with
5417 // free fneg'd operands. As long as we have fast FMA (controlled by
5418 // isFMAFasterThanFMulAndFAdd), we should perform these.
5419
5420 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5421 // most of these combines appear to be cycle neutral but save on instruction
5422 // count / code size.
5423 return true;
5424}
5425
5427
5428 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5429 EVT VT) const {
5430 if (!VT.isVector()) {
5431 return MVT::i1;
5432 }
5433 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5434}
5435
5436 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5437 // TODO: Should i16 be used always if legal? For now it would force VALU
5438 // shifts.
5439 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5440}
5441
5442 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5443 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5444 ? Ty.changeElementSize(16)
5445 : Ty.changeElementSize(32);
5446}
5447
5448// Answering this is somewhat tricky and depends on the specific device, since
5449// devices have different rates for fma or all f64 operations.
5450//
5451// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5452// regardless of which device (although the number of cycles differs between
5453// devices), so it is always profitable for f64.
5454//
5455// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5456// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5457// which we can always do even without fused FP ops since it returns the same
5458// result as the separate operations and since it is always full
5459// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5460// however does not support denormals, so we do report fma as faster if we have
5461// a fast fma device and require denormals.
5462//
5463 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5464 EVT VT) const {
5465 VT = VT.getScalarType();
5466
5467 switch (VT.getSimpleVT().SimpleTy) {
5468 case MVT::f32: {
5469 // If mad is not available this depends only on if f32 fma is full rate.
5470 if (!Subtarget->hasMadMacF32Insts())
5471 return Subtarget->hasFastFMAF32();
5472
5473 // Otherwise f32 mad is always full rate and returns the same result as
5474 // the separate operations so should be preferred over fma.
5475 // However, it does not support denormals.
5476 if (!denormalModeIsFlushAllF32(MF))
5477 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5478
5479 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5480 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5481 }
5482 case MVT::f64:
5483 return true;
5484 case MVT::f16:
5485 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5486 default:
5487 break;
5488 }
5489
5490 return false;
5491}
5492
5493 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5494 LLT Ty) const {
5495 switch (Ty.getScalarSizeInBits()) {
5496 case 16:
5497 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5498 case 32:
5499 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5500 case 64:
5501 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5502 default:
5503 break;
5504 }
5505
5506 return false;
5507}
5508
5509 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
5510 if (!Ty.isScalar())
5511 return false;
5512
5513 if (Ty.getScalarSizeInBits() == 16)
5514 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5515 if (Ty.getScalarSizeInBits() == 32)
5516 return Subtarget->hasMadMacF32Insts() &&
5517 denormalModeIsFlushAllF32(*MI.getMF());
5518
5519 return false;
5520}
5521
5522 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5523 const SDNode *N) const {
5524 // TODO: Check future ftz flag
5525 // v_mad_f32/v_mac_f32 do not support denormals.
5526 EVT VT = N->getValueType(0);
5527 if (VT == MVT::f32)
5528 return Subtarget->hasMadMacF32Insts() &&
5529 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5530 if (VT == MVT::f16) {
5531 return Subtarget->hasMadF16() &&
5532 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5533 }
5534
5535 return false;
5536}
5537
5538//===----------------------------------------------------------------------===//
5539// Custom DAG Lowering Operations
5540//===----------------------------------------------------------------------===//
5541
5542// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5543// wider vector type is legal.
5544 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5545 SelectionDAG &DAG) const {
5546 unsigned Opc = Op.getOpcode();
5547 EVT VT = Op.getValueType();
5548 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5549 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5550 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5551 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5552
5553 SDValue Lo, Hi;
5554 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5555
5556 SDLoc SL(Op);
5557 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5558 Op->getFlags());
5559 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5560 Op->getFlags());
5561
5562 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5563}
5564
5565// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5566// wider vector type is legal.
5567 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5568 SelectionDAG &DAG) const {
5569 unsigned Opc = Op.getOpcode();
5570 EVT VT = Op.getValueType();
5571 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5572 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5573 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5574 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5575
5576 SDValue Lo0, Hi0;
5577 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5578 SDValue Lo1, Hi1;
5579 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5580
5581 SDLoc SL(Op);
5582
5583 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5584 Op->getFlags());
5585 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5586 Op->getFlags());
5587
5588 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5589}
5590
5591 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5592 SelectionDAG &DAG) const {
5593 unsigned Opc = Op.getOpcode();
5594 EVT VT = Op.getValueType();
5595 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5596 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5597 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5598 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5599 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5600 VT == MVT::v32bf16);
5601
5602 SDValue Lo0, Hi0;
5603 SDValue Op0 = Op.getOperand(0);
5604 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5605 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5606 : std::pair(Op0, Op0);
5607 SDValue Lo1, Hi1;
5608 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5609 SDValue Lo2, Hi2;
5610 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5611
5612 SDLoc SL(Op);
5613 auto ResVT = DAG.GetSplitDestVTs(VT);
5614
5615 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5616 Op->getFlags());
5617 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5618 Op->getFlags());
5619
5620 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5621}
5622
5623
5624 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5625 switch (Op.getOpcode()) {
5626 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5627 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5628 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5629 case ISD::LOAD: {
5630 SDValue Result = LowerLOAD(Op, DAG);
5631 assert((!Result.getNode() ||
5632 Result.getNode()->getNumValues() == 2) &&
5633 "Load should return a value and a chain");
5634 return Result;
5635 }
5636 case ISD::FSQRT: {
5637 EVT VT = Op.getValueType();
5638 if (VT == MVT::f32)
5639 return lowerFSQRTF32(Op, DAG);
5640 if (VT == MVT::f64)
5641 return lowerFSQRTF64(Op, DAG);
5642 return SDValue();
5643 }
5644 case ISD::FSIN:
5645 case ISD::FCOS:
5646 return LowerTrig(Op, DAG);
5647 case ISD::SELECT: return LowerSELECT(Op, DAG);
5648 case ISD::FDIV: return LowerFDIV(Op, DAG);
5649 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5650 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5651 case ISD::STORE: return LowerSTORE(Op, DAG);
5652 case ISD::GlobalAddress: {
5653 MachineFunction &MF = DAG.getMachineFunction();
5654 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5655 return LowerGlobalAddress(MFI, Op, DAG);
5656 }
5657 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5658 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5659 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5660 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5661 case ISD::INSERT_SUBVECTOR:
5662 return lowerINSERT_SUBVECTOR(Op, DAG);
5663 case ISD::INSERT_VECTOR_ELT:
5664 return lowerINSERT_VECTOR_ELT(Op, DAG);
5665 case ISD::EXTRACT_VECTOR_ELT:
5666 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5667 case ISD::VECTOR_SHUFFLE:
5668 return lowerVECTOR_SHUFFLE(Op, DAG);
5669 case ISD::SCALAR_TO_VECTOR:
5670 return lowerSCALAR_TO_VECTOR(Op, DAG);
5671 case ISD::BUILD_VECTOR:
5672 return lowerBUILD_VECTOR(Op, DAG);
5673 case ISD::FP_ROUND:
5674 case ISD::STRICT_FP_ROUND:
5675 return lowerFP_ROUND(Op, DAG);
5676 case ISD::FPTRUNC_ROUND: {
5677 unsigned Opc;
5678 SDLoc DL(Op);
5679
5680 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5681 return SDValue();
5682
5683 // Get the rounding mode from the last operand
5684 int RoundMode = Op.getConstantOperandVal(1);
5685 if (RoundMode == (int)RoundingMode::TowardPositive)
5686 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5687 else if (RoundMode == (int)RoundingMode::TowardNegative)
5688 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5689 else
5690 return SDValue();
5691
5692 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5693 }
5694 case ISD::TRAP:
5695 return lowerTRAP(Op, DAG);
5696 case ISD::DEBUGTRAP:
5697 return lowerDEBUGTRAP(Op, DAG);
5698 case ISD::FABS:
5699 case ISD::FNEG:
5700 case ISD::FCANONICALIZE:
5701 case ISD::BSWAP:
5702 return splitUnaryVectorOp(Op, DAG);
5703 case ISD::FMINNUM:
5704 case ISD::FMAXNUM:
5705 return lowerFMINNUM_FMAXNUM(Op, DAG);
5706 case ISD::FLDEXP:
5707 case ISD::STRICT_FLDEXP:
5708 return lowerFLDEXP(Op, DAG);
5709 case ISD::FMA:
5710 return splitTernaryVectorOp(Op, DAG);
5711 case ISD::FP_TO_SINT:
5712 case ISD::FP_TO_UINT:
5713 return LowerFP_TO_INT(Op, DAG);
5714 case ISD::SHL:
5715 case ISD::SRA:
5716 case ISD::SRL:
5717 case ISD::ADD:
5718 case ISD::SUB:
5719 case ISD::SMIN:
5720 case ISD::SMAX:
5721 case ISD::UMIN:
5722 case ISD::UMAX:
5723 case ISD::FADD:
5724 case ISD::FMUL:
5725 case ISD::FMINNUM_IEEE:
5726 case ISD::FMAXNUM_IEEE:
5727 case ISD::UADDSAT:
5728 case ISD::USUBSAT:
5729 case ISD::SADDSAT:
5730 case ISD::SSUBSAT:
5731 return splitBinaryVectorOp(Op, DAG);
5732 case ISD::MUL:
5733 return lowerMUL(Op, DAG);
5734 case ISD::SMULO:
5735 case ISD::UMULO:
5736 return lowerXMULO(Op, DAG);
5737 case ISD::SMUL_LOHI:
5738 case ISD::UMUL_LOHI:
5739 return lowerXMUL_LOHI(Op, DAG);
5740 case ISD::DYNAMIC_STACKALLOC:
5741 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5742 case ISD::STACKSAVE:
5743 return LowerSTACKSAVE(Op, DAG);
5744 case ISD::GET_ROUNDING:
5745 return lowerGET_ROUNDING(Op, DAG);
5746 case ISD::PREFETCH:
5747 return lowerPREFETCH(Op, DAG);
5748 case ISD::FP_EXTEND:
5749 case ISD::STRICT_FP_EXTEND:
5750 return lowerFP_EXTEND(Op, DAG);
5751 case ISD::GET_FPENV:
5752 return lowerGET_FPENV(Op, DAG);
5753 case ISD::SET_FPENV:
5754 return lowerSET_FPENV(Op, DAG);
5755 }
5756 return SDValue();
5757}
5758
5759// Used for D16: Casts the result of an instruction into the right vector,
5760// packs values if loads return unpacked values.
5761 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5762 const SDLoc &DL,
5763 SelectionDAG &DAG, bool Unpacked) {
5764 if (!LoadVT.isVector())
5765 return Result;
5766
5767 // Cast back to the original packed type or to a larger type that is a
5768 // multiple of 32 bits for D16. Widening the return type is required for
5769 // legalization.
5770 EVT FittingLoadVT = LoadVT;
5771 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5772 FittingLoadVT =
5773 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5774 LoadVT.getVectorNumElements() + 1);
5775 }
5776
5777 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5778 // Truncate to v2i16/v4i16.
5779 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5780
5781 // Workaround legalizer not scalarizing truncate after vector op
5782 // legalization but not creating intermediate vector trunc.
5783 SmallVector<SDValue, 4> Elts;
5784 DAG.ExtractVectorElements(Result, Elts);
5785 for (SDValue &Elt : Elts)
5786 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5787
5788 // Pad illegal v1i16/v3f16 to v4i16
5789 if ((LoadVT.getVectorNumElements() % 2) == 1)
5790 Elts.push_back(DAG.getUNDEF(MVT::i16));
5791
5792 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5793
5794 // Bitcast to original type (v2f16/v4f16).
5795 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5796 }
5797
5798 // Cast back to the original packed type.
5799 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5800}
5801
5802SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5803 MemSDNode *M,
5804 SelectionDAG &DAG,
5805 ArrayRef<SDValue> Ops,
5806 bool IsIntrinsic) const {
5807 SDLoc DL(M);
5808
5809 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5810 EVT LoadVT = M->getValueType(0);
5811
5812 EVT EquivLoadVT = LoadVT;
5813 if (LoadVT.isVector()) {
5814 if (Unpacked) {
5815 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5816 LoadVT.getVectorNumElements());
5817 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5818 // Widen v3f16 to legal type
5819 EquivLoadVT =
5820 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5821 LoadVT.getVectorNumElements() + 1);
5822 }
5823 }
5824
5825 // Change from v4f16/v2f16 to EquivLoadVT.
5826 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5827
5828 SDValue Load
5829 = DAG.getMemIntrinsicNode(
5830 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5831 VTList, Ops, M->getMemoryVT(),
5832 M->getMemOperand());
5833
5834 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5835
5836 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5837}
5838
5839SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5840 SelectionDAG &DAG,
5841 ArrayRef<SDValue> Ops) const {
5842 SDLoc DL(M);
5843 EVT LoadVT = M->getValueType(0);
5844 EVT EltType = LoadVT.getScalarType();
5845 EVT IntVT = LoadVT.changeTypeToInteger();
5846
5847 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5848
5849 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5850 bool IsTFE = M->getNumValues() == 3;
5851
5852 unsigned Opc;
5853 if (IsFormat) {
5854 Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5855 : AMDGPUISD::BUFFER_LOAD_FORMAT;
5856 } else {
5857 // TODO: Support non-format TFE loads.
5858 if (IsTFE)
5859 return SDValue();
5860 Opc = AMDGPUISD::BUFFER_LOAD;
5861 }
5862
5863 if (IsD16) {
5864 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5865 }
5866
5867 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5868 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5869 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
5870
5871 if (isTypeLegal(LoadVT)) {
5872 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5873 M->getMemOperand(), DAG);
5874 }
5875
5876 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5877 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5878 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5879 M->getMemOperand(), DAG);
5880 return DAG.getMergeValues(
5881 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5882 DL);
5883}
5884
5885 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
5886 SDNode *N, SelectionDAG &DAG) {
5887 EVT VT = N->getValueType(0);
5888 unsigned CondCode = N->getConstantOperandVal(3);
5889 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5890 return DAG.getUNDEF(VT);
5891
5892 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5893
5894 SDValue LHS = N->getOperand(1);
5895 SDValue RHS = N->getOperand(2);
5896
5897 SDLoc DL(N);
5898
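// i16 compares are promoted to i32 when i16 is not legal, sign- or
// zero-extending based on the predicate.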
5899 EVT CmpVT = LHS.getValueType();
5900 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
5901 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
5902 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5903 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
5904 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
5905 }
5906
5907 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
5908
5909 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5910 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5911
5912 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
5913 DAG.getCondCode(CCOpcode));
5914 if (VT.bitsEq(CCVT))
5915 return SetCC;
5916 return DAG.getZExtOrTrunc(SetCC, DL, VT);
5917}
5918
5919 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
5920 SDNode *N, SelectionDAG &DAG) {
5921 EVT VT = N->getValueType(0);
5922
5923 unsigned CondCode = N->getConstantOperandVal(3);
5924 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
5925 return DAG.getUNDEF(VT);
5926
5927 SDValue Src0 = N->getOperand(1);
5928 SDValue Src1 = N->getOperand(2);
5929 EVT CmpVT = Src0.getValueType();
5930 SDLoc SL(N);
5931
5932 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
5933 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5934 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5935 }
5936
5937 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
5938 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
5939 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5940 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5941 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
5942 Src1, DAG.getCondCode(CCOpcode));
5943 if (VT.bitsEq(CCVT))
5944 return SetCC;
5945 return DAG.getZExtOrTrunc(SetCC, SL, VT);
5946}
5947
5948 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
5949 SelectionDAG &DAG) {
5950 EVT VT = N->getValueType(0);
5951 SDValue Src = N->getOperand(1);
5952 SDLoc SL(N);
5953
5954 if (Src.getOpcode() == ISD::SETCC) {
5955 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
5956 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
5957 Src.getOperand(1), Src.getOperand(2));
5958 }
5959 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
5960 // (ballot 0) -> 0
5961 if (Arg->isZero())
5962 return DAG.getConstant(0, SL, VT);
5963
5964 // (ballot 1) -> EXEC/EXEC_LO
5965 if (Arg->isOne()) {
5966 Register Exec;
5967 if (VT.getScalarSizeInBits() == 32)
5968 Exec = AMDGPU::EXEC_LO;
5969 else if (VT.getScalarSizeInBits() == 64)
5970 Exec = AMDGPU::EXEC;
5971 else
5972 return SDValue();
5973
5974 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
5975 }
5976 }
5977
5978 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
5979 // ISD::SETNE)
5980 return DAG.getNode(
5981 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5982 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5983}
5984
5985 void SITargetLowering::ReplaceNodeResults(SDNode *N,
5986 SmallVectorImpl<SDValue> &Results,
5987 SelectionDAG &DAG) const {
5988 switch (N->getOpcode()) {
5989 case ISD::INSERT_VECTOR_ELT: {
5990 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
5991 Results.push_back(Res);
5992 return;
5993 }
5994 case ISD::EXTRACT_VECTOR_ELT: {
5995 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
5996 Results.push_back(Res);
5997 return;
5998 }
5999 case ISD::INTRINSIC_WO_CHAIN: {
6000 unsigned IID = N->getConstantOperandVal(0);
6001 switch (IID) {
6002 case Intrinsic::amdgcn_make_buffer_rsrc:
6003 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6004 return;
6005 case Intrinsic::amdgcn_cvt_pkrtz: {
6006 SDValue Src0 = N->getOperand(1);
6007 SDValue Src1 = N->getOperand(2);
6008 SDLoc SL(N);
6009 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6010 Src0, Src1);
6011 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6012 return;
6013 }
6014 case Intrinsic::amdgcn_cvt_pknorm_i16:
6015 case Intrinsic::amdgcn_cvt_pknorm_u16:
6016 case Intrinsic::amdgcn_cvt_pk_i16:
6017 case Intrinsic::amdgcn_cvt_pk_u16: {
6018 SDValue Src0 = N->getOperand(1);
6019 SDValue Src1 = N->getOperand(2);
6020 SDLoc SL(N);
6021 unsigned Opcode;
6022
6023 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6024 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6025 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6026 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6027 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6028 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6029 else
6030 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6031
6032 EVT VT = N->getValueType(0);
6033 if (isTypeLegal(VT))
6034 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6035 else {
6036 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6037 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6038 }
6039 return;
6040 }
6041 case Intrinsic::amdgcn_s_buffer_load: {
6042 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6043 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6044 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6045 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6046 // s_buffer_load_i8.
6047 if (!Subtarget->hasScalarSubwordLoads())
6048 return;
6049 SDValue Op = SDValue(N, 0);
6050 SDValue Rsrc = Op.getOperand(1);
6051 SDValue Offset = Op.getOperand(2);
6052 SDValue CachePolicy = Op.getOperand(3);
6053 EVT VT = Op.getValueType();
6054 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6055 SDLoc DL(Op);
6056 MachineFunction &MF = DAG.getMachineFunction();
6057 const DataLayout &DataLayout = DAG.getDataLayout();
6058 Align Alignment =
6059 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6060 MachineMemOperand *MMO = MF.getMachineMemOperand(
6061 MachinePointerInfo(),
6062 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6063 MachineMemOperand::MOInvariant,
6064 VT.getStoreSize(), Alignment);
6065 SDValue LoadVal;
6066 if (!Offset->isDivergent()) {
6067 SDValue Ops[] = {Rsrc, // source register
6068 Offset, CachePolicy};
6069 SDValue BufferLoad =
6070 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6071 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6072 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6073 } else {
6074 SDValue Ops[] = {
6075 DAG.getEntryNode(), // Chain
6076 Rsrc, // rsrc
6077 DAG.getConstant(0, DL, MVT::i32), // vindex
6078 {}, // voffset
6079 {}, // soffset
6080 {}, // offset
6081 CachePolicy, // cachepolicy
6082 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6083 };
6084 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6085 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6086 }
6087 Results.push_back(LoadVal);
6088 return;
6089 }
6090 }
6091 break;
6092 }
6093 case ISD::INTRINSIC_W_CHAIN: {
6094 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6095 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6096 // FIXME: Hacky
6097 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6098 Results.push_back(Res.getOperand(I));
6099 }
6100 } else {
6101 Results.push_back(Res);
6102 Results.push_back(Res.getValue(1));
6103 }
6104 return;
6105 }
6106
6107 break;
6108 }
6109 case ISD::SELECT: {
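// Lower the illegally-typed select by bitcasting the operands to an
// equivalent integer type (widened to i32 if needed), selecting, and
// casting the result back.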
6110 SDLoc SL(N);
6111 EVT VT = N->getValueType(0);
6112 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6113 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6114 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6115
6116 EVT SelectVT = NewVT;
6117 if (NewVT.bitsLT(MVT::i32)) {
6118 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6119 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6120 SelectVT = MVT::i32;
6121 }
6122
6123 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6124 N->getOperand(0), LHS, RHS);
6125
6126 if (NewVT != SelectVT)
6127 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6128 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6129 return;
6130 }
6131 case ISD::FNEG: {
6132 if (N->getValueType(0) != MVT::v2f16)
6133 break;
6134
6135 SDLoc SL(N);
6136 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6137
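// fneg of v2f16: flip both packed sign bits with one 32-bit XOR.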
6138 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6139 BC,
6140 DAG.getConstant(0x80008000, SL, MVT::i32));
6141 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6142 return;
6143 }
6144 case ISD::FABS: {
6145 if (N->getValueType(0) != MVT::v2f16)
6146 break;
6147
6148 SDLoc SL(N);
6149 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6150
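// fabs of v2f16: clear both packed sign bits with one 32-bit AND.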
6151 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6152 BC,
6153 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6154 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6155 return;
6156 }
6157 case ISD::FSQRT: {
6158 if (N->getValueType(0) != MVT::f16)
6159 break;
6160 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6161 break;
6162 }
6163 default:
6164 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6165 break;
6166 }
6167}
6168
6169/// Helper function for LowerBRCOND
6170static SDNode *findUser(SDValue Value, unsigned Opcode) {
6171
6172 SDNode *Parent = Value.getNode();
6173 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6174 I != E; ++I) {
6175
6176 if (I.getUse().get() != Value)
6177 continue;
6178
6179 if (I->getOpcode() == Opcode)
6180 return *I;
6181 }
6182 return nullptr;
6183}
6184
6185unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6186 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6187 switch (Intr->getConstantOperandVal(1)) {
6188 case Intrinsic::amdgcn_if:
6189 return AMDGPUISD::IF;
6190 case Intrinsic::amdgcn_else:
6191 return AMDGPUISD::ELSE;
6192 case Intrinsic::amdgcn_loop:
6193 return AMDGPUISD::LOOP;
6194 case Intrinsic::amdgcn_end_cf:
6195 llvm_unreachable("should not occur");
6196 default:
6197 return 0;
6198 }
6199 }
6200
6201 // break, if_break, else_break are all only used as inputs to loop, not
6202 // directly as branch conditions.
6203 return 0;
6204}
6205
6206 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6207 const Triple &TT = getTargetMachine().getTargetTriple();
6208 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6209 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6210 AMDGPU::shouldEmitConstantsToTextSection(TT);
6211}
6212
6213 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6214 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6215 return false;
6216
6217 // FIXME: Either avoid relying on address space here or change the default
6218 // address space for functions to avoid the explicit check.
6219 return (GV->getValueType()->isFunctionTy() ||
6220 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6221 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6222}
6223
6224 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6225 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6226}
6227
6228 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6229 if (!GV->hasExternalLinkage())
6230 return true;
6231
6232 const auto OS = getTargetMachine().getTargetTriple().getOS();
6233 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6234}
6235
6236/// This transforms the control flow intrinsics to get the branch destination as
6237/// the last parameter; it also switches the branch target with BR if the need arises.
6238SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6239 SelectionDAG &DAG) const {
6240 SDLoc DL(BRCOND);
6241
6242 SDNode *Intr = BRCOND.getOperand(1).getNode();
6243 SDValue Target = BRCOND.getOperand(2);
6244 SDNode *BR = nullptr;
6245 SDNode *SetCC = nullptr;
6246
6247 if (Intr->getOpcode() == ISD::SETCC) {
6248 // As long as we negate the condition everything is fine
6249 SetCC = Intr;
6250 Intr = SetCC->getOperand(0).getNode();
6251
6252 } else {
6253 // Get the target from BR if we don't negate the condition
6254 BR = findUser(BRCOND, ISD::BR);
6255 assert(BR && "brcond missing unconditional branch user");
6256 Target = BR->getOperand(1);
6257 }
6258
6259 unsigned CFNode = isCFIntrinsic(Intr);
6260 if (CFNode == 0) {
6261 // This is a uniform branch so we don't need to legalize.
6262 return BRCOND;
6263 }
6264
6265 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6266 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6267
6268 assert(!SetCC ||
6269 (SetCC->getConstantOperandVal(1) == 1 &&
6270 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6271 ISD::SETNE));
6272
6273 // operands of the new intrinsic call
6274 SmallVector<SDValue, 4> Ops;
6275 if (HaveChain)
6276 Ops.push_back(BRCOND.getOperand(0));
6277
6278 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6279 Ops.push_back(Target);
6280
6281 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6282
6283 // build the new intrinsic call
6284 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6285
6286 if (!HaveChain) {
6287 SDValue Ops[] = {
6288 SDValue(Result, 0),
6289 BRCOND.getOperand(0)
6290 };
6291
6292 Result = DAG.getMergeValues(Ops, DL).getNode();
6293 }
6294
6295 if (BR) {
6296 // Give the branch instruction our target
6297 SDValue Ops[] = {
6298 BR->getOperand(0),
6299 BRCOND.getOperand(2)
6300 };
6301 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6302 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6303 }
6304
6305 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6306
6307 // Copy the intrinsic results to registers
6308 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6309 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6310 if (!CopyToReg)
6311 continue;
6312
6313 Chain = DAG.getCopyToReg(
6314 Chain, DL,
6315 CopyToReg->getOperand(1),
6316 SDValue(Result, i - 1),
6317 SDValue());
6318
6319 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6320 }
6321
6322 // Remove the old intrinsic from the chain
6323 DAG.ReplaceAllUsesOfValueWith(
6324 SDValue(Intr, Intr->getNumValues() - 1),
6325 Intr->getOperand(0));
6326
6327 return Chain;
6328}
6329
6330SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6331 SelectionDAG &DAG) const {
6332 MVT VT = Op.getSimpleValueType();
6333 SDLoc DL(Op);
6334 // Checking the depth
6335 if (Op.getConstantOperandVal(0) != 0)
6336 return DAG.getConstant(0, DL, VT);
6337
6338 MachineFunction &MF = DAG.getMachineFunction();
6339 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6340 // Check for kernel and shader functions
6341 if (Info->isEntryFunction())
6342 return DAG.getConstant(0, DL, VT);
6343
6344 MachineFrameInfo &MFI = MF.getFrameInfo();
6345 // There is a call to @llvm.returnaddress in this function
6346 MFI.setReturnAddressIsTaken(true);
6347
6348 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6349 // Get the return address reg and mark it as an implicit live-in
6350 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6351
6352 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6353}
6354
6355SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6356 SDValue Op,
6357 const SDLoc &DL,
6358 EVT VT) const {
6359 return Op.getValueType().bitsLE(VT) ?
6360 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6361 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6362 DAG.getTargetConstant(0, DL, MVT::i32));
6363}
6364
6365SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6366 assert(Op.getValueType() == MVT::f16 &&
6367 "Do not know how to custom lower FP_ROUND for non-f16 type");
6368
6369 SDValue Src = Op.getOperand(0);
6370 EVT SrcVT = Src.getValueType();
6371 if (SrcVT != MVT::f64)
6372 return Op;
6373
6374 // TODO: Handle strictfp
6375 if (Op.getOpcode() != ISD::FP_ROUND)
6376 return Op;
6377
6378 SDLoc DL(Op);
6379
6380 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6381 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6382 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6383}
6384
6385SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6386 SelectionDAG &DAG) const {
6387 EVT VT = Op.getValueType();
6388 const MachineFunction &MF = DAG.getMachineFunction();
6389 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6390 bool IsIEEEMode = Info->getMode().IEEE;
6391
6392 // FIXME: Assert during selection that this is only selected for
6393 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6394 // mode functions, but this happens to be OK since it's only done in cases
6395 // where there is known no sNaN.
6396 if (IsIEEEMode)
6397 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6398
6399 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6400 VT == MVT::v16bf16)
6401 return splitBinaryVectorOp(Op, DAG);
6402 return Op;
6403}
6404
6405SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6406 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6407 EVT VT = Op.getValueType();
6408 assert(VT == MVT::f16);
6409
6410 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6411 EVT ExpVT = Exp.getValueType();
6412 if (ExpVT == MVT::i16)
6413 return Op;
6414
6415 SDLoc DL(Op);
6416
6417 // Correct the exponent type for f16 to i16.
6418 // Clamp the range of the exponent to the instruction's range.
6419
6420 // TODO: This should be a generic narrowing legalization, and can easily be
6421 // for GlobalISel.
6422
6423 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6424 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6425
6426 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6427 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6428
6429 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6430
6431 if (IsStrict) {
6432 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6433 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6434 }
6435
6436 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6437}
6438
6439// Custom lowering for vector multiplications and s_mul_u64.
6440SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6441 EVT VT = Op.getValueType();
6442
6443 // Split vector operands.
6444 if (VT.isVector())
6445 return splitBinaryVectorOp(Op, DAG);
6446
6447 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6448
6449 // There are four ways to lower s_mul_u64:
6450 //
6451 // 1. If all the operands are uniform, then we lower it as it is.
6452 //
6453 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6454 // multiplications because there is not a vector equivalent of s_mul_u64.
6455 //
6456 // 3. If the cost model decides that it is more efficient to use vector
6457 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6458 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6459 //
6460 // 4. If the cost model decides to use vector registers and both of the
6461 // operands are zero-extended/sign-extended from 32-bits, then we split the
6462 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6463 // possible to check if the operands are zero-extended or sign-extended in
6464 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6465 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6466 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6467 // If the cost model decides that we have to use vector registers, then
6468 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6469 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6470 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6471 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6472 // SIInstrInfo.cpp .
6473
6474 if (Op->isDivergent())
6475 return SDValue();
6476
6477 SDValue Op0 = Op.getOperand(0);
6478 SDValue Op1 = Op.getOperand(1);
6479 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6480 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6481 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6482 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6483 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6484 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6485 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6486 SDLoc SL(Op);
6487 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6488 return SDValue(
6489 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6490 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6491 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6492 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6493 return SDValue(
6494 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6495 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6496 return Op;
6497}
6498
6499SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6500 EVT VT = Op.getValueType();
6501 SDLoc SL(Op);
6502 SDValue LHS = Op.getOperand(0);
6503 SDValue RHS = Op.getOperand(1);
6504 bool isSigned = Op.getOpcode() == ISD::SMULO;
6505
6506 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6507 const APInt &C = RHSC->getAPIntValue();
6508 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6509 if (C.isPowerOf2()) {
6510 // smulo(x, signed_min) is same as umulo(x, signed_min).
6511 bool UseArithShift = isSigned && !C.isMinSignedValue();
6512 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6513 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6514 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6515 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6516 SL, VT, Result, ShiftAmt),
6517 LHS, ISD::SETNE);
6518 return DAG.getMergeValues({ Result, Overflow }, SL);
6519 }
6520 }
6521
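// General case: compute the low product and the high half; overflow occurred
// iff the high half differs from the sign bits of the result (signed) or
// from zero (unsigned).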
6522 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6523 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6524 SL, VT, LHS, RHS);
6525
6526 SDValue Sign = isSigned
6527 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6528 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6529 : DAG.getConstant(0, SL, VT);
6530 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6531
6532 return DAG.getMergeValues({ Result, Overflow }, SL);
6533}
6534
6535SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6536 if (Op->isDivergent()) {
6537 // Select to V_MAD_[IU]64_[IU]32.
6538 return Op;
6539 }
6540 if (Subtarget->hasSMulHi()) {
6541 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6542 return SDValue();
6543 }
6544 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6545 // calculate the high part, so we might as well do the whole thing with
6546 // V_MAD_[IU]64_[IU]32.
6547 return Op;
6548}
6549
6550SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6551 if (!Subtarget->isTrapHandlerEnabled() ||
6552 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6553 return lowerTrapEndpgm(Op, DAG);
6554
6555 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6556 lowerTrapHsaQueuePtr(Op, DAG);
6557}
6558
6559SDValue SITargetLowering::lowerTrapEndpgm(
6560 SDValue Op, SelectionDAG &DAG) const {
6561 SDLoc SL(Op);
6562 SDValue Chain = Op.getOperand(0);
6563 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6564}
6565
6566SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6567 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6568 MachineFunction &MF = DAG.getMachineFunction();
6569 uint64_t Offset = getImplicitParameterOffset(MF, Param);
6570 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6571 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6572 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6573 MachineMemOperand::MODereferenceable |
6574 MachineMemOperand::MOInvariant);
6575}
6576
6577SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6578 SDValue Op, SelectionDAG &DAG) const {
6579 SDLoc SL(Op);
6580 SDValue Chain = Op.getOperand(0);
6581
6582 SDValue QueuePtr;
6583 // For code object version 5, QueuePtr is passed through implicit kernarg.
6584 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6585 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6586 QueuePtr =
6587 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6588 } else {
6589 MachineFunction &MF = DAG.getMachineFunction();
6590 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6591 Register UserSGPR = Info->getQueuePtrUserSGPR();
6592
6593 if (UserSGPR == AMDGPU::NoRegister) {
6594 // We probably are in a function incorrectly marked with
6595 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6596 // trap, so just use a null pointer.
6597 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6598 } else {
6599 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6600 MVT::i64);
6601 }
6602 }
6603
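// Pass the queue pointer to the trap handler in SGPR0:SGPR1.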
6604 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6605 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6606 QueuePtr, SDValue());
6607
6608 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6609 SDValue Ops[] = {
6610 ToReg,
6611 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6612 SGPR01,
6613 ToReg.getValue(1)
6614 };
6615 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6616}
6617
6618SDValue SITargetLowering::lowerTrapHsa(
6619 SDValue Op, SelectionDAG &DAG) const {
6620 SDLoc SL(Op);
6621 SDValue Chain = Op.getOperand(0);
6622
6623 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6624 SDValue Ops[] = {
6625 Chain,
6626 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6627 };
6628 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6629}
6630
6631SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6632 SDLoc SL(Op);
6633 SDValue Chain = Op.getOperand(0);
6634 MachineFunction &MF = DAG.getMachineFunction();
6635
6636 if (!Subtarget->isTrapHandlerEnabled() ||
6637 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6638 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
6639 "debugtrap handler not supported",
6640 Op.getDebugLoc(),
6641 DS_Warning);
6642 LLVMContext &Ctx = MF.getFunction().getContext();
6643 Ctx.diagnose(NoTrap);
6644 return Chain;
6645 }
6646
6647 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
6648 SDValue Ops[] = {
6649 Chain,
6650 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6651 };
6652 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6653}
6654
6655SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6656 SelectionDAG &DAG) const {
6657 if (Subtarget->hasApertureRegs()) {
6658 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6659 ? AMDGPU::SRC_SHARED_BASE
6660 : AMDGPU::SRC_PRIVATE_BASE;
6661 // Note: this feature (register) is broken. When used as a 32-bit operand,
6662 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6663 // bits.
6664 //
6665 // To work around the issue, directly emit a 64 bit mov from this register
6666 // then extract the high bits. Note that this shouldn't even result in a
6667 // shift being emitted and simply become a pair of registers (e.g.):
6668 // s_mov_b64 s[6:7], src_shared_base
6669 // v_mov_b32_e32 v1, s7
6670 //
6671 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6672 // coalescing would kick in and it would think it's okay to use the "HI"
6673 // subregister directly (instead of extracting the HI 32 bits) which is an
6674 // artificial (unusable) register.
6675 // Register TableGen definitions would need an overhaul to get rid of the
6676 // artificial "HI" aperture registers and prevent this kind of issue from
6677 // happening.
6678 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6679 DAG.getRegister(ApertureRegNo, MVT::i64));
6680 return DAG.getNode(
6681 ISD::TRUNCATE, DL, MVT::i32,
6682 DAG.getNode(ISD::SRL, DL, MVT::i64,
6683 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6684 }
6685
6686 // For code object version 5, private_base and shared_base are passed through
6687 // implicit kernargs.
6688 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6689 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6690 ImplicitParameter Param =
6691 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
6692 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6693 }
6694
6695 MachineFunction &MF = DAG.getMachineFunction();
6696 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6697 Register UserSGPR = Info->getQueuePtrUserSGPR();
6698 if (UserSGPR == AMDGPU::NoRegister) {
6699 // We probably are in a function incorrectly marked with
6700 // amdgpu-no-queue-ptr. This is undefined.
6701 return DAG.getUNDEF(MVT::i32);
6702 }
6703
6704 SDValue QueuePtr = CreateLiveInRegister(
6705 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6706
6707 // Offset into amd_queue_t for group_segment_aperture_base_hi /
6708 // private_segment_aperture_base_hi.
6709 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6710
6711 SDValue Ptr =
6712 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
6713
6714 // TODO: Use custom target PseudoSourceValue.
6715 // TODO: We should use the value from the IR intrinsic call, but it might not
6716 // be available and how do we get it?
6717 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6718 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6719 commonAlignment(Align(64), StructOffset),
6720 MachineMemOperand::MODereferenceable |
6721 MachineMemOperand::MOInvariant);
6722}
6723
6724/// Return true if the value is a known valid address, such that a null check is
6725/// not necessary.
6726 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
6727 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6728 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6729 isa<BasicBlockSDNode>(Val))
6730 return true;
6731
6732 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6733 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6734
6735 // TODO: Search through arithmetic, handle arguments and loads
6736 // marked nonnull.
6737 return false;
6738}
6739
6740SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6741 SelectionDAG &DAG) const {
6742 SDLoc SL(Op);
6743
6744 const AMDGPUTargetMachine &TM =
6745 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6746
6747 unsigned DestAS, SrcAS;
6748 SDValue Src;
6749 bool IsNonNull = false;
6750 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
6751 SrcAS = ASC->getSrcAddressSpace();
6752 Src = ASC->getOperand(0);
6753 DestAS = ASC->getDestAddressSpace();
6754 } else {
6755 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6756 Op.getConstantOperandVal(0) ==
6757 Intrinsic::amdgcn_addrspacecast_nonnull);
6758 Src = Op->getOperand(1);
6759 SrcAS = Op->getConstantOperandVal(2);
6760 DestAS = Op->getConstantOperandVal(3);
6761 IsNonNull = true;
6762 }
6763
6764 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6765
6766 // flat -> local/private
6767 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6768 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6769 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6770 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6771
6772 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6773 return Ptr;
6774
6775 unsigned NullVal = TM.getNullPointerValue(DestAS);
6776 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6777 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6778
6779 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6780 SegmentNullPtr);
6781 }
6782 }
6783
6784 // local/private -> flat
6785 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
6786 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6787 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6788
6789 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
6790 SDValue CvtPtr =
6791 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6792 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6793
6794 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6795 return CvtPtr;
6796
6797 unsigned NullVal = TM.getNullPointerValue(SrcAS);
6798 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6799
6800 SDValue NonNull
6801 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6802
6803 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6804 FlatNullPtr);
6805 }
6806 }
6807
6808 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6809 Op.getValueType() == MVT::i64) {
6810 const SIMachineFunctionInfo *Info =
6811 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
6812 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6813 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6814 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6815 }
6816
6817 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6818 Src.getValueType() == MVT::i64)
6819 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6820
6821 // global <-> flat are no-ops and never emitted.
6822
6823 const MachineFunction &MF = DAG.getMachineFunction();
6824 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6825 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6826 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
6827
6828 return DAG.getUNDEF(Op->getValueType(0));
6829}
6830
6831// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
6832// the small vector and inserting them into the big vector. That is better than
6833// the default expansion of doing it via a stack slot. Even though the use of
6834// the stack slot would be optimized away afterwards, the stack slot itself
6835// remains.
6836SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6837 SelectionDAG &DAG) const {
6838 SDValue Vec = Op.getOperand(0);
6839 SDValue Ins = Op.getOperand(1);
6840 SDValue Idx = Op.getOperand(2);
6841 EVT VecVT = Vec.getValueType();
6842 EVT InsVT = Ins.getValueType();
6843 EVT EltVT = VecVT.getVectorElementType();
6844 unsigned InsNumElts = InsVT.getVectorNumElements();
6845 unsigned IdxVal = Idx->getAsZExtVal();
6846 SDLoc SL(Op);
6847
6848 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6849 // Insert 32-bit registers at a time.
6850 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6851
6852 unsigned VecNumElts = VecVT.getVectorNumElements();
6853 EVT NewVecVT =
6854 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6855 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6856 : EVT::getVectorVT(*DAG.getContext(),
6857 MVT::i32, InsNumElts / 2);
6858
6859 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6860 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6861
6862 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6863 SDValue Elt;
6864 if (InsNumElts == 2) {
6865 Elt = Ins;
6866 } else {
6867 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6868 DAG.getConstant(I, SL, MVT::i32));
6869 }
6870 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6871 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6872 }
6873
6874 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6875 }
6876
6877 for (unsigned I = 0; I != InsNumElts; ++I) {
6878 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6879 DAG.getConstant(I, SL, MVT::i32));
6880 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6881 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6882 }
6883 return Vec;
6884}
6885
6886SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6887 SelectionDAG &DAG) const {
6888 SDValue Vec = Op.getOperand(0);
6889 SDValue InsVal = Op.getOperand(1);
6890 SDValue Idx = Op.getOperand(2);
6891 EVT VecVT = Vec.getValueType();
6892 EVT EltVT = VecVT.getVectorElementType();
6893 unsigned VecSize = VecVT.getSizeInBits();
6894 unsigned EltSize = EltVT.getSizeInBits();
6895 SDLoc SL(Op);
6896
6897 // Specially handle the case of v4i16 with static indexing.
6898 unsigned NumElts = VecVT.getVectorNumElements();
6899 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
6900 if (NumElts == 4 && EltSize == 16 && KIdx) {
6901 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
6902
6903 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6904 DAG.getConstant(0, SL, MVT::i32));
6905 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6906 DAG.getConstant(1, SL, MVT::i32));
6907
6908 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
6909 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
6910
6911 unsigned Idx = KIdx->getZExtValue();
6912 bool InsertLo = Idx < 2;
6913 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
6914 InsertLo ? LoVec : HiVec,
6915 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
6916 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
6917
6918 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
6919
6920 SDValue Concat = InsertLo ?
6921 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
6922 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
6923
6924 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
6925 }
6926
6927 // Static indexing does not lower to stack access, and hence there is no need
6928 // for special custom lowering to avoid stack access.
6929 if (isa<ConstantSDNode>(Idx))
6930 return SDValue();
6931
6932 // Avoid stack access for dynamic indexing by custom lowering to
6933 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
6934
6935 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
6936
6937 MVT IntVT = MVT::getIntegerVT(VecSize);
6938
6939 // Convert vector index to bit-index and get the required bit mask.
6940 assert(isPowerOf2_32(EltSize));
6941 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
6942 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6943 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6944 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
6945 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
6946
6947 // 1. Create a congruent vector with the target value in each element.
6948 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
6949 DAG.getSplatBuildVector(VecVT, SL, InsVal));
6950
6951 // 2. Mask off all other indicies except the required index within (1).
6952 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
6953
6954 // 3. Mask off the required index within the target vector.
6955 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6956 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
6957 DAG.getNOT(SL, BFM, IntVT), BCVec);
6958
6959 // 4. Get (2) and (3) ORed into the target vector.
6960 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
6961
6962 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
6963}
6964
6965SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
6966 SelectionDAG &DAG) const {
6967 SDLoc SL(Op);
6968
6969 EVT ResultVT = Op.getValueType();
6970 SDValue Vec = Op.getOperand(0);
6971 SDValue Idx = Op.getOperand(1);
6972 EVT VecVT = Vec.getValueType();
6973 unsigned VecSize = VecVT.getSizeInBits();
6974 EVT EltVT = VecVT.getVectorElementType();
6975
6976 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
6977
6978 // Make sure we do any optimizations that will make it easier to fold
6979 // source modifiers before obscuring it with bit operations.
6980
6981 // XXX - Why doesn't this get called when vector_shuffle is expanded?
6982 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
6983 return Combined;
6984
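// For 128/256/512-bit vectors, split the vector in half, pick the half that
// holds the element with a select on the index, and extract using the index
// masked into that half's range.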
6985 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
6986 SDValue Lo, Hi;
6987 EVT LoVT, HiVT;
6988 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
6989
6990 if (VecSize == 128) {
6991 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
6992 Lo = DAG.getBitcast(LoVT,
6993 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6994 DAG.getConstant(0, SL, MVT::i32)));
6995 Hi = DAG.getBitcast(HiVT,
6996 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6997 DAG.getConstant(1, SL, MVT::i32)));
6998 } else if (VecSize == 256) {
6999 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7000 SDValue Parts[4];
7001 for (unsigned P = 0; P < 4; ++P) {
7002 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7003 DAG.getConstant(P, SL, MVT::i32));
7004 }
7005
7006 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7007 Parts[0], Parts[1]));
7008 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7009 Parts[2], Parts[3]));
7010 } else {
7011 assert(VecSize == 512);
7012
7013 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7014 SDValue Parts[8];
7015 for (unsigned P = 0; P < 8; ++P) {
7016 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7017 DAG.getConstant(P, SL, MVT::i32));
7018 }
7019
7020 Lo = DAG.getBitcast(LoVT,
7021 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7022 Parts[0], Parts[1], Parts[2], Parts[3]));
7023 Hi = DAG.getBitcast(HiVT,
7024 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
 7025 Parts[4], Parts[5], Parts[6], Parts[7]));
7026 }
7027
7028 EVT IdxVT = Idx.getValueType();
7029 unsigned NElem = VecVT.getVectorNumElements();
7030 assert(isPowerOf2_32(NElem));
7031 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7032 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7033 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
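// E.g. for a 256-bit v8i32 extract: NElem = 8, IdxMask = 3, so indices 4..7
// select Hi and NewIdx = Idx & 3 indexes within the chosen 128-bit half.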
7034 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7035 }
7036
7037 assert(VecSize <= 64);
7038
7039 MVT IntVT = MVT::getIntegerVT(VecSize);
7040
7041 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7042 SDValue VecBC = peekThroughBitcasts(Vec);
7043 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7044 SDValue Src = VecBC.getOperand(0);
7045 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7046 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7047 }
7048
7049 unsigned EltSize = EltVT.getSizeInBits();
7050 assert(isPowerOf2_32(EltSize));
7051
7052 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7053
7054 // Convert vector index to bit-index (* EltSize)
7055 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7056
7057 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7058 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
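// For example, extracting element 2 of a v4i16: the vector is bitcast to i64,
// ScaledIdx = 2 * 16 = 32, the i64 is shifted right by 32, and the low 16 bits
// of Elt are then truncated/bitcast to the result type below.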
7059
7060 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7061 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7062 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7063 }
7064
7065 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7066}
7067
7068static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7069 assert(Elt % 2 == 0);
7070 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7071}
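// For example, with Mask = <4, 5, 2, 3>, the pair starting at element 0 maps
// to source elements 4 and 5: consecutive and starting at an even element, so
// elementPairIsContiguous returns true and the caller below can use a single
// 32-bit extract_subvector instead of two 16-bit element extracts.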
7072
7073SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7074 SelectionDAG &DAG) const {
7075 SDLoc SL(Op);
7076 EVT ResultVT = Op.getValueType();
7077 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7078
7079 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7080 EVT EltVT = PackVT.getVectorElementType();
7081 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7082
7083 // vector_shuffle <0,1,6,7> lhs, rhs
7084 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7085 //
7086 // vector_shuffle <6,7,2,3> lhs, rhs
7087 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7088 //
7089 // vector_shuffle <6,7,0,1> lhs, rhs
7090 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7091
7092 // Avoid scalarizing when both halves are reading from consecutive elements.
 7093 SmallVector<SDValue, 16> Pieces;
 7094 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7095 if (elementPairIsContiguous(SVN->getMask(), I)) {
7096 const int Idx = SVN->getMaskElt(I);
7097 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7098 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7099 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7100 PackVT, SVN->getOperand(VecIdx),
7101 DAG.getConstant(EltIdx, SL, MVT::i32));
7102 Pieces.push_back(SubVec);
7103 } else {
7104 const int Idx0 = SVN->getMaskElt(I);
7105 const int Idx1 = SVN->getMaskElt(I + 1);
7106 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7107 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7108 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7109 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7110
7111 SDValue Vec0 = SVN->getOperand(VecIdx0);
7112 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7113 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7114
7115 SDValue Vec1 = SVN->getOperand(VecIdx1);
7116 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7117 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7118 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7119 }
7120 }
7121
7122 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7123}
7124
7125SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7126 SelectionDAG &DAG) const {
7127 SDValue SVal = Op.getOperand(0);
7128 EVT ResultVT = Op.getValueType();
7129 EVT SValVT = SVal.getValueType();
7130 SDValue UndefVal = DAG.getUNDEF(SValVT);
7131 SDLoc SL(Op);
7132
 7133 SmallVector<SDValue, 16> VElts;
 7134 VElts.push_back(SVal);
7135 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7136 VElts.push_back(UndefVal);
7137
7138 return DAG.getBuildVector(ResultVT, SL, VElts);
7139}
7140
7141SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7142 SelectionDAG &DAG) const {
7143 SDLoc SL(Op);
7144 EVT VT = Op.getValueType();
7145
7146 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7147 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
 7148 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
 7149 VT.getVectorNumElements() / 2);
7150 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7151
7152 // Turn into pair of packed build_vectors.
7153 // TODO: Special case for constants that can be materialized with s_mov_b64.
7154 SmallVector<SDValue, 4> LoOps, HiOps;
7155 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7156 LoOps.push_back(Op.getOperand(I));
7157 HiOps.push_back(Op.getOperand(I + E));
7158 }
7159 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7160 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7161
7162 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7163 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7164
7165 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7166 { CastLo, CastHi });
7167 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7168 }
7169
7170 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
 7171 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
 7172 VT.getVectorNumElements() / 4);
7173 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7174
7175 SmallVector<SDValue, 4> Parts[4];
7176 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7177 for (unsigned P = 0; P < 4; ++P)
7178 Parts[P].push_back(Op.getOperand(I + P * E));
7179 }
7180 SDValue Casts[4];
7181 for (unsigned P = 0; P < 4; ++P) {
7182 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7183 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7184 }
7185
7186 SDValue Blend =
7187 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7188 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7189 }
7190
7191 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
 7192 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
 7193 VT.getVectorNumElements() / 8);
7194 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7195
7196 SmallVector<SDValue, 8> Parts[8];
7197 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7198 for (unsigned P = 0; P < 8; ++P)
7199 Parts[P].push_back(Op.getOperand(I + P * E));
7200 }
7201 SDValue Casts[8];
7202 for (unsigned P = 0; P < 8; ++P) {
7203 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7204 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7205 }
7206
7207 SDValue Blend =
7208 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7209 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7210 }
7211
7212 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7213 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7214
7215 SDValue Lo = Op.getOperand(0);
7216 SDValue Hi = Op.getOperand(1);
7217
7218 // Avoid adding defined bits with the zero_extend.
7219 if (Hi.isUndef()) {
7220 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7221 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7222 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7223 }
7224
7225 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7226 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7227
7228 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7229 DAG.getConstant(16, SL, MVT::i32));
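// ShlHi now holds Hi in bits [31:16] with zeros below; if Lo is defined it is
// zero-extended and OR'd in below to form the packed 32-bit value.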
7230 if (Lo.isUndef())
7231 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7232
7233 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7234 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7235
7236 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7237 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7238}
7239
 7240 bool
 7241 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7242 // OSes that use ELF REL relocations (instead of RELA) can only store a
7243 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7244 // which can create arbitrary 64-bit addends. (This is only a problem for
7245 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7246 // the high 32 bits of the addend.)
7247 //
7248 // This should be kept in sync with how HasRelocationAddend is initialized in
7249 // the constructor of ELFAMDGPUAsmBackend.
7250 if (!Subtarget->isAmdHsaOS())
7251 return false;
7252
7253 // We can fold offsets for anything that doesn't require a GOT relocation.
 7254 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
 7255 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
 7256 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
 7257 !shouldEmitGOTReloc(GA->getGlobal());
7258}
7259
 7260 static SDValue
 7261 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7262 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7263 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7264 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7265 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7266 // lowered to the following code sequence:
7267 //
7268 // For constant address space:
7269 // s_getpc_b64 s[0:1]
7270 // s_add_u32 s0, s0, $symbol
7271 // s_addc_u32 s1, s1, 0
7272 //
7273 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7274 // a fixup or relocation is emitted to replace $symbol with a literal
7275 // constant, which is a pc-relative offset from the encoding of the $symbol
7276 // operand to the global variable.
7277 //
7278 // For global address space:
7279 // s_getpc_b64 s[0:1]
7280 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7281 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7282 //
7283 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7284 // fixups or relocations are emitted to replace $symbol@*@lo and
7285 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7286 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7287 // operand to the global variable.
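// The node built below is PC_ADD_REL_OFFSET(PtrLo, PtrHi). In the fixup case
// (GAFlags == SIInstrInfo::MO_NONE) the high half is simply the constant 0,
// so only the 32-bit s_add_u32 relocation is emitted.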
7288 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7289 SDValue PtrHi;
7290 if (GAFlags == SIInstrInfo::MO_NONE)
7291 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7292 else
7293 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7294 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7295}
7296
7297SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7298 SDValue Op,
7299 SelectionDAG &DAG) const {
7300 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7301 SDLoc DL(GSD);
7302 EVT PtrVT = Op.getValueType();
7303
 7304 const GlobalValue *GV = GSD->getGlobal();
 7305 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
 7306 shouldUseLDSConstAddress(GV)) ||
 7307 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
 7308 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
 7309 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7310 GV->hasExternalLinkage()) {
7311 Type *Ty = GV->getValueType();
 7312 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
 7313 // zero-sized type in other languages) to declare dynamic shared memory
 7314 // whose size is not known at compile time. Such arrays are allocated by
 7315 // the runtime and placed directly after the statically allocated LDS,
 7316 // so they all share the same offset.
7317 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7318 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7319 // Adjust alignment for that dynamic shared memory array.
 7320 Function &F = DAG.getMachineFunction().getFunction();
 7321 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7322 MFI->setUsesDynamicLDS(true);
7323 return SDValue(
7324 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7325 }
 7326 }
 7327 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7328 }
7329
 7330 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
 7331 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
 7332 SIInstrInfo::MO_ABS32_LO);
7333 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7334 }
7335
7336 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7337 SDValue AddrLo = DAG.getTargetGlobalAddress(
7338 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7339 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7340
7341 SDValue AddrHi = DAG.getTargetGlobalAddress(
7342 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7343 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7344
7345 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7346 }
7347
7348 if (shouldEmitFixup(GV))
7349 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7350
7351 if (shouldEmitPCReloc(GV))
7352 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7354
7355 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7357
 7358 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
 7359 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7360 const DataLayout &DataLayout = DAG.getDataLayout();
7361 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
 7362 MachinePointerInfo PtrInfo
 7363 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7364
7365 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
 7366 MachineMemOperand::MODereferenceable |
 7367 MachineMemOperand::MOInvariant);
 7368}
7369
 7370 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
 7371 const SDLoc &DL, SDValue V) const {
7372 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7373 // the destination register.
7374 //
7375 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7376 // so we will end up with redundant moves to m0.
7377 //
7378 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7379
7380 // A Null SDValue creates a glue result.
7381 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7382 V, Chain);
7383 return SDValue(M0, 0);
7384}
7385
7386SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7387 SDValue Op,
7388 MVT VT,
7389 unsigned Offset) const {
7390 SDLoc SL(Op);
7391 SDValue Param = lowerKernargMemParameter(
7392 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7393 // The local size values will have the hi 16-bits as zero.
7394 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7395 DAG.getValueType(VT));
7396}
7397
 7398 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 7399 EVT VT) {
 7400 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7401 "non-hsa intrinsic with hsa target",
7402 DL.getDebugLoc());
7403 DAG.getContext()->diagnose(BadIntrin);
7404 return DAG.getUNDEF(VT);
7405}
7406
 7407 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 7408 EVT VT) {
 7409 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7410 "intrinsic not supported on subtarget",
7411 DL.getDebugLoc());
7412 DAG.getContext()->diagnose(BadIntrin);
7413 return DAG.getUNDEF(VT);
7414}
7415
 7416 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
 7417 ArrayRef<SDValue> Elts) {
7418 assert(!Elts.empty());
7419 MVT Type;
7420 unsigned NumElts = Elts.size();
7421
7422 if (NumElts <= 12) {
7423 Type = MVT::getVectorVT(MVT::f32, NumElts);
7424 } else {
7425 assert(Elts.size() <= 16);
7426 Type = MVT::v16f32;
7427 NumElts = 16;
7428 }
7429
7430 SmallVector<SDValue, 16> VecElts(NumElts);
7431 for (unsigned i = 0; i < Elts.size(); ++i) {
7432 SDValue Elt = Elts[i];
7433 if (Elt.getValueType() != MVT::f32)
7434 Elt = DAG.getBitcast(MVT::f32, Elt);
7435 VecElts[i] = Elt;
7436 }
7437 for (unsigned i = Elts.size(); i < NumElts; ++i)
7438 VecElts[i] = DAG.getUNDEF(MVT::f32);
7439
7440 if (NumElts == 1)
7441 return VecElts[0];
7442 return DAG.getBuildVector(Type, DL, VecElts);
7443}
7444
7445static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7446 SDValue Src, int ExtraElts) {
7447 EVT SrcVT = Src.getValueType();
 7448
 7449 SmallVector<SDValue, 8> Elts;
 7450
 7451 if (SrcVT.isVector())
7452 DAG.ExtractVectorElements(Src, Elts);
7453 else
7454 Elts.push_back(Src);
7455
7456 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7457 while (ExtraElts--)
7458 Elts.push_back(Undef);
7459
7460 return DAG.getBuildVector(CastVT, DL, Elts);
7461}
7462
 7463 // Re-construct the required return value for an image load intrinsic.
 7464 // This is more complicated due to the optional use of TexFailCtrl, which means
 7465 // the required return type is an aggregate.
7467 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7468 bool Unpacked, bool IsD16, int DMaskPop,
7469 int NumVDataDwords, bool IsAtomicPacked16Bit,
7470 const SDLoc &DL) {
7471 // Determine the required return type. This is the same regardless of IsTexFail flag
7472 EVT ReqRetVT = ResultTypes[0];
7473 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7474 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7475 ? (ReqRetNumElts + 1) / 2
7476 : ReqRetNumElts;
7477
7478 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7479 DMaskPop : (DMaskPop + 1) / 2;
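// For example, a packed D16 load with dmask 0x7 returning <3 x half> has
// ReqRetNumElts = DMaskPop = 3, so NumDataDwords = MaskPopDwords = 2: the
// three 16-bit results are packed into two 32-bit dwords.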
7480
7481 MVT DataDwordVT = NumDataDwords == 1 ?
7482 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7483
7484 MVT MaskPopVT = MaskPopDwords == 1 ?
7485 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7486
7487 SDValue Data(Result, 0);
7488 SDValue TexFail;
7489
7490 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7491 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7492 if (MaskPopVT.isVector()) {
7493 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7494 SDValue(Result, 0), ZeroIdx);
7495 } else {
7496 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7497 SDValue(Result, 0), ZeroIdx);
7498 }
7499 }
7500
7501 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7502 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7503 NumDataDwords - MaskPopDwords);
7504
7505 if (IsD16)
7506 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7507
7508 EVT LegalReqRetVT = ReqRetVT;
7509 if (!ReqRetVT.isVector()) {
7510 if (!Data.getValueType().isInteger())
7511 Data = DAG.getNode(ISD::BITCAST, DL,
7512 Data.getValueType().changeTypeToInteger(), Data);
7513 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7514 } else {
7515 // We need to widen the return vector to a legal type
7516 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7517 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7518 LegalReqRetVT =
7520 ReqRetVT.getVectorNumElements() + 1);
7521 }
7522 }
7523 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7524
7525 if (IsTexFail) {
7526 TexFail =
7527 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7528 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7529
7530 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7531 }
7532
7533 if (Result->getNumValues() == 1)
7534 return Data;
7535
7536 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7537}
7538
7539static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7540 SDValue *LWE, bool &IsTexFail) {
7541 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7542
7543 uint64_t Value = TexFailCtrlConst->getZExtValue();
7544 if (Value) {
7545 IsTexFail = true;
7546 }
7547
7548 SDLoc DL(TexFailCtrlConst);
7549 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7550 Value &= ~(uint64_t)0x1;
7551 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7552 Value &= ~(uint64_t)0x2;
7553
7554 return Value == 0;
7555}
7556
7558 MVT PackVectorVT,
7559 SmallVectorImpl<SDValue> &PackedAddrs,
7560 unsigned DimIdx, unsigned EndIdx,
7561 unsigned NumGradients) {
7562 SDLoc DL(Op);
7563 for (unsigned I = DimIdx; I < EndIdx; I++) {
7564 SDValue Addr = Op.getOperand(I);
7565
7566 // Gradients are packed with undef for each coordinate.
7567 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7568 // 1D: undef,dx/dh; undef,dx/dv
7569 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7570 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
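// That is, each value is packed together with the following operand into one
// dword, except for the last operand in the range and the odd leftover at the
// end of an odd-sized gradient half, which are any-extended to a dword each.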
7571 if (((I + 1) >= EndIdx) ||
7572 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7573 I == DimIdx + NumGradients - 1))) {
7574 if (Addr.getValueType() != MVT::i16)
7575 Addr = DAG.getBitcast(MVT::i16, Addr);
7576 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7577 } else {
7578 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7579 I++;
7580 }
7581 Addr = DAG.getBitcast(MVT::f32, Addr);
7582 PackedAddrs.push_back(Addr);
7583 }
7584}
7585
7586SDValue SITargetLowering::lowerImage(SDValue Op,
7588 SelectionDAG &DAG, bool WithChain) const {
7589 SDLoc DL(Op);
7591 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7592 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7594 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7595 unsigned IntrOpcode = Intr->BaseOpcode;
7596 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7597 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7598 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7599
7600 SmallVector<EVT, 3> ResultTypes(Op->values());
7601 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7602 bool IsD16 = false;
7603 bool IsG16 = false;
7604 bool IsA16 = false;
7605 SDValue VData;
7606 int NumVDataDwords;
7607 bool AdjustRetType = false;
7608 bool IsAtomicPacked16Bit = false;
7609
7610 // Offset of intrinsic arguments
7611 const unsigned ArgOffset = WithChain ? 2 : 1;
7612
7613 unsigned DMask;
7614 unsigned DMaskLanes = 0;
7615
7616 if (BaseOpcode->Atomic) {
7617 VData = Op.getOperand(2);
7618
7619 IsAtomicPacked16Bit =
7620 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7621 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7622
7623 bool Is64Bit = VData.getValueSizeInBits() == 64;
7624 if (BaseOpcode->AtomicX2) {
7625 SDValue VData2 = Op.getOperand(3);
7626 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7627 {VData, VData2});
7628 if (Is64Bit)
7629 VData = DAG.getBitcast(MVT::v4i32, VData);
7630
7631 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7632 DMask = Is64Bit ? 0xf : 0x3;
7633 NumVDataDwords = Is64Bit ? 4 : 2;
7634 } else {
7635 DMask = Is64Bit ? 0x3 : 0x1;
7636 NumVDataDwords = Is64Bit ? 2 : 1;
7637 }
7638 } else {
7639 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7640 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7641
7642 if (BaseOpcode->Store) {
7643 VData = Op.getOperand(2);
7644
7645 MVT StoreVT = VData.getSimpleValueType();
7646 if (StoreVT.getScalarType() == MVT::f16) {
7647 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7648 return Op; // D16 is unsupported for this instruction
7649
7650 IsD16 = true;
7651 VData = handleD16VData(VData, DAG, true);
7652 }
7653
7654 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7655 } else {
7656 // Work out the num dwords based on the dmask popcount and underlying type
7657 // and whether packing is supported.
7658 MVT LoadVT = ResultTypes[0].getSimpleVT();
7659 if (LoadVT.getScalarType() == MVT::f16) {
7660 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7661 return Op; // D16 is unsupported for this instruction
7662
7663 IsD16 = true;
7664 }
7665
7666 // Confirm that the return type is large enough for the dmask specified
7667 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7668 (!LoadVT.isVector() && DMaskLanes > 1))
7669 return Op;
7670
7671 // The sq block of gfx8 and gfx9 do not estimate register use correctly
7672 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7673 // instructions.
7674 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7675 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7676 NumVDataDwords = (DMaskLanes + 1) / 2;
7677 else
7678 NumVDataDwords = DMaskLanes;
7679
7680 AdjustRetType = true;
7681 }
7682 }
7683
 7684 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
 7685 SmallVector<SDValue, 4> VAddrs;
7686
7687 // Check for 16 bit addresses or derivatives and pack if true.
7688 MVT VAddrVT =
7689 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7690 MVT VAddrScalarVT = VAddrVT.getScalarType();
7691 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7692 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7693
7694 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7695 VAddrScalarVT = VAddrVT.getScalarType();
7696 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7697 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7698
7699 // Push back extra arguments.
7700 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7701 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7702 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
 7703 // Special handling of bias when A16 is on. Bias is of type half but
 7704 // occupies a full 32-bit dword.
7705 SDValue Bias = DAG.getBuildVector(
7706 MVT::v2f16, DL,
7707 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7708 VAddrs.push_back(Bias);
7709 } else {
7710 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7711 "Bias needs to be converted to 16 bit in A16 mode");
7712 VAddrs.push_back(Op.getOperand(ArgOffset + I));
7713 }
7714 }
7715
7716 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
7717 // 16 bit gradients are supported, but are tied to the A16 control
7718 // so both gradients and addresses must be 16 bit
7719 LLVM_DEBUG(
7720 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7721 "require 16 bit args for both gradients and addresses");
7722 return Op;
7723 }
7724
7725 if (IsA16) {
7726 if (!ST->hasA16()) {
7727 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7728 "support 16 bit addresses\n");
7729 return Op;
7730 }
7731 }
7732
7733 // We've dealt with incorrect input so we know that if IsA16, IsG16
7734 // are set then we have to compress/pack operands (either address,
7735 // gradient or both)
7736 // In the case where a16 and gradients are tied (no G16 support) then we
7737 // have already verified that both IsA16 and IsG16 are true
7738 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7739 // Activate g16
7740 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
 7741 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
 7742 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7743 }
7744
7745 // Add gradients (packed or unpacked)
7746 if (IsG16) {
7747 // Pack the gradients
7748 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7749 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
7750 ArgOffset + Intr->GradientStart,
7751 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7752 } else {
7753 for (unsigned I = ArgOffset + Intr->GradientStart;
7754 I < ArgOffset + Intr->CoordStart; I++)
7755 VAddrs.push_back(Op.getOperand(I));
7756 }
7757
7758 // Add addresses (packed or unpacked)
7759 if (IsA16) {
7760 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
7761 ArgOffset + Intr->CoordStart, VAddrEnd,
7762 0 /* No gradients */);
7763 } else {
7764 // Add uncompressed address
7765 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7766 VAddrs.push_back(Op.getOperand(I));
7767 }
7768
7769 // If the register allocator cannot place the address registers contiguously
7770 // without introducing moves, then using the non-sequential address encoding
7771 // is always preferable, since it saves VALU instructions and is usually a
7772 // wash in terms of code size or even better.
7773 //
7774 // However, we currently have no way of hinting to the register allocator that
7775 // MIMG addresses should be placed contiguously when it is possible to do so,
7776 // so force non-NSA for the common 2-address case as a heuristic.
7777 //
7778 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7779 // allocation when possible.
7780 //
7781 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7782 // set of the remaining addresses.
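// For example, if NSAMaxSize were 5 and there were 7 address dwords, partial
// NSA would keep the first NSAMaxSize - 1 = 4 addresses as separate operands
// and pack the remaining 3 dwords into one contiguous register (VAddr below).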
7783 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7784 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7785 const bool UseNSA = ST->hasNSAEncoding() &&
7786 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7787 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7788 const bool UsePartialNSA =
7789 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
7790
7791 SDValue VAddr;
7792 if (UsePartialNSA) {
7793 VAddr = getBuildDwordsVector(DAG, DL,
7794 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7795 }
7796 else if (!UseNSA) {
7797 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
7798 }
7799
7800 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7801 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7802 SDValue Unorm;
7803 if (!BaseOpcode->Sampler) {
7804 Unorm = True;
7805 } else {
7806 uint64_t UnormConst =
7807 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
7808
7809 Unorm = UnormConst ? True : False;
7810 }
7811
7812 SDValue TFE;
7813 SDValue LWE;
7814 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7815 bool IsTexFail = false;
7816 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7817 return Op;
7818
7819 if (IsTexFail) {
7820 if (!DMaskLanes) {
7821 // Expecting to get an error flag since TFC is on - and dmask is 0
7822 // Force dmask to be at least 1 otherwise the instruction will fail
7823 DMask = 0x1;
7824 DMaskLanes = 1;
7825 NumVDataDwords = 1;
7826 }
7827 NumVDataDwords += 1;
7828 AdjustRetType = true;
7829 }
7830
 7831 // Something earlier has tagged that the return type needs adjusting.
 7832 // This happens if the instruction is a load or has set TexFailCtrl flags.
7833 if (AdjustRetType) {
7834 // NumVDataDwords reflects the true number of dwords required in the return type
7835 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7836 // This is a no-op load. This can be eliminated
7837 SDValue Undef = DAG.getUNDEF(Op.getValueType());
7838 if (isa<MemSDNode>(Op))
7839 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
7840 return Undef;
7841 }
7842
7843 EVT NewVT = NumVDataDwords > 1 ?
7844 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7845 : MVT::i32;
7846
7847 ResultTypes[0] = NewVT;
7848 if (ResultTypes.size() == 3) {
7849 // Original result was aggregate type used for TexFailCtrl results
7850 // The actual instruction returns as a vector type which has now been
7851 // created. Remove the aggregate result.
7852 ResultTypes.erase(&ResultTypes[1]);
7853 }
7854 }
7855
7856 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7857 if (BaseOpcode->Atomic)
7858 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
 7859 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
 7860 AMDGPU::CPol::VOLATILE))
7861 return Op;
7862
 7863 SmallVector<SDValue, 26> Ops;
 7864 if (BaseOpcode->Store || BaseOpcode->Atomic)
7865 Ops.push_back(VData); // vdata
7866 if (UsePartialNSA) {
7867 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
7868 Ops.push_back(VAddr);
7869 }
7870 else if (UseNSA)
7871 append_range(Ops, VAddrs);
7872 else
7873 Ops.push_back(VAddr);
7874 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
7875 if (BaseOpcode->Sampler)
7876 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
7877 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7878 if (IsGFX10Plus)
7879 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7880 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7881 Ops.push_back(Unorm);
7882 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7883 Ops.push_back(IsA16 && // r128, a16 for gfx9
7884 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7885 if (IsGFX10Plus)
7886 Ops.push_back(IsA16 ? True : False);
7887 if (!Subtarget->hasGFX90AInsts()) {
7888 Ops.push_back(TFE); //tfe
7889 } else if (TFE->getAsZExtVal()) {
7890 report_fatal_error("TFE is not supported on this GPU");
7891 }
7892 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7893 Ops.push_back(LWE); // lwe
7894 if (!IsGFX10Plus)
7895 Ops.push_back(DimInfo->DA ? True : False);
7896 if (BaseOpcode->HasD16)
7897 Ops.push_back(IsD16 ? True : False);
7898 if (isa<MemSDNode>(Op))
7899 Ops.push_back(Op.getOperand(0)); // chain
7900
7901 int NumVAddrDwords =
7902 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
7903 int Opcode = -1;
7904
7905 if (IsGFX12Plus) {
7906 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
7907 NumVDataDwords, NumVAddrDwords);
7908 } else if (IsGFX11Plus) {
7909 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7910 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7911 : AMDGPU::MIMGEncGfx11Default,
7912 NumVDataDwords, NumVAddrDwords);
7913 } else if (IsGFX10Plus) {
7914 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7915 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7916 : AMDGPU::MIMGEncGfx10Default,
7917 NumVDataDwords, NumVAddrDwords);
7918 } else {
7919 if (Subtarget->hasGFX90AInsts()) {
7920 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
7921 NumVDataDwords, NumVAddrDwords);
7922 if (Opcode == -1)
7924 "requested image instruction is not supported on this GPU");
7925 }
7926 if (Opcode == -1 &&
 7927 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 7928 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
7929 NumVDataDwords, NumVAddrDwords);
7930 if (Opcode == -1)
7931 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
7932 NumVDataDwords, NumVAddrDwords);
7933 }
7934 if (Opcode == -1)
7935 return Op;
7936
7937 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
7938 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
7939 MachineMemOperand *MemRef = MemOp->getMemOperand();
7940 DAG.setNodeMemRefs(NewNode, {MemRef});
7941 }
7942
7943 if (BaseOpcode->AtomicX2) {
 7944 SmallVector<SDValue, 1> Elt;
 7945 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
7946 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
7947 }
7948 if (BaseOpcode->Store)
7949 return SDValue(NewNode, 0);
7950 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
7951 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
7952 NumVDataDwords, IsAtomicPacked16Bit, DL);
7953}
7954
7955SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
7956 SDValue Offset, SDValue CachePolicy,
 7957 SelectionDAG &DAG) const {
 7958 MachineFunction &MF = DAG.getMachineFunction();
7959
7960 const DataLayout &DataLayout = DAG.getDataLayout();
 7961 Align Alignment =
 7962 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7963
 7964 MachineMemOperand *MMO = MF.getMachineMemOperand(
 7965 MachinePointerInfo(),
 7966 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
 7967 MachineMemOperand::MOInvariant,
 7968 VT.getStoreSize(), Alignment);
7969
7970 if (!Offset->isDivergent()) {
7971 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
7972
7973 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
7974 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
7975 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
7976 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
7977 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
7978 SDValue BufferLoad =
7980 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7981 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7982 }
7983
7984 // Widen vec3 load to vec4.
7985 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
7986 !Subtarget->hasScalarDwordx3Loads()) {
7987 EVT WidenedVT =
7989 auto WidenedOp = DAG.getMemIntrinsicNode(
7990 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
7991 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
7992 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
7993 DAG.getVectorIdxConstant(0, DL));
7994 return Subvector;
7995 }
7996
7998 DAG.getVTList(VT), Ops, VT, MMO);
7999 }
8000
8001 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8002 // assume that the buffer is unswizzled.
8003 SDValue Ops[] = {
8004 DAG.getEntryNode(), // Chain
8005 Rsrc, // rsrc
8006 DAG.getConstant(0, DL, MVT::i32), // vindex
8007 {}, // voffset
8008 {}, // soffset
8009 {}, // offset
8010 CachePolicy, // cachepolicy
8011 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8012 };
8013 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8014 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8015 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8016 }
8017
 8018 SmallVector<SDValue, 4> Loads;
 8019 unsigned NumLoads = 1;
8020 MVT LoadVT = VT.getSimpleVT();
8021 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8022 assert((LoadVT.getScalarType() == MVT::i32 ||
8023 LoadVT.getScalarType() == MVT::f32));
8024
8025 if (NumElts == 8 || NumElts == 16) {
8026 NumLoads = NumElts / 4;
8027 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8028 }
8029
8030 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8031
8032 // Use the alignment to ensure that the required offsets will fit into the
8033 // immediate offsets.
8034 setBufferOffsets(Offset, DAG, &Ops[3],
8035 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8036
8037 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8038 for (unsigned i = 0; i < NumLoads; ++i) {
8039 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8040 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8041 LoadVT, MMO, DAG));
8042 }
8043
8044 if (NumElts == 8 || NumElts == 16)
8045 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8046
8047 return Loads[0];
8048}
8049
8050SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8051 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8052 if (!Subtarget->hasArchitectedSGPRs())
8053 return {};
8054 SDLoc SL(Op);
8055 MVT VT = MVT::i32;
8056 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8057 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8058 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8059}
8060
8061SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8062 unsigned Dim,
8063 const ArgDescriptor &Arg) const {
8064 SDLoc SL(Op);
8066 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8067 if (MaxID == 0)
8068 return DAG.getConstant(0, SL, MVT::i32);
8069
8070 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8071 SDLoc(DAG.getEntryNode()), Arg);
8072
8073 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8074 // masking operations anyway.
8075 //
8076 // TODO: We could assert the top bit is 0 for the source copy.
8077 if (Arg.isMasked())
8078 return Val;
8079
8080 // Preserve the known bits after expansion to a copy.
 8081 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), 32 - llvm::countl_zero(MaxID));
 8082 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8083 DAG.getValueType(SmallVT));
8084}
8085
8086SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8087 SelectionDAG &DAG) const {
8089 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8090
8091 EVT VT = Op.getValueType();
8092 SDLoc DL(Op);
8093 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8094
8095 // TODO: Should this propagate fast-math-flags?
8096
8097 switch (IntrinsicID) {
8098 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8099 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8100 return emitNonHSAIntrinsicError(DAG, DL, VT);
8101 return getPreloadedValue(DAG, *MFI, VT,
8103 }
8104 case Intrinsic::amdgcn_dispatch_ptr:
8105 case Intrinsic::amdgcn_queue_ptr: {
8106 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8107 DiagnosticInfoUnsupported BadIntrin(
8108 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8109 DL.getDebugLoc());
8110 DAG.getContext()->diagnose(BadIntrin);
8111 return DAG.getUNDEF(VT);
8112 }
8113
8114 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8116 return getPreloadedValue(DAG, *MFI, VT, RegID);
8117 }
8118 case Intrinsic::amdgcn_implicitarg_ptr: {
8119 if (MFI->isEntryFunction())
8120 return getImplicitArgPtr(DAG, DL);
8121 return getPreloadedValue(DAG, *MFI, VT,
8123 }
8124 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8126 // This only makes sense to call in a kernel, so just lower to null.
8127 return DAG.getConstant(0, DL, VT);
8128 }
8129
8130 return getPreloadedValue(DAG, *MFI, VT,
8132 }
8133 case Intrinsic::amdgcn_dispatch_id: {
8134 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8135 }
8136 case Intrinsic::amdgcn_rcp:
8137 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8138 case Intrinsic::amdgcn_rsq:
8139 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8140 case Intrinsic::amdgcn_rsq_legacy:
8142 return emitRemovedIntrinsicError(DAG, DL, VT);
8143 return SDValue();
8144 case Intrinsic::amdgcn_rcp_legacy:
8146 return emitRemovedIntrinsicError(DAG, DL, VT);
8147 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8148 case Intrinsic::amdgcn_rsq_clamp: {
8150 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8151
8152 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8155
8156 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8157 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8158 DAG.getConstantFP(Max, DL, VT));
8159 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8160 DAG.getConstantFP(Min, DL, VT));
8161 }
8162 case Intrinsic::r600_read_ngroups_x:
8163 if (Subtarget->isAmdHsaOS())
8164 return emitNonHSAIntrinsicError(DAG, DL, VT);
8165
8166 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8168 false);
8169 case Intrinsic::r600_read_ngroups_y:
8170 if (Subtarget->isAmdHsaOS())
8171 return emitNonHSAIntrinsicError(DAG, DL, VT);
8172
8173 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8175 false);
8176 case Intrinsic::r600_read_ngroups_z:
8177 if (Subtarget->isAmdHsaOS())
8178 return emitNonHSAIntrinsicError(DAG, DL, VT);
8179
8180 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8182 false);
8183 case Intrinsic::r600_read_global_size_x:
8184 if (Subtarget->isAmdHsaOS())
8185 return emitNonHSAIntrinsicError(DAG, DL, VT);
8186
8187 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8189 Align(4), false);
8190 case Intrinsic::r600_read_global_size_y:
8191 if (Subtarget->isAmdHsaOS())
8192 return emitNonHSAIntrinsicError(DAG, DL, VT);
8193
8194 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8196 Align(4), false);
8197 case Intrinsic::r600_read_global_size_z:
8198 if (Subtarget->isAmdHsaOS())
8199 return emitNonHSAIntrinsicError(DAG, DL, VT);
8200
8201 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8203 Align(4), false);
8204 case Intrinsic::r600_read_local_size_x:
8205 if (Subtarget->isAmdHsaOS())
8206 return emitNonHSAIntrinsicError(DAG, DL, VT);
8207
8208 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8210 case Intrinsic::r600_read_local_size_y:
8211 if (Subtarget->isAmdHsaOS())
8212 return emitNonHSAIntrinsicError(DAG, DL, VT);
8213
8214 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8216 case Intrinsic::r600_read_local_size_z:
8217 if (Subtarget->isAmdHsaOS())
8218 return emitNonHSAIntrinsicError(DAG, DL, VT);
8219
8220 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8222 case Intrinsic::amdgcn_workgroup_id_x:
8223 return getPreloadedValue(DAG, *MFI, VT,
8225 case Intrinsic::amdgcn_workgroup_id_y:
8226 return getPreloadedValue(DAG, *MFI, VT,
8228 case Intrinsic::amdgcn_workgroup_id_z:
8229 return getPreloadedValue(DAG, *MFI, VT,
8231 case Intrinsic::amdgcn_wave_id:
8232 return lowerWaveID(DAG, Op);
8233 case Intrinsic::amdgcn_lds_kernel_id: {
8234 if (MFI->isEntryFunction())
8235 return getLDSKernelId(DAG, DL);
8236 return getPreloadedValue(DAG, *MFI, VT,
8238 }
8239 case Intrinsic::amdgcn_workitem_id_x:
8240 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8241 case Intrinsic::amdgcn_workitem_id_y:
8242 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8243 case Intrinsic::amdgcn_workitem_id_z:
8244 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8245 case Intrinsic::amdgcn_wavefrontsize:
8247 SDLoc(Op), MVT::i32);
8248 case Intrinsic::amdgcn_s_buffer_load: {
8249 unsigned CPol = Op.getConstantOperandVal(3);
8250 // s_buffer_load, because of how it's optimized, can't be volatile
8251 // so reject ones with the volatile bit set.
8252 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8255 return Op;
8256 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8257 DAG);
8258 }
8259 case Intrinsic::amdgcn_fdiv_fast:
8260 return lowerFDIV_FAST(Op, DAG);
8261 case Intrinsic::amdgcn_sin:
8262 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8263
8264 case Intrinsic::amdgcn_cos:
8265 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8266
8267 case Intrinsic::amdgcn_mul_u24:
8268 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8269 case Intrinsic::amdgcn_mul_i24:
8270 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8271
8272 case Intrinsic::amdgcn_log_clamp: {
8274 return SDValue();
8275
8276 return emitRemovedIntrinsicError(DAG, DL, VT);
8277 }
8278 case Intrinsic::amdgcn_fract:
8279 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8280
8281 case Intrinsic::amdgcn_class:
8282 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8283 Op.getOperand(1), Op.getOperand(2));
8284 case Intrinsic::amdgcn_div_fmas:
8285 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8286 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8287 Op.getOperand(4));
8288
8289 case Intrinsic::amdgcn_div_fixup:
8290 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8291 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8292
8293 case Intrinsic::amdgcn_div_scale: {
8294 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8295
8296 // Translate to the operands expected by the machine instruction. The
8297 // first parameter must be the same as the first instruction.
8298 SDValue Numerator = Op.getOperand(1);
8299 SDValue Denominator = Op.getOperand(2);
8300
8301 // Note this order is opposite of the machine instruction's operations,
8302 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8303 // intrinsic has the numerator as the first operand to match a normal
8304 // division operation.
8305
8306 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8307
8308 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8309 Denominator, Numerator);
8310 }
8311 case Intrinsic::amdgcn_icmp: {
8312 // There is a Pat that handles this variant, so return it as-is.
8313 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8314 Op.getConstantOperandVal(2) == 0 &&
8315 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8316 return Op;
8317 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8318 }
8319 case Intrinsic::amdgcn_fcmp: {
8320 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8321 }
8322 case Intrinsic::amdgcn_ballot:
8323 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8324 case Intrinsic::amdgcn_fmed3:
8325 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8326 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8327 case Intrinsic::amdgcn_fdot2:
8328 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8329 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8330 Op.getOperand(4));
8331 case Intrinsic::amdgcn_fmul_legacy:
8332 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8333 Op.getOperand(1), Op.getOperand(2));
8334 case Intrinsic::amdgcn_sffbh:
8335 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8336 case Intrinsic::amdgcn_sbfe:
8337 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8338 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8339 case Intrinsic::amdgcn_ubfe:
8340 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8341 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8342 case Intrinsic::amdgcn_cvt_pkrtz:
8343 case Intrinsic::amdgcn_cvt_pknorm_i16:
8344 case Intrinsic::amdgcn_cvt_pknorm_u16:
8345 case Intrinsic::amdgcn_cvt_pk_i16:
8346 case Intrinsic::amdgcn_cvt_pk_u16: {
8347 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8348 EVT VT = Op.getValueType();
8349 unsigned Opcode;
8350
8351 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8353 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8355 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8357 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8359 else
8361
8362 if (isTypeLegal(VT))
8363 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8364
8365 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8366 Op.getOperand(1), Op.getOperand(2));
8367 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8368 }
8369 case Intrinsic::amdgcn_fmad_ftz:
8370 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8371 Op.getOperand(2), Op.getOperand(3));
8372
8373 case Intrinsic::amdgcn_if_break:
8374 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8375 Op->getOperand(1), Op->getOperand(2)), 0);
8376
8377 case Intrinsic::amdgcn_groupstaticsize: {
8379 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8380 return Op;
8381
8382 const Module *M = MF.getFunction().getParent();
8383 const GlobalValue *GV =
8384 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8385 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8387 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8388 }
8389 case Intrinsic::amdgcn_is_shared:
8390 case Intrinsic::amdgcn_is_private: {
8391 SDLoc SL(Op);
8392 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8394 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8395 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8396 Op.getOperand(1));
8397
8398 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8399 DAG.getConstant(1, SL, MVT::i32));
8400 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8401 }
8402 case Intrinsic::amdgcn_perm:
8403 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8404 Op.getOperand(2), Op.getOperand(3));
8405 case Intrinsic::amdgcn_reloc_constant: {
8406 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8407 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8408 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8409 auto RelocSymbol = cast<GlobalVariable>(
8410 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8411 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8413 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8414 }
8415 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8416 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8417 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8418 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8419 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8420 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8421 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8422 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8423 if (Op.getOperand(4).getValueType() == MVT::i32)
8424 return SDValue();
8425
8426 SDLoc SL(Op);
8427 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8428 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8429 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8430 Op.getOperand(3), IndexKeyi32);
8431 }
8432 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8433 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8434 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8435 if (Op.getOperand(6).getValueType() == MVT::i32)
8436 return SDValue();
8437
8438 SDLoc SL(Op);
8439 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8440 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8441 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8442 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8443 IndexKeyi32, Op.getOperand(7)});
8444 }
8445 case Intrinsic::amdgcn_addrspacecast_nonnull:
8446 return lowerADDRSPACECAST(Op, DAG);
8447 default:
8448 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8450 return lowerImage(Op, ImageDimIntr, DAG, false);
8451
8452 return Op;
8453 }
8454}
8455
 8456 // On targets that do not support a constant in the soffset field, turn a
 8457 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
8459 const GCNSubtarget *Subtarget) {
8460 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8461 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8462 return SOffset;
8463}
8464
8465SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8466 SelectionDAG &DAG,
8467 unsigned NewOpcode) const {
8468 SDLoc DL(Op);
8469
8470 SDValue VData = Op.getOperand(2);
8471 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8472 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8473 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8474 SDValue Ops[] = {
8475 Op.getOperand(0), // Chain
8476 VData, // vdata
8477 Rsrc, // rsrc
8478 DAG.getConstant(0, DL, MVT::i32), // vindex
8479 Offsets.first, // voffset
8480 SOffset, // soffset
8481 Offsets.second, // offset
8482 Op.getOperand(6), // cachepolicy
8483 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8484 };
8485
8486 auto *M = cast<MemSDNode>(Op);
8487
8488 EVT MemVT = VData.getValueType();
8489 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8490 M->getMemOperand());
8491}
8492
8493// Return a value to use for the idxen operand by examining the vindex operand.
8494static unsigned getIdxEn(SDValue VIndex) {
8495 // No need to set idxen if vindex is known to be zero.
8496 return isNullConstant(VIndex) ? 0 : 1;
8497}
8498
8499SDValue
8500SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8501 unsigned NewOpcode) const {
8502 SDLoc DL(Op);
8503
8504 SDValue VData = Op.getOperand(2);
8505 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8506 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8507 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8508 SDValue Ops[] = {
8509 Op.getOperand(0), // Chain
8510 VData, // vdata
8511 Rsrc, // rsrc
8512 Op.getOperand(4), // vindex
8513 Offsets.first, // voffset
8514 SOffset, // soffset
8515 Offsets.second, // offset
8516 Op.getOperand(7), // cachepolicy
8517 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8518 };
8519
8520 auto *M = cast<MemSDNode>(Op);
8521
8522 EVT MemVT = VData.getValueType();
8523 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8524 M->getMemOperand());
8525}
8526
8527SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8528 SelectionDAG &DAG) const {
8529 unsigned IntrID = Op.getConstantOperandVal(1);
8530 SDLoc DL(Op);
8531
8532 switch (IntrID) {
8533 case Intrinsic::amdgcn_ds_ordered_add:
8534 case Intrinsic::amdgcn_ds_ordered_swap: {
8535 MemSDNode *M = cast<MemSDNode>(Op);
8536 SDValue Chain = M->getOperand(0);
8537 SDValue M0 = M->getOperand(2);
8538 SDValue Value = M->getOperand(3);
8539 unsigned IndexOperand = M->getConstantOperandVal(7);
8540 unsigned WaveRelease = M->getConstantOperandVal(8);
8541 unsigned WaveDone = M->getConstantOperandVal(9);
8542
8543 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8544 IndexOperand &= ~0x3f;
8545 unsigned CountDw = 0;
8546
8547 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8548 CountDw = (IndexOperand >> 24) & 0xf;
8549 IndexOperand &= ~(0xf << 24);
8550
8551 if (CountDw < 1 || CountDw > 4) {
8553 "ds_ordered_count: dword count must be between 1 and 4");
8554 }
8555 }
8556
8557 if (IndexOperand)
8558 report_fatal_error("ds_ordered_count: bad index operand");
8559
8560 if (WaveDone && !WaveRelease)
8561 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8562
8563 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8564 unsigned ShaderType =
8566 unsigned Offset0 = OrderedCountIndex << 2;
8567 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8568
8569 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8570 Offset1 |= (CountDw - 1) << 6;
8571
8572 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8573 Offset1 |= ShaderType << 2;
8574
8575 unsigned Offset = Offset0 | (Offset1 << 8);
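// Layout of the final 16-bit offset immediate built above:
//   bits [7:2]   ordered count index (Offset0)
//   bit  8       wave_release, bit 9 wave_done
//   bits [11:10] shader type (pre-GFX11 only)
//   bit  12      instruction (0 = add, 1 = swap)
//   bits [15:14] dword count - 1 (GFX10+ only)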
8576
8577 SDValue Ops[] = {
8578 Chain,
8579 Value,
8580 DAG.getTargetConstant(Offset, DL, MVT::i16),
8581 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8582 };
8584 M->getVTList(), Ops, M->getMemoryVT(),
8585 M->getMemOperand());
8586 }
8587 case Intrinsic::amdgcn_ds_fadd: {
8588 MemSDNode *M = cast<MemSDNode>(Op);
8589 unsigned Opc;
8590 switch (IntrID) {
8591 case Intrinsic::amdgcn_ds_fadd:
8592 Opc = ISD::ATOMIC_LOAD_FADD;
8593 break;
8594 }
8595
8596 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
8597 M->getOperand(0), M->getOperand(2), M->getOperand(3),
8598 M->getMemOperand());
8599 }
8600 case Intrinsic::amdgcn_ds_fmin:
8601 case Intrinsic::amdgcn_ds_fmax: {
8602 MemSDNode *M = cast<MemSDNode>(Op);
8603 unsigned Opc;
8604 switch (IntrID) {
8605 case Intrinsic::amdgcn_ds_fmin:
8606 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8607 break;
8608 case Intrinsic::amdgcn_ds_fmax:
8609 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8610 break;
8611 default:
8612 llvm_unreachable("Unknown intrinsic!");
8613 }
8614 SDValue Ops[] = {
8615 M->getOperand(0), // Chain
8616 M->getOperand(2), // Ptr
8617 M->getOperand(3) // Value
8618 };
8619
8620 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8621 M->getMemoryVT(), M->getMemOperand());
8622 }
8623 case Intrinsic::amdgcn_buffer_load:
8624 case Intrinsic::amdgcn_buffer_load_format: {
8625 unsigned Glc = Op.getConstantOperandVal(5);
8626 unsigned Slc = Op.getConstantOperandVal(6);
8627 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8628 SDValue Ops[] = {
8629 Op.getOperand(0), // Chain
8630 Op.getOperand(2), // rsrc
8631 Op.getOperand(3), // vindex
8632 SDValue(), // voffset -- will be set by setBufferOffsets
8633 SDValue(), // soffset -- will be set by setBufferOffsets
8634 SDValue(), // offset -- will be set by setBufferOffsets
8635 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8636 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8637 };
8638 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
8639
8640 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8641 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8642
8643 EVT VT = Op.getValueType();
8644 EVT IntVT = VT.changeTypeToInteger();
8645 auto *M = cast<MemSDNode>(Op);
8646 EVT LoadVT = Op.getValueType();
8647
8648 if (LoadVT.getScalarType() == MVT::f16)
8649 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8650 M, DAG, Ops);
8651
8652 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8653 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8654 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8655 M->getMemOperand());
8656
8657 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8658 M->getMemOperand(), DAG);
8659 }
8660 case Intrinsic::amdgcn_raw_buffer_load:
8661 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8662 case Intrinsic::amdgcn_raw_buffer_load_format:
8663 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8664 const bool IsFormat =
8665 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8666 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8667
8668 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8669 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8670 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8671 SDValue Ops[] = {
8672 Op.getOperand(0), // Chain
8673 Rsrc, // rsrc
8674 DAG.getConstant(0, DL, MVT::i32), // vindex
8675 Offsets.first, // voffset
8676 SOffset, // soffset
8677 Offsets.second, // offset
8678 Op.getOperand(5), // cachepolicy, swizzled buffer
8679 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8680 };
8681
8682 auto *M = cast<MemSDNode>(Op);
8683 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8684 }
8685 case Intrinsic::amdgcn_struct_buffer_load:
8686 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8687 case Intrinsic::amdgcn_struct_buffer_load_format:
8688 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8689 const bool IsFormat =
8690 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8691 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8692
8693 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8694 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8695 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8696 SDValue Ops[] = {
8697 Op.getOperand(0), // Chain
8698 Rsrc, // rsrc
8699 Op.getOperand(3), // vindex
8700 Offsets.first, // voffset
8701 SOffset, // soffset
8702 Offsets.second, // offset
8703 Op.getOperand(6), // cachepolicy, swizzled buffer
8704 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8705 };
8706
8707 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8708 }
8709 case Intrinsic::amdgcn_tbuffer_load: {
8710 MemSDNode *M = cast<MemSDNode>(Op);
8711 EVT LoadVT = Op.getValueType();
8712
8713 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8714 unsigned Dfmt = Op.getConstantOperandVal(7);
8715 unsigned Nfmt = Op.getConstantOperandVal(8);
8716 unsigned Glc = Op.getConstantOperandVal(9);
8717 unsigned Slc = Op.getConstantOperandVal(10);
8718 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8719 SDValue Ops[] = {
8720 Op.getOperand(0), // Chain
8721 Op.getOperand(2), // rsrc
8722 Op.getOperand(3), // vindex
8723 Op.getOperand(4), // voffset
8724 SOffset, // soffset
8725 Op.getOperand(6), // offset
8726 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8727 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8728 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8729 };
8730
8731 if (LoadVT.getScalarType() == MVT::f16)
8732 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8733 M, DAG, Ops);
8734 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8735 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8736 DAG);
8737 }
8738 case Intrinsic::amdgcn_raw_tbuffer_load:
8739 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8740 MemSDNode *M = cast<MemSDNode>(Op);
8741 EVT LoadVT = Op.getValueType();
8742 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8743 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8744 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8745
8746 SDValue Ops[] = {
8747 Op.getOperand(0), // Chain
8748 Rsrc, // rsrc
8749 DAG.getConstant(0, DL, MVT::i32), // vindex
8750 Offsets.first, // voffset
8751 SOffset, // soffset
8752 Offsets.second, // offset
8753 Op.getOperand(5), // format
8754 Op.getOperand(6), // cachepolicy, swizzled buffer
8755 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8756 };
8757
8758 if (LoadVT.getScalarType() == MVT::f16)
8759 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8760 M, DAG, Ops);
8761 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8762 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8763 DAG);
8764 }
8765 case Intrinsic::amdgcn_struct_tbuffer_load:
8766 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8767 MemSDNode *M = cast<MemSDNode>(Op);
8768 EVT LoadVT = Op.getValueType();
8769 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8770 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8771 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8772
8773 SDValue Ops[] = {
8774 Op.getOperand(0), // Chain
8775 Rsrc, // rsrc
8776 Op.getOperand(3), // vindex
8777 Offsets.first, // voffset
8778 SOffset, // soffset
8779 Offsets.second, // offset
8780 Op.getOperand(6), // format
8781 Op.getOperand(7), // cachepolicy, swizzled buffer
8782 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8783 };
8784
8785 if (LoadVT.getScalarType() == MVT::f16)
8786 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8787 M, DAG, Ops);
8788 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8789 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8790 DAG);
8791 }
8792 case Intrinsic::amdgcn_buffer_atomic_swap:
8793 case Intrinsic::amdgcn_buffer_atomic_add:
8794 case Intrinsic::amdgcn_buffer_atomic_sub:
8795 case Intrinsic::amdgcn_buffer_atomic_csub:
8796 case Intrinsic::amdgcn_buffer_atomic_smin:
8797 case Intrinsic::amdgcn_buffer_atomic_umin:
8798 case Intrinsic::amdgcn_buffer_atomic_smax:
8799 case Intrinsic::amdgcn_buffer_atomic_umax:
8800 case Intrinsic::amdgcn_buffer_atomic_and:
8801 case Intrinsic::amdgcn_buffer_atomic_or:
8802 case Intrinsic::amdgcn_buffer_atomic_xor:
8803 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8804 unsigned Slc = Op.getConstantOperandVal(6);
8805 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8806 SDValue Ops[] = {
8807 Op.getOperand(0), // Chain
8808 Op.getOperand(2), // vdata
8809 Op.getOperand(3), // rsrc
8810 Op.getOperand(4), // vindex
8811 SDValue(), // voffset -- will be set by setBufferOffsets
8812 SDValue(), // soffset -- will be set by setBufferOffsets
8813 SDValue(), // offset -- will be set by setBufferOffsets
8814 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8815 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8816 };
8817 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8818
8819 EVT VT = Op.getValueType();
8820
8821 auto *M = cast<MemSDNode>(Op);
8822 unsigned Opcode = 0;
8823
8824 switch (IntrID) {
8825 case Intrinsic::amdgcn_buffer_atomic_swap:
8826 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8827 break;
8828 case Intrinsic::amdgcn_buffer_atomic_add:
8829 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8830 break;
8831 case Intrinsic::amdgcn_buffer_atomic_sub:
8832 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8833 break;
8834 case Intrinsic::amdgcn_buffer_atomic_csub:
8835 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8836 break;
8837 case Intrinsic::amdgcn_buffer_atomic_smin:
8838 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8839 break;
8840 case Intrinsic::amdgcn_buffer_atomic_umin:
8841 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8842 break;
8843 case Intrinsic::amdgcn_buffer_atomic_smax:
8844 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8845 break;
8846 case Intrinsic::amdgcn_buffer_atomic_umax:
8847 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8848 break;
8849 case Intrinsic::amdgcn_buffer_atomic_and:
8850 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8851 break;
8852 case Intrinsic::amdgcn_buffer_atomic_or:
8853 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8854 break;
8855 case Intrinsic::amdgcn_buffer_atomic_xor:
8856 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8857 break;
8858 case Intrinsic::amdgcn_buffer_atomic_fadd:
8859 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8860 break;
8861 default:
8862 llvm_unreachable("unhandled atomic opcode");
8863 }
8864
8865 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8866 M->getMemOperand());
8867 }
8868 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8870 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8871 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8872 return lowerRawBufferAtomicIntrin(Op, DAG,
8873 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8874 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8875 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8876 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8877 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8878 return lowerStructBufferAtomicIntrin(Op, DAG,
8879 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8880 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8881 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8882 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8883 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8885 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8886 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8888 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8889 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8890 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8891 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8892 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8893 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8894 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
8895 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8897 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8898 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8900 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8901 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8903 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
8904 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8905 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8906 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
8907 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8908 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8909 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
8910 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8912 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
8913 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8915 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8916 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8918 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8919 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8920 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8921 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8922 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8923 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8924 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8925 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8927 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8928 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8929 return lowerRawBufferAtomicIntrin(Op, DAG,
8930 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8931 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8933 return lowerStructBufferAtomicIntrin(Op, DAG,
8934 AMDGPUISD::BUFFER_ATOMIC_SWAP);
8935 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8937 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8938 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8940 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8941 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8942 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8943 return lowerStructBufferAtomicIntrin(Op, DAG,
8944 AMDGPUISD::BUFFER_ATOMIC_SMIN);
8945 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8947 return lowerStructBufferAtomicIntrin(Op, DAG,
8948 AMDGPUISD::BUFFER_ATOMIC_UMIN);
8949 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8950 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8951 return lowerStructBufferAtomicIntrin(Op, DAG,
8952 AMDGPUISD::BUFFER_ATOMIC_SMAX);
8953 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8955 return lowerStructBufferAtomicIntrin(Op, DAG,
8956 AMDGPUISD::BUFFER_ATOMIC_UMAX);
8957 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8959 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8960 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8961 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8962 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8963 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8965 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8966 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8967 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8968 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8969 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8971 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8972 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8973 return lowerStructBufferAtomicIntrin(Op, DAG,
8974 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8975
8976 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
8977 unsigned Slc = Op.getConstantOperandVal(7);
8978 unsigned IdxEn = getIdxEn(Op.getOperand(5));
8979 SDValue Ops[] = {
8980 Op.getOperand(0), // Chain
8981 Op.getOperand(2), // src
8982 Op.getOperand(3), // cmp
8983 Op.getOperand(4), // rsrc
8984 Op.getOperand(5), // vindex
8985 SDValue(), // voffset -- will be set by setBufferOffsets
8986 SDValue(), // soffset -- will be set by setBufferOffsets
8987 SDValue(), // offset -- will be set by setBufferOffsets
8988 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8989 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8990 };
8991 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
8992
8993 EVT VT = Op.getValueType();
8994 auto *M = cast<MemSDNode>(Op);
8995
8996 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8997 Op->getVTList(), Ops, VT, M->getMemOperand());
8998 }
8999 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9000 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9001 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9002 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9003 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9004 SDValue Ops[] = {
9005 Op.getOperand(0), // Chain
9006 Op.getOperand(2), // src
9007 Op.getOperand(3), // cmp
9008 Rsrc, // rsrc
9009 DAG.getConstant(0, DL, MVT::i32), // vindex
9010 Offsets.first, // voffset
9011 SOffset, // soffset
9012 Offsets.second, // offset
9013 Op.getOperand(7), // cachepolicy
9014 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9015 };
9016 EVT VT = Op.getValueType();
9017 auto *M = cast<MemSDNode>(Op);
9018
9019 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9020 Op->getVTList(), Ops, VT, M->getMemOperand());
9021 }
9022 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9023 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9024 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9025 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9026 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9027 SDValue Ops[] = {
9028 Op.getOperand(0), // Chain
9029 Op.getOperand(2), // src
9030 Op.getOperand(3), // cmp
9031 Rsrc, // rsrc
9032 Op.getOperand(5), // vindex
9033 Offsets.first, // voffset
9034 SOffset, // soffset
9035 Offsets.second, // offset
9036 Op.getOperand(8), // cachepolicy
9037 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9038 };
9039 EVT VT = Op.getValueType();
9040 auto *M = cast<MemSDNode>(Op);
9041
9042 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9043 Op->getVTList(), Ops, VT, M->getMemOperand());
9044 }
9045 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9046 MemSDNode *M = cast<MemSDNode>(Op);
9047 SDValue NodePtr = M->getOperand(2);
9048 SDValue RayExtent = M->getOperand(3);
9049 SDValue RayOrigin = M->getOperand(4);
9050 SDValue RayDir = M->getOperand(5);
9051 SDValue RayInvDir = M->getOperand(6);
9052 SDValue TDescr = M->getOperand(7);
9053
9054 assert(NodePtr.getValueType() == MVT::i32 ||
9055 NodePtr.getValueType() == MVT::i64);
9056 assert(RayDir.getValueType() == MVT::v3f16 ||
9057 RayDir.getValueType() == MVT::v3f32);
9058
9059 if (!Subtarget->hasGFX10_AEncoding()) {
9060 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9061 return SDValue();
9062 }
9063
9064 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9065 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9066 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9067 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9068 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9069 const unsigned NumVDataDwords = 4;
9070 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9071 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9072 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9073 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9074 IsGFX12Plus;
9075 const unsigned BaseOpcodes[2][2] = {
9076 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9077 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9078 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9079 int Opcode;
9080 if (UseNSA) {
9081 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9082 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9083 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9084 : AMDGPU::MIMGEncGfx10NSA,
9085 NumVDataDwords, NumVAddrDwords);
9086 } else {
9087 assert(!IsGFX12Plus);
9088 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9089 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9090 : AMDGPU::MIMGEncGfx10Default,
9091 NumVDataDwords, NumVAddrDwords);
9092 }
9093 assert(Opcode != -1);
9094
9095 SmallVector<SDValue, 16> Ops;
9096
9097 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9098 SmallVector<SDValue, 3> Lanes;
9099 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9100 if (Lanes[0].getValueSizeInBits() == 32) {
9101 for (unsigned I = 0; I < 3; ++I)
9102 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9103 } else {
9104 if (IsAligned) {
9105 Ops.push_back(
9106 DAG.getBitcast(MVT::i32,
9107 DAG.getBuildVector(MVT::v2f16, DL,
9108 { Lanes[0], Lanes[1] })));
9109 Ops.push_back(Lanes[2]);
9110 } else {
9111 SDValue Elt0 = Ops.pop_back_val();
9112 Ops.push_back(
9113 DAG.getBitcast(MVT::i32,
9114 DAG.getBuildVector(MVT::v2f16, DL,
9115 { Elt0, Lanes[0] })));
9116 Ops.push_back(
9117 DAG.getBitcast(MVT::i32,
9118 DAG.getBuildVector(MVT::v2f16, DL,
9119 { Lanes[1], Lanes[2] })));
9120 }
9121 }
9122 };
9123
9124 if (UseNSA && IsGFX11Plus) {
9125 Ops.push_back(NodePtr);
9126 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9127 Ops.push_back(RayOrigin);
9128 if (IsA16) {
9129 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9130 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9131 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9132 for (unsigned I = 0; I < 3; ++I) {
9133 MergedLanes.push_back(DAG.getBitcast(
9134 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9135 {DirLanes[I], InvDirLanes[I]})));
9136 }
9137 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9138 } else {
9139 Ops.push_back(RayDir);
9140 Ops.push_back(RayInvDir);
9141 }
9142 } else {
9143 if (Is64)
9144 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9145 2);
9146 else
9147 Ops.push_back(NodePtr);
9148
9149 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9150 packLanes(RayOrigin, true);
9151 packLanes(RayDir, true);
9152 packLanes(RayInvDir, false);
9153 }
9154
9155 if (!UseNSA) {
9156 // Build a single vector containing all the operands so far prepared.
9157 if (NumVAddrDwords > 12) {
9158 SDValue Undef = DAG.getUNDEF(MVT::i32);
9159 Ops.append(16 - Ops.size(), Undef);
9160 }
9161 assert(Ops.size() >= 8 && Ops.size() <= 12);
9162 SDValue MergedOps = DAG.getBuildVector(
9163 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9164 Ops.clear();
9165 Ops.push_back(MergedOps);
9166 }
9167
9168 Ops.push_back(TDescr);
9169 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9170 Ops.push_back(M->getChain());
9171
9172 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9173 MachineMemOperand *MemRef = M->getMemOperand();
9174 DAG.setNodeMemRefs(NewNode, {MemRef});
9175 return SDValue(NewNode, 0);
9176 }
9177 case Intrinsic::amdgcn_global_atomic_fmin:
9178 case Intrinsic::amdgcn_global_atomic_fmax:
9179 case Intrinsic::amdgcn_global_atomic_fmin_num:
9180 case Intrinsic::amdgcn_global_atomic_fmax_num:
9181 case Intrinsic::amdgcn_flat_atomic_fmin:
9182 case Intrinsic::amdgcn_flat_atomic_fmax:
9183 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9184 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9185 MemSDNode *M = cast<MemSDNode>(Op);
9186 SDValue Ops[] = {
9187 M->getOperand(0), // Chain
9188 M->getOperand(2), // Ptr
9189 M->getOperand(3) // Value
9190 };
9191 unsigned Opcode = 0;
9192 switch (IntrID) {
9193 case Intrinsic::amdgcn_global_atomic_fmin:
9194 case Intrinsic::amdgcn_global_atomic_fmin_num:
9195 case Intrinsic::amdgcn_flat_atomic_fmin:
9196 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9197 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9198 break;
9199 }
9200 case Intrinsic::amdgcn_global_atomic_fmax:
9201 case Intrinsic::amdgcn_global_atomic_fmax_num:
9202 case Intrinsic::amdgcn_flat_atomic_fmax:
9203 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9204 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9205 break;
9206 }
9207 default:
9208 llvm_unreachable("unhandled atomic opcode");
9209 }
9210 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9211 M->getVTList(), Ops, M->getMemoryVT(),
9212 M->getMemOperand());
9213 }
9214 case Intrinsic::amdgcn_s_get_barrier_state: {
9215 SDValue Chain = Op->getOperand(0);
9216 SmallVector<SDValue, 2> Ops;
9217 unsigned Opc;
9218 bool IsInlinableBarID = false;
9219 int64_t BarID;
9220
9221 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9222 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9223 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9224 }
9225
9226 if (IsInlinableBarID) {
9227 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9228 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9229 Ops.push_back(K);
9230 } else {
9231 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9232 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9233 Ops.push_back(M0Val.getValue(0));
9234 }
9235
9236 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9237 return SDValue(NewMI, 0);
9238 }
9239 default:
9240
9241 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9242 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9243 return lowerImage(Op, ImageDimIntr, DAG, true);
9244
9245 return SDValue();
9246 }
9247}
9248
9249// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9250// dwordx4 if on SI and handle TFE loads.
9251SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9252 SDVTList VTList,
9253 ArrayRef<SDValue> Ops, EVT MemVT,
9254 MachineMemOperand *MMO,
9255 SelectionDAG &DAG) const {
9256 LLVMContext &C = *DAG.getContext();
9257 MachineFunction &MF = DAG.getMachineFunction();
9258 EVT VT = VTList.VTs[0];
9259
9260 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9261 bool IsTFE = VTList.NumVTs == 3;
9262 if (IsTFE) {
9263 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9264 unsigned NumOpDWords = NumValueDWords + 1;
9265 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9266 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9267 MachineMemOperand *OpDWordsMMO =
9268 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9269 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9270 OpDWordsVT, OpDWordsMMO, DAG);
9271 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9272 DAG.getVectorIdxConstant(NumValueDWords, DL));
9273 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9274 SDValue ValueDWords =
9275 NumValueDWords == 1
9276 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9277 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9278 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9279 ZeroIdx);
9280 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9281 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9282 }
9283
9284 if (!Subtarget->hasDwordx3LoadStores() &&
9285 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9286 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9287 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9288 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9289 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9290 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9291 WidenedMemVT, WidenedMMO);
9292 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9293 DAG.getVectorIdxConstant(0, DL));
9294 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9295 }
9296
9297 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9298}
9299
9300SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9301 bool ImageStore) const {
9302 EVT StoreVT = VData.getValueType();
9303
9304 // No change for f16 and legal vector D16 types.
9305 if (!StoreVT.isVector())
9306 return VData;
9307
9308 SDLoc DL(VData);
9309 unsigned NumElements = StoreVT.getVectorNumElements();
9310
9311 if (Subtarget->hasUnpackedD16VMem()) {
9312 // We need to unpack the packed data to store.
9313 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9314 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9315
9316 EVT EquivStoreVT =
9317 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9318 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9319 return DAG.UnrollVectorOp(ZExt.getNode());
9320 }
9321
9322 // The sq block of gfx8.1 does not estimate register use correctly for d16
9323 // image store instructions. The data operand is computed as if it were not a
9324 // d16 image instruction.
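// For example, a v4f16 store is repacked below into two i32 values holding
// v2i16 pairs and then padded with undef i32s up to four dwords, matching the
// register count the hardware computes for the non-d16 form.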
9325 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9326 // Bitcast to i16
9327 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9328 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9329
9330 // Decompose into scalars
9331 SmallVector<SDValue, 4> Elts;
9332 DAG.ExtractVectorElements(IntVData, Elts);
9333
9334 // Group pairs of i16 into v2i16 and bitcast to i32
9335 SmallVector<SDValue, 4> PackedElts;
9336 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9337 SDValue Pair =
9338 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9339 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9340 PackedElts.push_back(IntPair);
9341 }
9342 if ((NumElements % 2) == 1) {
9343 // Handle v3i16
9344 unsigned I = Elts.size() / 2;
9345 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9346 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9347 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9348 PackedElts.push_back(IntPair);
9349 }
9350
9351 // Pad using UNDEF
9352 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9353
9354 // Build final vector
9355 EVT VecVT =
9356 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9357 return DAG.getBuildVector(VecVT, DL, PackedElts);
9358 }
9359
9360 if (NumElements == 3) {
9361 EVT IntStoreVT =
9362 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9363 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9364
9365 EVT WidenedStoreVT = EVT::getVectorVT(
9366 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9367 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9368 WidenedStoreVT.getStoreSizeInBits());
9369 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9370 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9371 }
9372
9373 assert(isTypeLegal(StoreVT));
9374 return VData;
9375}
9376
9377SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9378 SelectionDAG &DAG) const {
9379 SDLoc DL(Op);
9380 SDValue Chain = Op.getOperand(0);
9381 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9382 MachineFunction &MF = DAG.getMachineFunction();
9383
9384 switch (IntrinsicID) {
9385 case Intrinsic::amdgcn_exp_compr: {
9386 if (!Subtarget->hasCompressedExport()) {
9387 DiagnosticInfoUnsupported BadIntrin(
9389 "intrinsic not supported on subtarget", DL.getDebugLoc());
9390 DAG.getContext()->diagnose(BadIntrin);
9391 }
9392 SDValue Src0 = Op.getOperand(4);
9393 SDValue Src1 = Op.getOperand(5);
9394 // Hack around illegal type on SI by directly selecting it.
9395 if (isTypeLegal(Src0.getValueType()))
9396 return SDValue();
9397
9398 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9399 SDValue Undef = DAG.getUNDEF(MVT::f32);
9400 const SDValue Ops[] = {
9401 Op.getOperand(2), // tgt
9402 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9403 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9404 Undef, // src2
9405 Undef, // src3
9406 Op.getOperand(7), // vm
9407 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9408 Op.getOperand(3), // en
9409 Op.getOperand(0) // Chain
9410 };
9411
9412 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9413 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9414 }
9415 case Intrinsic::amdgcn_s_barrier: {
9416 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9417 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9418 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9419 if (WGSize <= ST.getWavefrontSize())
9420 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9421 Op.getOperand(0)), 0);
9422 }
9423
9424 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9425 if (ST.hasSplitBarriers()) {
9426 SDValue K =
9427 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9428 SDValue BarSignal =
9429 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9430 MVT::Other, K, Op.getOperand(0)),
9431 0);
9432 SDValue BarWait =
9433 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9434 BarSignal.getValue(0)),
9435 0);
9436 return BarWait;
9437 }
9438
9439 return SDValue();
9440 };
9441 case Intrinsic::amdgcn_tbuffer_store: {
9442 SDValue VData = Op.getOperand(2);
9443 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9444 if (IsD16)
9445 VData = handleD16VData(VData, DAG);
9446 unsigned Dfmt = Op.getConstantOperandVal(8);
9447 unsigned Nfmt = Op.getConstantOperandVal(9);
9448 unsigned Glc = Op.getConstantOperandVal(10);
9449 unsigned Slc = Op.getConstantOperandVal(11);
9450 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9451 SDValue Ops[] = {
9452 Chain,
9453 VData, // vdata
9454 Op.getOperand(3), // rsrc
9455 Op.getOperand(4), // vindex
9456 Op.getOperand(5), // voffset
9457 Op.getOperand(6), // soffset
9458 Op.getOperand(7), // offset
9459 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9460 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9461 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9462 };
9463 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9464 AMDGPUISD::TBUFFER_STORE_FORMAT;
9465 MemSDNode *M = cast<MemSDNode>(Op);
9466 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9467 M->getMemoryVT(), M->getMemOperand());
9468 }
9469
9470 case Intrinsic::amdgcn_struct_tbuffer_store:
9471 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9472 SDValue VData = Op.getOperand(2);
9473 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9474 if (IsD16)
9475 VData = handleD16VData(VData, DAG);
9476 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9477 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9478 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9479 SDValue Ops[] = {
9480 Chain,
9481 VData, // vdata
9482 Rsrc, // rsrc
9483 Op.getOperand(4), // vindex
9484 Offsets.first, // voffset
9485 SOffset, // soffset
9486 Offsets.second, // offset
9487 Op.getOperand(7), // format
9488 Op.getOperand(8), // cachepolicy, swizzled buffer
9489 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9490 };
9491 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9492 AMDGPUISD::TBUFFER_STORE_FORMAT;
9493 MemSDNode *M = cast<MemSDNode>(Op);
9494 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9495 M->getMemoryVT(), M->getMemOperand());
9496 }
9497
9498 case Intrinsic::amdgcn_raw_tbuffer_store:
9499 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9500 SDValue VData = Op.getOperand(2);
9501 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9502 if (IsD16)
9503 VData = handleD16VData(VData, DAG);
9504 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9505 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9506 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9507 SDValue Ops[] = {
9508 Chain,
9509 VData, // vdata
9510 Rsrc, // rsrc
9511 DAG.getConstant(0, DL, MVT::i32), // vindex
9512 Offsets.first, // voffset
9513 SOffset, // soffset
9514 Offsets.second, // offset
9515 Op.getOperand(6), // format
9516 Op.getOperand(7), // cachepolicy, swizzled buffer
9517 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9518 };
9519 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9520 AMDGPUISD::TBUFFER_STORE_FORMAT;
9521 MemSDNode *M = cast<MemSDNode>(Op);
9522 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9523 M->getMemoryVT(), M->getMemOperand());
9524 }
9525
9526 case Intrinsic::amdgcn_buffer_store:
9527 case Intrinsic::amdgcn_buffer_store_format: {
9528 SDValue VData = Op.getOperand(2);
9529 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9530 if (IsD16)
9531 VData = handleD16VData(VData, DAG);
9532 unsigned Glc = Op.getConstantOperandVal(6);
9533 unsigned Slc = Op.getConstantOperandVal(7);
9534 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9535 SDValue Ops[] = {
9536 Chain,
9537 VData,
9538 Op.getOperand(3), // rsrc
9539 Op.getOperand(4), // vindex
9540 SDValue(), // voffset -- will be set by setBufferOffsets
9541 SDValue(), // soffset -- will be set by setBufferOffsets
9542 SDValue(), // offset -- will be set by setBufferOffsets
9543 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9544 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9545 };
9546 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
9547
9548 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9549 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9550 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9551 MemSDNode *M = cast<MemSDNode>(Op);
9552
9553 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9554 EVT VDataType = VData.getValueType().getScalarType();
9555 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9556 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9557
9558 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9559 M->getMemoryVT(), M->getMemOperand());
9560 }
9561
9562 case Intrinsic::amdgcn_raw_buffer_store:
9563 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9564 case Intrinsic::amdgcn_raw_buffer_store_format:
9565 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9566 const bool IsFormat =
9567 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9568 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9569
9570 SDValue VData = Op.getOperand(2);
9571 EVT VDataVT = VData.getValueType();
9572 EVT EltType = VDataVT.getScalarType();
9573 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9574 if (IsD16) {
9575 VData = handleD16VData(VData, DAG);
9576 VDataVT = VData.getValueType();
9577 }
9578
9579 if (!isTypeLegal(VDataVT)) {
9580 VData =
9581 DAG.getNode(ISD::BITCAST, DL,
9582 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9583 }
9584
9585 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9586 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9587 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9588 SDValue Ops[] = {
9589 Chain,
9590 VData,
9591 Rsrc,
9592 DAG.getConstant(0, DL, MVT::i32), // vindex
9593 Offsets.first, // voffset
9594 SOffset, // soffset
9595 Offsets.second, // offset
9596 Op.getOperand(6), // cachepolicy, swizzled buffer
9597 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9598 };
9599 unsigned Opc =
9600 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9601 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9602 MemSDNode *M = cast<MemSDNode>(Op);
9603
9604 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9605 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9606 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9607
9608 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9609 M->getMemoryVT(), M->getMemOperand());
9610 }
9611
9612 case Intrinsic::amdgcn_struct_buffer_store:
9613 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9614 case Intrinsic::amdgcn_struct_buffer_store_format:
9615 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9616 const bool IsFormat =
9617 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9618 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9619
9620 SDValue VData = Op.getOperand(2);
9621 EVT VDataVT = VData.getValueType();
9622 EVT EltType = VDataVT.getScalarType();
9623 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9624
9625 if (IsD16) {
9626 VData = handleD16VData(VData, DAG);
9627 VDataVT = VData.getValueType();
9628 }
9629
9630 if (!isTypeLegal(VDataVT)) {
9631 VData =
9632 DAG.getNode(ISD::BITCAST, DL,
9633 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9634 }
9635
9636 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9637 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9638 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9639 SDValue Ops[] = {
9640 Chain,
9641 VData,
9642 Rsrc,
9643 Op.getOperand(4), // vindex
9644 Offsets.first, // voffset
9645 SOffset, // soffset
9646 Offsets.second, // offset
9647 Op.getOperand(7), // cachepolicy, swizzled buffer
9648 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9649 };
9650 unsigned Opc =
9651 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9652 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9653 MemSDNode *M = cast<MemSDNode>(Op);
9654
9655 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9656 EVT VDataType = VData.getValueType().getScalarType();
9657 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9658 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9659
9660 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9661 M->getMemoryVT(), M->getMemOperand());
9662 }
9663 case Intrinsic::amdgcn_raw_buffer_load_lds:
9664 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9665 case Intrinsic::amdgcn_struct_buffer_load_lds:
9666 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9667 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9668 unsigned Opc;
9669 bool HasVIndex =
9670 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9671 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9672 unsigned OpOffset = HasVIndex ? 1 : 0;
9673 SDValue VOffset = Op.getOperand(5 + OpOffset);
9674 bool HasVOffset = !isNullConstant(VOffset);
9675 unsigned Size = Op->getConstantOperandVal(4);
9676
9677 switch (Size) {
9678 default:
9679 return SDValue();
9680 case 1:
9681 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9682 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9683 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9684 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9685 break;
9686 case 2:
9687 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9688 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9689 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9690 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9691 break;
9692 case 4:
9693 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9694 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9695 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9696 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9697 break;
9698 }
9699
9700 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9701
9702 SmallVector<SDValue, 8> Ops;
9703
9704 if (HasVIndex && HasVOffset)
9705 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9706 { Op.getOperand(5), // VIndex
9707 VOffset }));
9708 else if (HasVIndex)
9709 Ops.push_back(Op.getOperand(5));
9710 else if (HasVOffset)
9711 Ops.push_back(VOffset);
9712
9713 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9714 Ops.push_back(Rsrc);
9715 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9716 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9717 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9718 Ops.push_back(
9719 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9720 Ops.push_back(DAG.getTargetConstant(
9721 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9722 Ops.push_back(M0Val.getValue(0)); // Chain
9723 Ops.push_back(M0Val.getValue(1)); // Glue
9724
9725 auto *M = cast<MemSDNode>(Op);
9726 MachineMemOperand *LoadMMO = M->getMemOperand();
9727 // Don't set the offset value here because the pointer points to the base of
9728 // the buffer.
9729 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9730
9731 MachinePointerInfo StorePtrI = LoadPtrI;
9732 LoadPtrI.V = PoisonValue::get(
9733 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
9734 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
9735 StorePtrI.V = nullptr;
9736 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9737 auto F = LoadMMO->getFlags() &
9738 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9739 LoadMMO =
9740 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9741 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9742
9743 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9744 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9745 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9746
9747 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9748 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9749
9750 return SDValue(Load, 0);
9751 }
9752 case Intrinsic::amdgcn_global_load_lds: {
9753 unsigned Opc;
9754 unsigned Size = Op->getConstantOperandVal(4);
9755 switch (Size) {
9756 default:
9757 return SDValue();
9758 case 1:
9759 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9760 break;
9761 case 2:
9762 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9763 break;
9764 case 4:
9765 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9766 break;
9767 }
9768
9769 auto *M = cast<MemSDNode>(Op);
9770 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9771
9772 SmallVector<SDValue, 6> Ops;
9773
9774 SDValue Addr = Op.getOperand(2); // Global ptr
9775 SDValue VOffset;
9776 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9777 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9778 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9779 SDValue LHS = Addr.getOperand(0);
9780 SDValue RHS = Addr.getOperand(1);
9781
9782 if (LHS->isDivergent())
9783 std::swap(LHS, RHS);
9784
9785 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9786 RHS.getOperand(0).getValueType() == MVT::i32) {
9787 // add (i64 sgpr), (zero_extend (i32 vgpr))
9788 Addr = LHS;
9789 VOffset = RHS.getOperand(0);
9790 }
9791 }
9792
9793 Ops.push_back(Addr);
9794 if (!Addr->isDivergent()) {
9795 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9796 if (!VOffset)
9797 VOffset = SDValue(
9798 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9799 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9800 Ops.push_back(VOffset);
9801 }
9802
9803 Ops.push_back(Op.getOperand(5)); // Offset
9804 Ops.push_back(Op.getOperand(6)); // CPol
9805 Ops.push_back(M0Val.getValue(0)); // Chain
9806 Ops.push_back(M0Val.getValue(1)); // Glue
9807
9808 MachineMemOperand *LoadMMO = M->getMemOperand();
9809 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9810 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9811 MachinePointerInfo StorePtrI = LoadPtrI;
9812 LoadPtrI.V = PoisonValue::get(
9813 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9814 StorePtrI.V = nullptr;
9815 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9816 auto F = LoadMMO->getFlags() &
9817 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9818 LoadMMO =
9819 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9820 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9821 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9822 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9823 LoadMMO->getAAInfo());
9824
9825 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9826 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9827
9828 return SDValue(Load, 0);
9829 }
9830 case Intrinsic::amdgcn_end_cf:
9831 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9832 Op->getOperand(2), Chain), 0);
9833 case Intrinsic::amdgcn_s_barrier_init:
9834 case Intrinsic::amdgcn_s_barrier_join:
9835 case Intrinsic::amdgcn_s_wakeup_barrier: {
9836 SDValue Chain = Op->getOperand(0);
9837 SmallVector<SDValue, 2> Ops;
9838 SDValue BarOp = Op->getOperand(2);
9839 unsigned Opc;
9840 bool IsInlinableBarID = false;
9841 int64_t BarVal;
9842
9843 if (isa<ConstantSDNode>(BarOp)) {
9844 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9845 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9846 }
9847
9848 if (IsInlinableBarID) {
9849 switch (IntrinsicID) {
9850 default:
9851 return SDValue();
9852 case Intrinsic::amdgcn_s_barrier_init:
9853 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9854 break;
9855 case Intrinsic::amdgcn_s_barrier_join:
9856 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9857 break;
9858 case Intrinsic::amdgcn_s_wakeup_barrier:
9859 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9860 break;
9861 }
9862
9863 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9864 Ops.push_back(K);
9865 } else {
9866 switch (IntrinsicID) {
9867 default:
9868 return SDValue();
9869 case Intrinsic::amdgcn_s_barrier_init:
9870 Opc = AMDGPU::S_BARRIER_INIT_M0;
9871 break;
9872 case Intrinsic::amdgcn_s_barrier_join:
9873 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9874 break;
9875 case Intrinsic::amdgcn_s_wakeup_barrier:
9876 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9877 break;
9878 }
9879 }
9880
9881 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9882 SDValue M0Val;
9883 // Member count will be read from M0[16:22]
9884 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9885 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9886
9887 if (!IsInlinableBarID) {
9888 // If reference to barrier id is not an inline constant then it must be
9889 // referenced with M0[4:0]. Perform an OR with the member count to
9890 // include it in M0.
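// The net effect is M0 = (member count << 16) | (barrier id in bits 4:0).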
9891 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9892 Op.getOperand(2), M0Val),
9893 0);
9894 }
9895 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9896 } else if (!IsInlinableBarID) {
9897 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9898 }
9899
9900 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9901 return SDValue(NewMI, 0);
9902 }
9903 default: {
9904 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9905 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9906 return lowerImage(Op, ImageDimIntr, DAG, true);
9907
9908 return Op;
9909 }
9910 }
9911}
9912
9913// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9914// offset (the offset that is included in bounds checking and swizzling, to be
9915// split between the instruction's voffset and immoffset fields) and soffset
9916// (the offset that is excluded from bounds checking and swizzling, to go in
9917// the instruction's soffset field). This function takes the first kind of
9918// offset and figures out how to split it between voffset and immoffset.
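// For example, on subtargets where the MUBUF immediate offset field holds 12
// bits (getMaxMUBUFImmOffset() == 4095), a combined offset of 4104 is split
// into an immoffset of 8 plus a voffset add of 4096; keeping the remainder a
// power of two makes that add more likely to be CSEd across nearby accesses.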
9919std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9920 SDValue Offset, SelectionDAG &DAG) const {
9921 SDLoc DL(Offset);
9922 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9923 SDValue N0 = Offset;
9924 ConstantSDNode *C1 = nullptr;
9925
9926 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9927 N0 = SDValue();
9928 else if (DAG.isBaseWithConstantOffset(N0)) {
9929 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9930 N0 = N0.getOperand(0);
9931 }
9932
9933 if (C1) {
9934 unsigned ImmOffset = C1->getZExtValue();
9935 // If the immediate value is too big for the immoffset field, put only bits
9936 // that would normally fit in the immoffset field. The remaining value that
9937 // is copied/added for the voffset field is a large power of 2, and it
9938 // stands more chance of being CSEd with the copy/add for another similar
9939 // load/store.
9940 // However, do not do that rounding down if that is a negative
9941 // number, as it appears to be illegal to have a negative offset in the
9942 // vgpr, even if adding the immediate offset makes it positive.
9943 unsigned Overflow = ImmOffset & ~MaxImm;
9944 ImmOffset -= Overflow;
9945 if ((int32_t)Overflow < 0) {
9946 Overflow += ImmOffset;
9947 ImmOffset = 0;
9948 }
9949 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9950 if (Overflow) {
9951 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9952 if (!N0)
9953 N0 = OverflowVal;
9954 else {
9955 SDValue Ops[] = { N0, OverflowVal };
9956 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9957 }
9958 }
9959 }
9960 if (!N0)
9961 N0 = DAG.getConstant(0, DL, MVT::i32);
9962 if (!C1)
9963 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
9964 return {N0, SDValue(C1, 0)};
9965}
9966
9967// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
9968// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
9969// pointed to by Offsets.
9970void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
9971 SelectionDAG &DAG, SDValue *Offsets,
9972 Align Alignment) const {
9973 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9974 SDLoc DL(CombinedOffset);
9975 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
9976 uint32_t Imm = C->getZExtValue();
9977 uint32_t SOffset, ImmOffset;
9978 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
9979 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
9980 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9981 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9982 return;
9983 }
9984 }
9985 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
9986 SDValue N0 = CombinedOffset.getOperand(0);
9987 SDValue N1 = CombinedOffset.getOperand(1);
9988 uint32_t SOffset, ImmOffset;
9989 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
9990 if (Offset >= 0 &&
9991 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
9992 Offsets[0] = N0;
9993 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9994 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9995 return;
9996 }
9997 }
9998
9999 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10000 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10001 : DAG.getConstant(0, DL, MVT::i32);
10002
10003 Offsets[0] = CombinedOffset;
10004 Offsets[1] = SOffsetZero;
10005 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10006}
10007
10008SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10009 SelectionDAG &DAG) const {
10010 if (!MaybePointer.getValueType().isScalarInteger())
10011 return MaybePointer;
10012
10013 SDLoc DL(MaybePointer);
10014
10015 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10016 return Rsrc;
10017}
10018
10019// Wrap a global or flat pointer into a buffer intrinsic using the flags
10020// specified in the intrinsic.
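// The resulting descriptor words are: word 0 = low half of the pointer,
// word 1 = (pointer bits 47:32) | (stride << 16), word 2 = the NumRecords
// operand, word 3 = the Flags operand.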
10021SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10022 SelectionDAG &DAG) const {
10023 SDLoc Loc(Op);
10024
10025 SDValue Pointer = Op->getOperand(1);
10026 SDValue Stride = Op->getOperand(2);
10027 SDValue NumRecords = Op->getOperand(3);
10028 SDValue Flags = Op->getOperand(4);
10029
10030 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10031 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10032 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10033 std::optional<uint32_t> ConstStride = std::nullopt;
10034 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10035 ConstStride = ConstNode->getZExtValue();
10036
10037 SDValue NewHighHalf = Masked;
10038 if (!ConstStride || *ConstStride != 0) {
10039 SDValue ShiftedStride;
10040 if (ConstStride) {
10041 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10042 } else {
10043 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10044 ShiftedStride =
10045 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10046 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10047 }
10048 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10049 }
10050
10051 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10052 NewHighHalf, NumRecords, Flags);
10053 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10054 return RsrcPtr;
10055}
10056
10057// Handle 8 bit and 16 bit buffer loads
10058SDValue
10059SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
10060 SDLoc DL, ArrayRef<SDValue> Ops,
10061 MachineMemOperand *MMO) const {
10062 EVT IntVT = LoadVT.changeTypeToInteger();
10063 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10064 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10065
10066 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10067 SDValue BufferLoad =
10068 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10069 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10070 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10071
10072 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10073}
10074
10075// Handle 8 bit and 16 bit buffer stores
10076SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10077 EVT VDataType, SDLoc DL,
10078 SDValue Ops[],
10079 MemSDNode *M) const {
10080 if (VDataType == MVT::f16)
10081 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10082
10083 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10084 Ops[1] = BufferStoreExt;
10085 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10086 AMDGPUISD::BUFFER_STORE_SHORT;
10087 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10088 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10089 M->getMemOperand());
10090}
10091
10092 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10093 ISD::LoadExtType ExtType, SDValue Op,
10094 const SDLoc &SL, EVT VT) {
10095 if (VT.bitsLT(Op.getValueType()))
10096 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10097
10098 switch (ExtType) {
10099 case ISD::SEXTLOAD:
10100 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10101 case ISD::ZEXTLOAD:
10102 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10103 case ISD::EXTLOAD:
10104 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10105 case ISD::NON_EXTLOAD:
10106 return Op;
10107 }
10108
10109 llvm_unreachable("invalid ext type");
10110}
10111
10112// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10113// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
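// For example, a uniform, 4-byte-aligned zero-extending i8 load from the
// constant address space becomes an i32 load plus an in-register
// zero-extension, which the scalar (SMEM) path can then select.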
10114SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10115 SelectionDAG &DAG = DCI.DAG;
10116 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10117 return SDValue();
10118
10119 // FIXME: Constant loads should all be marked invariant.
10120 unsigned AS = Ld->getAddressSpace();
10121 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10122 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10123 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10124 return SDValue();
10125
10126 // Don't do this early, since it may interfere with adjacent load merging for
10127 // illegal types. We can avoid losing alignment information for exotic types
10128 // pre-legalize.
10129 EVT MemVT = Ld->getMemoryVT();
10130 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10131 MemVT.getSizeInBits() >= 32)
10132 return SDValue();
10133
10134 SDLoc SL(Ld);
10135
10136 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10137 "unexpected vector extload");
10138
10139 // TODO: Drop only high part of range.
10140 SDValue Ptr = Ld->getBasePtr();
10141 SDValue NewLoad = DAG.getLoad(
10142 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10143 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10144 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10145 nullptr); // Drop ranges
10146
10147 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10148 if (MemVT.isFloatingPoint()) {
10150 "unexpected fp extload");
10151 TruncVT = MemVT.changeTypeToInteger();
10152 }
10153
10154 SDValue Cvt = NewLoad;
10155 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10156 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10157 DAG.getValueType(TruncVT));
10158  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10159             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10160    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10161  } else {
10162    assert(Ld->getExtensionType() == ISD::EXTLOAD);
10163  }
10164
10165 EVT VT = Ld->getValueType(0);
10166 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10167
10168 DCI.AddToWorklist(Cvt.getNode());
10169
10170 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10171 // the appropriate extension from the 32-bit load.
10172 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10173 DCI.AddToWorklist(Cvt.getNode());
10174
10175 // Handle conversion back to floating point if necessary.
10176 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10177
10178 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10179}
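
// Illustrative sketch (not part of the lowering above): the scalar arithmetic
// that the SIGN_EXTEND_INREG / zero-extend-in-reg nodes perform once a
// sub-dword scalar load has been widened to a full 32-bit load. The helper
// names are hypothetical and exist only for this example.
#include <cstdint>

static inline int32_t exampleSignExtendInReg32(uint32_t Loaded, unsigned Bits) {
  // Shift the kept field up to the top, then arithmetic-shift back down so the
  // sign bit of the original Bits-wide value fills the upper bits.
  unsigned Shift = 32 - Bits;
  return (int32_t)(Loaded << Shift) >> Shift;
}

static inline uint32_t exampleZeroExtendInReg32(uint32_t Loaded, unsigned Bits) {
  // Mask away everything above the low Bits bits, matching a ZEXTLOAD of the
  // original memory type out of the widened 32-bit value.
  return Bits >= 32 ? Loaded : (Loaded & ((1u << Bits) - 1u));
}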
10180
10181static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10182                                          const SIMachineFunctionInfo &Info) {
10183 // TODO: Should check if the address can definitely not access stack.
10184 if (Info.isEntryFunction())
10185 return Info.getUserSGPRInfo().hasFlatScratchInit();
10186 return true;
10187}
10188
10189SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10190 SDLoc DL(Op);
10191 LoadSDNode *Load = cast<LoadSDNode>(Op);
10192 ISD::LoadExtType ExtType = Load->getExtensionType();
10193 EVT MemVT = Load->getMemoryVT();
10194
10195 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10196 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10197 return SDValue();
10198
10199 // FIXME: Copied from PPC
10200 // First, load into 32 bits, then truncate to 1 bit.
10201
10202 SDValue Chain = Load->getChain();
10203 SDValue BasePtr = Load->getBasePtr();
10204 MachineMemOperand *MMO = Load->getMemOperand();
10205
10206 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10207
10208 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10209 BasePtr, RealMemVT, MMO);
10210
10211 if (!MemVT.isVector()) {
10212 SDValue Ops[] = {
10213 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10214 NewLD.getValue(1)
10215 };
10216
10217 return DAG.getMergeValues(Ops, DL);
10218 }
10219
10221 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10222 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10223 DAG.getConstant(I, DL, MVT::i32));
10224
10225 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10226 }
10227
10228 SDValue Ops[] = {
10229 DAG.getBuildVector(MemVT, DL, Elts),
10230 NewLD.getValue(1)
10231 };
10232
10233 return DAG.getMergeValues(Ops, DL);
10234 }
10235
10236 if (!MemVT.isVector())
10237 return SDValue();
10238
10239 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10240 "Custom lowering for non-i32 vectors hasn't been implemented.");
10241
10242 Align Alignment = Load->getAlign();
10243 unsigned AS = Load->getAddressSpace();
10244 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10245 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10246 return SplitVectorLoad(Op, DAG);
10247 }
10248
10249  MachineFunction &MF = DAG.getMachineFunction();
10250  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10251  // If there is a possibility that a flat instruction may access scratch
10252  // memory, then we need to use the same legalization rules we use for private.
10253  if (AS == AMDGPUAS::FLAT_ADDRESS &&
10254      !Subtarget->hasMultiDwordFlatScratchAddressing())
10255    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10256         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10257
10258 unsigned NumElements = MemVT.getVectorNumElements();
10259
10260  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10261      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10262 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10263 if (MemVT.isPow2VectorType() ||
10264 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10265 return SDValue();
10266 return WidenOrSplitVectorLoad(Op, DAG);
10267 }
10268 // Non-uniform loads will be selected to MUBUF instructions, so they
10269 // have the same legalization requirements as global and private
10270 // loads.
10271 //
10272 }
10273
10274  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10275      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10276      AS == AMDGPUAS::GLOBAL_ADDRESS) {
10277 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10278 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10279 Alignment >= Align(4) && NumElements < 32) {
10280 if (MemVT.isPow2VectorType() ||
10281 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10282 return SDValue();
10283 return WidenOrSplitVectorLoad(Op, DAG);
10284 }
10285 // Non-uniform loads will be selected to MUBUF instructions, so they
10286 // have the same legalization requirements as global and private
10287 // loads.
10288 //
10289 }
10290  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10291      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10292      AS == AMDGPUAS::GLOBAL_ADDRESS ||
10293 AS == AMDGPUAS::FLAT_ADDRESS) {
10294 if (NumElements > 4)
10295 return SplitVectorLoad(Op, DAG);
10296 // v3 loads not supported on SI.
10297 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10298 return WidenOrSplitVectorLoad(Op, DAG);
10299
10300 // v3 and v4 loads are supported for private and global memory.
10301 return SDValue();
10302 }
10303 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10304 // Depending on the setting of the private_element_size field in the
10305 // resource descriptor, we can only make private accesses up to a certain
10306 // size.
10307 switch (Subtarget->getMaxPrivateElementSize()) {
10308 case 4: {
10309 SDValue Ops[2];
10310 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10311 return DAG.getMergeValues(Ops, DL);
10312 }
10313 case 8:
10314 if (NumElements > 2)
10315 return SplitVectorLoad(Op, DAG);
10316 return SDValue();
10317 case 16:
10318 // Same as global/flat
10319 if (NumElements > 4)
10320 return SplitVectorLoad(Op, DAG);
10321 // v3 loads not supported on SI.
10322 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10323 return WidenOrSplitVectorLoad(Op, DAG);
10324
10325 return SDValue();
10326 default:
10327 llvm_unreachable("unsupported private_element_size");
10328 }
10329 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10330 unsigned Fast = 0;
10331 auto Flags = Load->getMemOperand()->getFlags();
10332    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10333                                           Load->getAlign(), Flags, &Fast) &&
10334 Fast > 1)
10335 return SDValue();
10336
10337 if (MemVT.isVector())
10338 return SplitVectorLoad(Op, DAG);
10339 }
10340
10341  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10342                                      MemVT, *Load->getMemOperand())) {
10343 SDValue Ops[2];
10344 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10345 return DAG.getMergeValues(Ops, DL);
10346 }
10347
10348 return SDValue();
10349}
10350
10351SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10352 EVT VT = Op.getValueType();
10353 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10354 VT.getSizeInBits() == 512)
10355 return splitTernaryVectorOp(Op, DAG);
10356
10357 assert(VT.getSizeInBits() == 64);
10358
10359 SDLoc DL(Op);
10360 SDValue Cond = Op.getOperand(0);
10361
10362 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10363 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10364
10365 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10366 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10367
10368 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10369 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10370
10371 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10372
10373 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10374 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10375
10376 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10377
10378 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10379 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10380}
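
// Illustrative sketch of the 64-bit select split built above: the condition is
// applied independently to the low and high 32-bit halves and the result is
// reassembled, which is what the v2i32 bitcast plus two selects express in the DAG.
#include <cstdint>

static inline uint64_t exampleSelect64(bool Cond, uint64_t TVal, uint64_t FVal) {
  uint32_t Lo = Cond ? (uint32_t)TVal : (uint32_t)FVal;                  // low-half select
  uint32_t Hi = Cond ? (uint32_t)(TVal >> 32) : (uint32_t)(FVal >> 32);  // high-half select
  return ((uint64_t)Hi << 32) | Lo;                                      // rebuild the 64-bit value
}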
10381
10382// Catch division cases where we can use shortcuts with rcp and rsq
10383// instructions.
10384SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10385 SelectionDAG &DAG) const {
10386 SDLoc SL(Op);
10387 SDValue LHS = Op.getOperand(0);
10388 SDValue RHS = Op.getOperand(1);
10389 EVT VT = Op.getValueType();
10390 const SDNodeFlags Flags = Op->getFlags();
10391
10392  bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10393                            DAG.getTarget().Options.UnsafeFPMath;
10394
10395 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10396 // Without !fpmath accuracy information, we can't do more because we don't
10397 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10398 // f16 is always accurate enough
10399 if (!AllowInaccurateRcp && VT != MVT::f16)
10400 return SDValue();
10401
10402 if (CLHS->isExactlyValue(1.0)) {
10403      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10404      // the CI documentation have a worst-case error of 1 ulp.
10405      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10406      // use it as long as we aren't trying to use denormals.
10407      //
10408      // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10409
10410 // 1.0 / sqrt(x) -> rsq(x)
10411
10412 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10413 // error seems really high at 2^29 ULP.
10414 // 1.0 / x -> rcp(x)
10415 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10416 }
10417
10418 // Same as for 1.0, but expand the sign out of the constant.
10419 if (CLHS->isExactlyValue(-1.0)) {
10420 // -1.0 / x -> rcp (fneg x)
10421 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10422 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10423 }
10424 }
10425
10426 // For f16 require afn or arcp.
10427 // For f32 require afn.
10428 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10429 return SDValue();
10430
10431 // Turn into multiply by the reciprocal.
10432 // x / y -> x * (1.0 / y)
10433 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10434 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10435}
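
// Illustrative sketch of the shortcuts above, with 'approxRcp' standing in for
// the hardware v_rcp instruction. This is a sketch, not the real lowering; the
// accuracy caveats in the comments above still apply.
static inline float exampleFastUnsafeFDiv(float LHS, float RHS,
                                          float (*approxRcp)(float)) {
  if (LHS == 1.0f)
    return approxRcp(RHS);        // 1.0 / x  -> rcp(x)
  if (LHS == -1.0f)
    return approxRcp(-RHS);       // -1.0 / x -> rcp(-x)
  return LHS * approxRcp(RHS);    // x / y    -> x * (1.0 / y)
}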
10436
10437SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10438 SelectionDAG &DAG) const {
10439 SDLoc SL(Op);
10440 SDValue X = Op.getOperand(0);
10441 SDValue Y = Op.getOperand(1);
10442 EVT VT = Op.getValueType();
10443 const SDNodeFlags Flags = Op->getFlags();
10444
10445  bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10446                            DAG.getTarget().Options.UnsafeFPMath;
10447 if (!AllowInaccurateDiv)
10448 return SDValue();
10449
10450 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10451 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10452
10453 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10454 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10455
10456 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10457 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10458 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10459 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10460 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10461 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10462}
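
// Illustrative sketch of the refinement sequence built above, written with
// scalar std::fma. 'approxRcp' stands in for the v_rcp_f64 approximation; the
// rest follows the node-for-node structure of the DAG constructed here.
#include <cmath>

static inline double exampleFastUnsafeFDiv64(double X, double Y,
                                             double (*approxRcp)(double)) {
  double NegY = -Y;
  double R = approxRcp(Y);               // initial estimate of 1/Y
  double Tmp0 = std::fma(NegY, R, 1.0);  // error term 1 - Y*R
  R = std::fma(Tmp0, R, R);              // first Newton-Raphson refinement
  double Tmp1 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp1, R, R);              // second refinement
  double Ret = X * R;                    // tentative quotient
  double Tmp2 = std::fma(NegY, Ret, X);  // residual X - Y*Ret
  return std::fma(Tmp2, R, Ret);         // final correction
}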
10463
10464static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10465 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10466 SDNodeFlags Flags) {
10467 if (GlueChain->getNumValues() <= 1) {
10468 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10469 }
10470
10471 assert(GlueChain->getNumValues() == 3);
10472
10473 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10474 switch (Opcode) {
10475 default: llvm_unreachable("no chain equivalent for opcode");
10476 case ISD::FMUL:
10477 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10478 break;
10479 }
10480
10481 return DAG.getNode(Opcode, SL, VTList,
10482 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10483 Flags);
10484}
10485
10486static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10487 EVT VT, SDValue A, SDValue B, SDValue C,
10488 SDValue GlueChain, SDNodeFlags Flags) {
10489 if (GlueChain->getNumValues() <= 1) {
10490 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10491 }
10492
10493 assert(GlueChain->getNumValues() == 3);
10494
10495 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10496 switch (Opcode) {
10497 default: llvm_unreachable("no chain equivalent for opcode");
10498 case ISD::FMA:
10499 Opcode = AMDGPUISD::FMA_W_CHAIN;
10500 break;
10501 }
10502
10503 return DAG.getNode(Opcode, SL, VTList,
10504 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10505 Flags);
10506}
10507
10508SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10509 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10510 return FastLowered;
10511
10512 SDLoc SL(Op);
10513 SDValue Src0 = Op.getOperand(0);
10514 SDValue Src1 = Op.getOperand(1);
10515
10516 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10517 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10518
10519 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10520 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10521
10522 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10523 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10524
10525 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10526}
10527
10528// Faster 2.5 ULP division that does not support denormals.
10529SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10530 SDNodeFlags Flags = Op->getFlags();
10531 SDLoc SL(Op);
10532 SDValue LHS = Op.getOperand(1);
10533 SDValue RHS = Op.getOperand(2);
10534
10535 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10536
10537 const APFloat K0Val(0x1p+96f);
10538 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10539
10540 const APFloat K1Val(0x1p-32f);
10541 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10542
10543 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10544
10545 EVT SetCCVT =
10546 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10547
10548 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10549
10550 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10551
10552 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10553
10554 // rcp does not support denormals.
10555 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10556
10557 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10558
10559 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10560}
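
// Illustrative sketch of the scaling trick above: when |rhs| exceeds 2^96 the
// denominator is pre-scaled by 2^-32 so the reciprocal stays representable,
// and the same factor rescales the final product. 'approxRcp' stands in for
// v_rcp_f32 (denormals flushed); the overall result is the ~2.5 ulp division
// described above.
#include <cmath>

static inline float exampleFDivFast(float LHS, float RHS,
                                    float (*approxRcp)(float)) {
  float Scale = (std::fabs(RHS) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float ScaledRHS = RHS * Scale;       // keep the rcp operand in range
  float Recip = approxRcp(ScaledRHS);
  return Scale * (LHS * Recip);        // undo the pre-scaling
}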
10561
10562// Returns immediate value for setting the F32 denorm mode when using the
10563// S_DENORM_MODE instruction.
10564static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10565                                    const SIMachineFunctionInfo *Info,
10566 const GCNSubtarget *ST) {
10567 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10568 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10569 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10570 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10571}
10572
10573SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10574 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10575 return FastLowered;
10576
10577  // The selection matcher assumes that anything with a chain selects to a
10578  // mayRaiseFPException machine instruction. Since we're introducing a chain
10579 // here, we need to explicitly report nofpexcept for the regular fdiv
10580 // lowering.
10581 SDNodeFlags Flags = Op->getFlags();
10582 Flags.setNoFPExcept(true);
10583
10584 SDLoc SL(Op);
10585 SDValue LHS = Op.getOperand(0);
10586 SDValue RHS = Op.getOperand(1);
10587
10588 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10589
10590 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10591
10592 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10593 {RHS, RHS, LHS}, Flags);
10594 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10595 {LHS, RHS, LHS}, Flags);
10596
10597 // Denominator is scaled to not be denormal, so using rcp is ok.
10598 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10599 DenominatorScaled, Flags);
10600 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10601 DenominatorScaled, Flags);
10602
10603 using namespace AMDGPU::Hwreg;
10604 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10605 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10606
10607 const MachineFunction &MF = DAG.getMachineFunction();
10608  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10609  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10610
10611 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10612 const bool HasDynamicDenormals =
10613 (DenormMode.Input == DenormalMode::Dynamic) ||
10614 (DenormMode.Output == DenormalMode::Dynamic);
10615
10616 SDValue SavedDenormMode;
10617
10618 if (!PreservesDenormals) {
10619 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10620 // lowering. The chain dependence is insufficient, and we need glue. We do
10621 // not need the glue variants in a strictfp function.
10622
10623 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10624
10625 SDValue Glue = DAG.getEntryNode();
10626 if (HasDynamicDenormals) {
10627 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10628 DAG.getVTList(MVT::i32, MVT::Glue),
10629 {BitField, Glue});
10630 SavedDenormMode = SDValue(GetReg, 0);
10631
10632 Glue = DAG.getMergeValues(
10633 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10634 }
10635
10636 SDNode *EnableDenorm;
10637 if (Subtarget->hasDenormModeInst()) {
10638 const SDValue EnableDenormValue =
10639 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10640
10641 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10642 EnableDenormValue)
10643 .getNode();
10644 } else {
10645 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10646 SL, MVT::i32);
10647 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10648 {EnableDenormValue, BitField, Glue});
10649 }
10650
10651 SDValue Ops[3] = {
10652 NegDivScale0,
10653 SDValue(EnableDenorm, 0),
10654 SDValue(EnableDenorm, 1)
10655 };
10656
10657 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10658 }
10659
10660 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10661 ApproxRcp, One, NegDivScale0, Flags);
10662
10663 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10664 ApproxRcp, Fma0, Flags);
10665
10666 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10667 Fma1, Fma1, Flags);
10668
10669 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10670 NumeratorScaled, Mul, Flags);
10671
10672 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10673 Fma2, Fma1, Mul, Fma2, Flags);
10674
10675 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10676 NumeratorScaled, Fma3, Flags);
10677
10678 if (!PreservesDenormals) {
10679 SDNode *DisableDenorm;
10680 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10681 const SDValue DisableDenormValue = getSPDenormModeValue(
10682 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10683
10684 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10685 Fma4.getValue(1), DisableDenormValue,
10686 Fma4.getValue(2)).getNode();
10687 } else {
10688 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10689 const SDValue DisableDenormValue =
10690 HasDynamicDenormals
10691 ? SavedDenormMode
10692 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10693
10694 DisableDenorm = DAG.getMachineNode(
10695 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10696 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10697 }
10698
10699 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10700 SDValue(DisableDenorm, 0), DAG.getRoot());
10701 DAG.setRoot(OutputChain);
10702 }
10703
10704 SDValue Scale = NumeratorScaled.getValue(1);
10705 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10706 {Fma4, Fma1, Fma3, Scale}, Flags);
10707
10708 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10709}
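
// Illustrative sketch of what the FMA chain above computes for already
// well-scaled operands. The DIV_SCALE pre-scaling, the denormal-mode toggling
// and the DIV_FMAS/DIV_FIXUP post-processing of the real lowering are omitted,
// so this only mirrors the refinement structure, not the full special-case
// handling. 'approxRcp' stands in for v_rcp_f32.
#include <cmath>

static inline float exampleFDiv32RefinementCore(float Num, float Den,
                                                float (*approxRcp)(float)) {
  float NegDen = -Den;
  float R = approxRcp(Den);
  float Fma0 = std::fma(NegDen, R, 1.0f);    // reciprocal error
  float Fma1 = std::fma(Fma0, R, R);         // refined reciprocal
  float Mul = Num * Fma1;                    // first quotient estimate
  float Fma2 = std::fma(NegDen, Mul, Num);   // residual
  float Fma3 = std::fma(Fma2, Fma1, Mul);    // corrected quotient
  float Fma4 = std::fma(NegDen, Fma3, Num);  // remaining residual
  return std::fma(Fma4, Fma1, Fma3);         // what DIV_FMAS computes (scale bit ignored)
}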
10710
10711SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10712 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10713 return FastLowered;
10714
10715 SDLoc SL(Op);
10716 SDValue X = Op.getOperand(0);
10717 SDValue Y = Op.getOperand(1);
10718
10719 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10720
10721 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10722
10723 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10724
10725 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10726
10727 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10728
10729 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10730
10731 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10732
10733 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10734
10735 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10736
10737 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10738 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10739
10740 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10741 NegDivScale0, Mul, DivScale1);
10742
10743 SDValue Scale;
10744
10745 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10746 // Workaround a hardware bug on SI where the condition output from div_scale
10747 // is not usable.
10748
10749 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10750
10751    // Figure out which scale to use for div_fmas.
10752 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10753 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10754 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10755 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10756
10757 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10758 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10759
10760 SDValue Scale0Hi
10761 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10762 SDValue Scale1Hi
10763 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10764
10765 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10766 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10767 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10768 } else {
10769 Scale = DivScale1.getValue(1);
10770 }
10771
10772 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10773 Fma4, Fma3, Mul, Scale);
10774
10775 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10776}
10777
10778SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10779 EVT VT = Op.getValueType();
10780
10781 if (VT == MVT::f32)
10782 return LowerFDIV32(Op, DAG);
10783
10784 if (VT == MVT::f64)
10785 return LowerFDIV64(Op, DAG);
10786
10787 if (VT == MVT::f16)
10788 return LowerFDIV16(Op, DAG);
10789
10790 llvm_unreachable("Unexpected type for fdiv");
10791}
10792
10793SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10794 SDLoc dl(Op);
10795 SDValue Val = Op.getOperand(0);
10796 EVT VT = Val.getValueType();
10797 EVT ResultExpVT = Op->getValueType(1);
10798 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10799
10800  SDValue Mant = DAG.getNode(
10801      ISD::INTRINSIC_WO_CHAIN, dl, VT,
10802 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10803
10804 SDValue Exp = DAG.getNode(
10805 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10806 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10807
10808 if (Subtarget->hasFractBug()) {
10809 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10810    SDValue Inf = DAG.getConstantFP(
10811        APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10812
10813 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10814 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10815 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10816 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10817 }
10818
10819 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10820 return DAG.getMergeValues({Mant, CastExp}, dl);
10821}
10822
10823SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10824 SDLoc DL(Op);
10825 StoreSDNode *Store = cast<StoreSDNode>(Op);
10826 EVT VT = Store->getMemoryVT();
10827
10828 if (VT == MVT::i1) {
10829 return DAG.getTruncStore(Store->getChain(), DL,
10830 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10831 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10832 }
10833
10834 assert(VT.isVector() &&
10835 Store->getValue().getValueType().getScalarType() == MVT::i32);
10836
10837 unsigned AS = Store->getAddressSpace();
10838 if (Subtarget->hasLDSMisalignedBug() &&
10839 AS == AMDGPUAS::FLAT_ADDRESS &&
10840 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10841 return SplitVectorStore(Op, DAG);
10842 }
10843
10844  MachineFunction &MF = DAG.getMachineFunction();
10845  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10846  // If there is a possibility that a flat instruction may access scratch
10847  // memory, then we need to use the same legalization rules we use for private.
10848  if (AS == AMDGPUAS::FLAT_ADDRESS &&
10849      !Subtarget->hasMultiDwordFlatScratchAddressing())
10850    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10851         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10852
10853 unsigned NumElements = VT.getVectorNumElements();
10854 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10855 AS == AMDGPUAS::FLAT_ADDRESS) {
10856 if (NumElements > 4)
10857 return SplitVectorStore(Op, DAG);
10858 // v3 stores not supported on SI.
10859 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10860 return SplitVectorStore(Op, DAG);
10861
10862    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10863                                        VT, *Store->getMemOperand()))
10864 return expandUnalignedStore(Store, DAG);
10865
10866 return SDValue();
10867 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10868 switch (Subtarget->getMaxPrivateElementSize()) {
10869 case 4:
10870 return scalarizeVectorStore(Store, DAG);
10871 case 8:
10872 if (NumElements > 2)
10873 return SplitVectorStore(Op, DAG);
10874 return SDValue();
10875 case 16:
10876 if (NumElements > 4 ||
10877 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10878 return SplitVectorStore(Op, DAG);
10879 return SDValue();
10880 default:
10881 llvm_unreachable("unsupported private_element_size");
10882 }
10883 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10884 unsigned Fast = 0;
10885 auto Flags = Store->getMemOperand()->getFlags();
10886    if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10887                                           Store->getAlign(), Flags, &Fast) &&
10888 Fast > 1)
10889 return SDValue();
10890
10891 if (VT.isVector())
10892 return SplitVectorStore(Op, DAG);
10893
10894 return expandUnalignedStore(Store, DAG);
10895 }
10896
10897 // Probably an invalid store. If so we'll end up emitting a selection error.
10898 return SDValue();
10899}
10900
10901// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10902SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10903 SDLoc SL(Op);
10904 assert(!Subtarget->has16BitInsts());
10905 SDNodeFlags Flags = Op->getFlags();
10906 SDValue Ext =
10907 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10908
10909 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10910 SDValue Sqrt =
10911 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10912
10913 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10914 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10915}
10916
10917SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10918 SDLoc DL(Op);
10919 SDNodeFlags Flags = Op->getFlags();
10920 MVT VT = Op.getValueType().getSimpleVT();
10921 const SDValue X = Op.getOperand(0);
10922
10923 if (allowApproxFunc(DAG, Flags)) {
10924 // Instruction is 1ulp but ignores denormals.
10925    return DAG.getNode(
10926        ISD::INTRINSIC_WO_CHAIN, DL, VT,
10927 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10928 }
10929
10930 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10931 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10932
10933 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10934
10935 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10936
10937 SDValue SqrtX =
10938 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
10939
10940 SDValue SqrtS;
10941 if (needsDenormHandlingF32(DAG, X, Flags)) {
10942 SDValue SqrtID =
10943 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
10944 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
10945
10946 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
10947 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10948 DAG.getConstant(-1, DL, MVT::i32));
10949 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
10950
10951 SDValue NegSqrtSNextDown =
10952 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
10953
10954 SDValue SqrtVP =
10955 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
10956
10957 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10958 DAG.getConstant(1, DL, MVT::i32));
10959 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
10960
10961 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
10962 SDValue SqrtVS =
10963 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
10964
10965 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
10966 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
10967
10968 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
10969 Flags);
10970
10971 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
10972 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
10973 Flags);
10974 } else {
10975 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
10976
10977 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
10978
10979 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
10980 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
10981 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
10982
10983 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
10984 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
10985 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
10986
10987 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
10988 SDValue SqrtD =
10989 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
10990 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
10991 }
10992
10993 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
10994
10995 SDValue ScaledDown =
10996 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
10997
10998 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
10999 SDValue IsZeroOrInf =
11000 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11001 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11002
11003 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11004}
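
// Illustrative sketch of the range-scaling wrapper above: inputs below 2^-96
// are multiplied by 2^32 before the approximate sqrt and the result is scaled
// back by 2^-16, since sqrt(x * 2^32) == sqrt(x) * 2^16. 'approxSqrt' stands
// in for the hardware sqrt plus the +/-1 ulp neighbour correction performed
// when denormal handling is required.
#include <cmath>

static inline float exampleScaledSqrtF32(float X, float (*approxSqrt)(float)) {
  bool NeedScale = X < 0x1.0p-96f;
  float ScaledX = NeedScale ? X * 0x1.0p+32f : X;
  float S = approxSqrt(ScaledX);
  float Result = NeedScale ? S * 0x1.0p-16f : S;
  // +0.0, -0.0 and +inf are passed through unchanged, as in the lowering.
  return (X == 0.0f || (std::isinf(X) && X > 0.0f)) ? X : Result;
}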
11005
11006SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11007  // For the double type, the SQRT and RSQ instructions don't have the required
11008  // precision, so we apply Goldschmidt's algorithm to improve the result:
11009 //
11010 // y0 = rsq(x)
11011 // g0 = x * y0
11012 // h0 = 0.5 * y0
11013 //
11014 // r0 = 0.5 - h0 * g0
11015 // g1 = g0 * r0 + g0
11016 // h1 = h0 * r0 + h0
11017 //
11018 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11019 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11020 // h2 = h1 * r1 + h1
11021 //
11022 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11023 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11024 //
11025 // sqrt(x) = g3
11026
11027 SDNodeFlags Flags = Op->getFlags();
11028
11029 SDLoc DL(Op);
11030
11031 SDValue X = Op.getOperand(0);
11032 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11033
11034 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11035
11036 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11037
11038 // Scale up input if it is too small.
11039 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11040 SDValue ScaleUp =
11041 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11042 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11043
11044 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11045
11046 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11047
11048 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11049 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11050
11051 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11052 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11053
11054 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11055
11056 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11057
11058 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11059 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11060
11061 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11062
11063 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11064 SDValue SqrtD1 =
11065 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11066
11067 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11068
11069 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11070 SDValue ScaleDown =
11071 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11072 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11073
11074 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11075 // with finite only or nsz because rsq(+/-0) = +/-inf
11076
11077 // TODO: Check for DAZ and expand to subnormals
11078 SDValue IsZeroOrInf =
11079 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11080 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11081
11082 // If x is +INF, +0, or -0, use its original value
11083 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11084 Flags);
11085}
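
// Illustrative sketch of the Goldschmidt iteration described in the comment
// above, written with scalar std::fma. 'approxRsq' stands in for v_rsq_f64;
// the ldexp-based scaling of tiny inputs and the zero/inf passthrough follow
// the same shape as the lowering.
#include <cmath>

static inline double exampleSqrtF64(double X, double (*approxRsq)(double)) {
  bool Scale = X < 0x1.0p-767;
  double SqrtX = Scale ? std::ldexp(X, 256) : X;  // scale up tiny inputs

  double Y = approxRsq(SqrtX);           // y0 = rsq(x)
  double G = SqrtX * Y;                  // g0 = x * y0
  double H = 0.5 * Y;                    // h0 = 0.5 * y0

  double R = std::fma(-H, G, 0.5);       // r0 = 0.5 - h0 * g0
  double G1 = std::fma(G, R, G);         // g1 = g0 * r0 + g0
  double H1 = std::fma(H, R, H);         // h1 = h0 * r0 + h0

  double D0 = std::fma(-G1, G1, SqrtX);  // d0 = x - g1 * g1
  double G2 = std::fma(D0, H1, G1);      // g2 = d0 * h1 + g1

  double D1 = std::fma(-G2, G2, SqrtX);  // d1 = x - g2 * g2
  double Ret = std::fma(D1, H1, G2);     // g3 = d1 * h1 + g2

  Ret = Scale ? std::ldexp(Ret, -128) : Ret;  // undo the scaling
  // +0.0, -0.0 and +inf keep their original value, as in the lowering.
  return (SqrtX == 0.0 || (std::isinf(SqrtX) && SqrtX > 0.0)) ? SqrtX : Ret;
}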
11086
11087SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11088 SDLoc DL(Op);
11089 EVT VT = Op.getValueType();
11090 SDValue Arg = Op.getOperand(0);
11091 SDValue TrigVal;
11092
11093 // Propagate fast-math flags so that the multiply we introduce can be folded
11094 // if Arg is already the result of a multiply by constant.
11095 auto Flags = Op->getFlags();
11096
11097 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11098
11099 if (Subtarget->hasTrigReducedRange()) {
11100 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11101 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11102 } else {
11103 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11104 }
11105
11106 switch (Op.getOpcode()) {
11107 case ISD::FCOS:
11108 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11109 case ISD::FSIN:
11110 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11111 default:
11112 llvm_unreachable("Wrong trig opcode");
11113 }
11114}
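
// Illustrative sketch of the range reduction above: the argument is converted
// to revolutions (multiplied by 1/(2*pi)) and, on subtargets with a
// reduced-range trig unit, wrapped into [0, 1) with a fract before the
// hardware sin/cos, which take their input in revolutions. std::sin is used
// here only to model what SIN_HW evaluates.
#include <cmath>

static inline float exampleReducedRangeSin(float Arg, bool HasTrigReducedRange) {
  const float TwoPi = 6.28318530717958647692f;
  float Turns = Arg * (1.0f / TwoPi);     // convert radians to revolutions
  if (HasTrigReducedRange)
    Turns = Turns - std::floor(Turns);    // FRACT: wrap into [0, 1)
  return std::sin(Turns * TwoPi);         // what SIN_HW then computes
}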
11115
11116SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11117 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11118 assert(AtomicNode->isCompareAndSwap());
11119 unsigned AS = AtomicNode->getAddressSpace();
11120
11121 // No custom lowering required for local address space
11122  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11123    return Op;
11124
11125 // Non-local address space requires custom lowering for atomic compare
11126 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11127 SDLoc DL(Op);
11128 SDValue ChainIn = Op.getOperand(0);
11129 SDValue Addr = Op.getOperand(1);
11130 SDValue Old = Op.getOperand(2);
11131 SDValue New = Op.getOperand(3);
11132 EVT VT = Op.getValueType();
11133 MVT SimpleVT = VT.getSimpleVT();
11134 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11135
11136 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11137 SDValue Ops[] = { ChainIn, Addr, NewOld };
11138
11139 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11140 Ops, VT, AtomicNode->getMemOperand());
11141}
11142
11143//===----------------------------------------------------------------------===//
11144// Custom DAG optimizations
11145//===----------------------------------------------------------------------===//
11146
11147SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11148 DAGCombinerInfo &DCI) const {
11149 EVT VT = N->getValueType(0);
11150 EVT ScalarVT = VT.getScalarType();
11151 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11152 return SDValue();
11153
11154 SelectionDAG &DAG = DCI.DAG;
11155 SDLoc DL(N);
11156
11157 SDValue Src = N->getOperand(0);
11158 EVT SrcVT = Src.getValueType();
11159
11160 // TODO: We could try to match extracting the higher bytes, which would be
11161 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11162 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11163 // about in practice.
11164 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11165 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11166 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11167 DCI.AddToWorklist(Cvt.getNode());
11168
11169 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11170 if (ScalarVT != MVT::f32) {
11171 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11172 DAG.getTargetConstant(0, DL, MVT::i32));
11173 }
11174 return Cvt;
11175 }
11176 }
11177
11178 return SDValue();
11179}
11180
11181SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11182 DAGCombinerInfo &DCI) const {
11183 SDValue MagnitudeOp = N->getOperand(0);
11184 SDValue SignOp = N->getOperand(1);
11185 SelectionDAG &DAG = DCI.DAG;
11186 SDLoc DL(N);
11187
11188 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11189 // lower half with a copy.
11190 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11191 if (MagnitudeOp.getValueType() == MVT::f64) {
11192 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11193 SDValue MagLo =
11194 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11195 DAG.getConstant(0, DL, MVT::i32));
11196 SDValue MagHi =
11197 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11198 DAG.getConstant(1, DL, MVT::i32));
11199
11200 SDValue HiOp =
11201 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11202
11203 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11204
11205 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11206 }
11207
11208 if (SignOp.getValueType() != MVT::f64)
11209 return SDValue();
11210
11211 // Reduce width of sign operand, we only need the highest bit.
11212 //
11213 // fcopysign f64:x, f64:y ->
11214 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11215 // TODO: In some cases it might make sense to go all the way to f16.
11216 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11217 SDValue SignAsF32 =
11218 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11219 DAG.getConstant(1, DL, MVT::i32));
11220
11221 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11222 SignAsF32);
11223}
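
// Illustrative sketch of the combine above: an f64 copysign only needs to
// touch the high 32 bits of the magnitude, so the low half is copied through
// unchanged and an f32-style copysign is applied to the high half.
#include <cstdint>
#include <cstring>

static inline double exampleCopySignF64ViaHi32(double Mag, float SignSrc) {
  uint64_t MagBits;
  uint32_t SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  std::memcpy(&SignBits, &SignSrc, sizeof(SignBits));
  uint32_t Hi = (uint32_t)(MagBits >> 32);
  Hi = (Hi & 0x7fffffffu) | (SignBits & 0x80000000u);          // f32 copysign on hi32
  MagBits = (MagBits & 0xffffffffull) | ((uint64_t)Hi << 32);  // keep the low half
  double Result;
  std::memcpy(&Result, &MagBits, sizeof(Result));
  return Result;
}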
11224
11225// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11226// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11227// bits
11228
11229// This is a variant of
11230// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11231//
11232// The normal DAG combiner will do this, but only if the add has one use since
11233// that would increase the number of instructions.
11234//
11235// This prevents us from seeing a constant offset that can be folded into a
11236// memory instruction's addressing mode. If we know the resulting add offset of
11237// a pointer can be folded into an addressing offset, we can replace the pointer
11238// operand with the add of new constant offset. This eliminates one of the uses,
11239// and may allow the remaining use to also be simplified.
11240//
11241SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11242 unsigned AddrSpace,
11243 EVT MemVT,
11244 DAGCombinerInfo &DCI) const {
11245 SDValue N0 = N->getOperand(0);
11246 SDValue N1 = N->getOperand(1);
11247
11248 // We only do this to handle cases where it's profitable when there are
11249 // multiple uses of the add, so defer to the standard combine.
11250 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11251 N0->hasOneUse())
11252 return SDValue();
11253
11254 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11255 if (!CN1)
11256 return SDValue();
11257
11258 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11259 if (!CAdd)
11260 return SDValue();
11261
11262 SelectionDAG &DAG = DCI.DAG;
11263
11264 if (N0->getOpcode() == ISD::OR &&
11265 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11266 return SDValue();
11267
11268 // If the resulting offset is too large, we can't fold it into the
11269 // addressing mode offset.
11270 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11271 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11272
11273 AddrMode AM;
11274 AM.HasBaseReg = true;
11275 AM.BaseOffs = Offset.getSExtValue();
11276 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11277 return SDValue();
11278
11279 SDLoc SL(N);
11280 EVT VT = N->getValueType(0);
11281
11282 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11283 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11284
11285  SDNodeFlags Flags;
11286  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11287 (N0.getOpcode() == ISD::OR ||
11288 N0->getFlags().hasNoUnsignedWrap()));
11289
11290 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11291}
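
// Illustrative sketch of the reassociation above: shifting an add (or a
// disjoint or) by a constant is rewritten so the shifted constant becomes a
// separate immediate that the addressing mode can later absorb. Modulo 2^64,
//   (X + C1) << C2  ==  (X << C2) + (C1 << C2).
#include <cstdint>

static inline uint64_t exampleShlOfAddSplit(uint64_t X, uint64_t C1, unsigned C2) {
  return (X << C2) + (C1 << C2);  // add (shl x, c2), (shl c1, c2)
}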
11292
11293/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
11294/// is offset by the chain and the intrinsic ID. Theoretically we would also
11295/// need to check the specific intrinsic, but they all place the pointer operand first.
11296static unsigned getBasePtrIndex(const MemSDNode *N) {
11297 switch (N->getOpcode()) {
11298  case ISD::STORE:
11299  case ISD::INTRINSIC_W_CHAIN:
11300  case ISD::INTRINSIC_VOID:
11301 return 2;
11302 default:
11303 return 1;
11304 }
11305}
11306
11307SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11308 DAGCombinerInfo &DCI) const {
11309 SelectionDAG &DAG = DCI.DAG;
11310 SDLoc SL(N);
11311
11312 unsigned PtrIdx = getBasePtrIndex(N);
11313 SDValue Ptr = N->getOperand(PtrIdx);
11314
11315 // TODO: We could also do this for multiplies.
11316 if (Ptr.getOpcode() == ISD::SHL) {
11317 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11318 N->getMemoryVT(), DCI);
11319 if (NewPtr) {
11320 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11321
11322 NewOps[PtrIdx] = NewPtr;
11323 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11324 }
11325 }
11326
11327 return SDValue();
11328}
11329
11330static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11331 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11332 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11333 (Opc == ISD::XOR && Val == 0);
11334}
11335
11336// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
11337// ops. This will typically happen anyway for a VALU 64-bit and. This exposes other
11338// 32-bit integer combine opportunities since most 64-bit operations are decomposed
11339// this way. TODO: We won't want this for SALU especially if it is an inline
11340// immediate.
11341SDValue SITargetLowering::splitBinaryBitConstantOp(
11342 DAGCombinerInfo &DCI,
11343 const SDLoc &SL,
11344 unsigned Opc, SDValue LHS,
11345 const ConstantSDNode *CRHS) const {
11346 uint64_t Val = CRHS->getZExtValue();
11347 uint32_t ValLo = Lo_32(Val);
11348 uint32_t ValHi = Hi_32(Val);
11349  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11350
11351 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11352 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11353 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11354 // If we need to materialize a 64-bit immediate, it will be split up later
11355 // anyway. Avoid creating the harder to understand 64-bit immediate
11356 // materialization.
11357 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11358 }
11359
11360 return SDValue();
11361}
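
// Illustrative sketch of the split above for a 64-bit AND with a constant:
// the operation is performed independently on the low and high 32-bit halves,
// which is what the VALU would end up doing anyway.
#include <cstdint>

static inline uint64_t exampleSplitAnd64(uint64_t LHS, uint64_t C) {
  uint32_t Lo = (uint32_t)LHS & (uint32_t)C;                  // and lo32, Lo_32(C)
  uint32_t Hi = (uint32_t)(LHS >> 32) & (uint32_t)(C >> 32);  // and hi32, Hi_32(C)
  return ((uint64_t)Hi << 32) | Lo;
}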
11362
11363static bool isBoolSGPR(SDValue V) {
11364  if (V.getValueType() != MVT::i1)
11365 return false;
11366 switch (V.getOpcode()) {
11367 default:
11368 break;
11369  case ISD::SETCC:
11370  case AMDGPUISD::FP_CLASS:
11371 return true;
11372 case ISD::AND:
11373 case ISD::OR:
11374 case ISD::XOR:
11375 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11376 }
11377 return false;
11378}
11379
11380// If a constant has all zeroes or all ones within each byte, return it.
11381// Otherwise return 0.
11382static uint32_t getConstantPermuteMask(uint32_t C) {
11383  // 0xff for any zero byte in the mask
11384 uint32_t ZeroByteMask = 0;
11385 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11386 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11387 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11388 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11389 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11390 if ((NonZeroByteMask & C) != NonZeroByteMask)
11391 return 0; // Partial bytes selected.
11392 return C;
11393}
11394
11395// Check if a node selects whole bytes from its operand 0 starting at a byte
11396// boundary while masking the rest. Returns the select mask as used by v_perm_b32,
11397// or ~0 if it does not.
11398// Note byte select encoding:
11399// value 0-3 selects corresponding source byte;
11400// value 0xc selects zero;
11401// value 0xff selects 0xff.
11402static uint32_t getPermuteMask(SDValue V) {
11403  assert(V.getValueSizeInBits() == 32);
11404
11405 if (V.getNumOperands() != 2)
11406 return ~0;
11407
11408 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11409 if (!N1)
11410 return ~0;
11411
11412 uint32_t C = N1->getZExtValue();
11413
11414 switch (V.getOpcode()) {
11415 default:
11416 break;
11417 case ISD::AND:
11418 if (uint32_t ConstMask = getConstantPermuteMask(C))
11419 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11420 break;
11421
11422 case ISD::OR:
11423 if (uint32_t ConstMask = getConstantPermuteMask(C))
11424 return (0x03020100 & ~ConstMask) | ConstMask;
11425 break;
11426
11427 case ISD::SHL:
11428 if (C % 8)
11429 return ~0;
11430
11431 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11432
11433 case ISD::SRL:
11434 if (C % 8)
11435 return ~0;
11436
11437 return uint32_t(0x0c0c0c0c03020100ull >> C);
11438 }
11439
11440 return ~0;
11441}
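
// Illustrative sketch of how a selector produced by the helpers above drives
// the byte permute. This is a simplified model that only covers the selector
// values these combines generate (0-7 byte picks, 0x0c for zero, 0xff for
// all-ones); the real V_PERM_B32 also has sign-replicating selector values
// that are not modelled here, and the 0-3 / 4-7 operand mapping is the one
// implied by the "Add 4 to each active LHS lane" step in performAndCombine below.
#include <cstdint>

static inline uint32_t examplePermB32(uint32_t Sel, uint32_t Op0, uint32_t Op1) {
  uint64_t Pool = ((uint64_t)Op0 << 32) | Op1;  // bytes 0-3 from Op1, 4-7 from Op0
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff;
    uint32_t B;
    if (S <= 7)
      B = (uint32_t)((Pool >> (8 * S)) & 0xff);  // pick a source byte
    else if (S == 0x0c)
      B = 0x00;                                  // selector 0xc -> zero
    else
      B = 0xff;                                  // selector 0xff -> 0xff
    Result |= B << (8 * I);
  }
  return Result;
}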
11442
11443SDValue SITargetLowering::performAndCombine(SDNode *N,
11444 DAGCombinerInfo &DCI) const {
11445 if (DCI.isBeforeLegalize())
11446 return SDValue();
11447
11448 SelectionDAG &DAG = DCI.DAG;
11449 EVT VT = N->getValueType(0);
11450 SDValue LHS = N->getOperand(0);
11451 SDValue RHS = N->getOperand(1);
11452
11453
11454 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11455 if (VT == MVT::i64 && CRHS) {
11456 if (SDValue Split
11457 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11458 return Split;
11459 }
11460
11461 if (CRHS && VT == MVT::i32) {
11462 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11463 // nb = number of trailing zeroes in mask
11464 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11465 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11466 uint64_t Mask = CRHS->getZExtValue();
11467 unsigned Bits = llvm::popcount(Mask);
11468 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11469 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11470 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11471 unsigned Shift = CShift->getZExtValue();
11472 unsigned NB = CRHS->getAPIntValue().countr_zero();
11473 unsigned Offset = NB + Shift;
11474 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11475 SDLoc SL(N);
11476 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11477 LHS->getOperand(0),
11478 DAG.getConstant(Offset, SL, MVT::i32),
11479 DAG.getConstant(Bits, SL, MVT::i32));
11480 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11481 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11482 DAG.getValueType(NarrowVT));
11483 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11484 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11485 return Shl;
11486 }
11487 }
11488 }
11489
11490 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11491 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11492 isa<ConstantSDNode>(LHS.getOperand(2))) {
11493 uint32_t Sel = getConstantPermuteMask(Mask);
11494 if (!Sel)
11495 return SDValue();
11496
11497 // Select 0xc for all zero bytes
11498 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11499 SDLoc DL(N);
11500 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11501 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11502 }
11503 }
11504
11505 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11506 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11507 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11508 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11509 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11510
11511 SDValue X = LHS.getOperand(0);
11512 SDValue Y = RHS.getOperand(0);
11513 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11514 !isTypeLegal(X.getValueType()))
11515 return SDValue();
11516
11517 if (LCC == ISD::SETO) {
11518 if (X != LHS.getOperand(1))
11519 return SDValue();
11520
11521 if (RCC == ISD::SETUNE) {
11522 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11523 if (!C1 || !C1->isInfinity() || C1->isNegative())
11524 return SDValue();
11525
11526        const uint32_t Mask = SIInstrFlags::N_NORMAL |
11527                              SIInstrFlags::P_NORMAL |
11528                              SIInstrFlags::N_SUBNORMAL |
11529                              SIInstrFlags::P_SUBNORMAL |
11530                              SIInstrFlags::N_ZERO |
11531                              SIInstrFlags::P_ZERO;
11532
11533        static_assert(((~(SIInstrFlags::S_NAN |
11534                          SIInstrFlags::Q_NAN |
11535                          SIInstrFlags::N_INFINITY |
11536                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11537 "mask not equal");
11538
11539 SDLoc DL(N);
11540 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11541 X, DAG.getConstant(Mask, DL, MVT::i32));
11542 }
11543 }
11544 }
11545
11546 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11547 std::swap(LHS, RHS);
11548
11549 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11550 RHS.hasOneUse()) {
11551 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11552 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11553 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11554 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11555 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11556 (RHS.getOperand(0) == LHS.getOperand(0) &&
11557 LHS.getOperand(0) == LHS.getOperand(1))) {
11558 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11559 unsigned NewMask = LCC == ISD::SETO ?
11560 Mask->getZExtValue() & ~OrdMask :
11561 Mask->getZExtValue() & OrdMask;
11562
11563 SDLoc DL(N);
11564 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11565 DAG.getConstant(NewMask, DL, MVT::i32));
11566 }
11567 }
11568
11569 if (VT == MVT::i32 &&
11570 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11571 // and x, (sext cc from i1) => select cc, x, 0
11572 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11573 std::swap(LHS, RHS);
11574 if (isBoolSGPR(RHS.getOperand(0)))
11575 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11576 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11577 }
11578
11579 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11580  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11581  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11582 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11583 uint32_t LHSMask = getPermuteMask(LHS);
11584 uint32_t RHSMask = getPermuteMask(RHS);
11585 if (LHSMask != ~0u && RHSMask != ~0u) {
11586 // Canonicalize the expression in an attempt to have fewer unique masks
11587 // and therefore fewer registers used to hold the masks.
11588 if (LHSMask > RHSMask) {
11589 std::swap(LHSMask, RHSMask);
11590 std::swap(LHS, RHS);
11591 }
11592
11593      // Select 0xc for each lane used from the source operand. Zero has the 0xc
11594      // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11595 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11596 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11597
11598      // Check if we need to combine values from two sources within a byte.
11599 if (!(LHSUsedLanes & RHSUsedLanes) &&
11600 // If we select high and lower word keep it for SDWA.
11601 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11602 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11603        // Each byte in each mask is either a selector mask 0-3, or has higher
11604        // bits set in either of the masks, which can be 0xff for 0xff or 0x0c
11605        // for zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11606        // the mask which is not 0xff wins. By ANDing both masks we get a correct
11607        // result, except that 0x0c must be corrected to give 0x0c only.
11608 uint32_t Mask = LHSMask & RHSMask;
11609 for (unsigned I = 0; I < 32; I += 8) {
11610 uint32_t ByteSel = 0xff << I;
11611 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11612 Mask &= (0x0c << I) & 0xffffffff;
11613 }
11614
11615 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11616 // or 0x0c.
11617 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11618 SDLoc DL(N);
11619
11620 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11621 LHS.getOperand(0), RHS.getOperand(0),
11622 DAG.getConstant(Sel, DL, MVT::i32));
11623 }
11624 }
11625 }
11626
11627 return SDValue();
11628}
11629
11630// A key component of v_perm is a mapping between the byte positions of the src
11631// operands and the byte position of the dest. To provide such a mapping, we need:
11632// 1. the node that provides byte x of the dest of the OR, and 2. the byte of that
11633// node used to provide byte x. calculateByteProvider finds which node provides
11634// a certain byte of the dest of the OR, and calculateSrcByte takes that node
11635// and finds the ultimate src and byte position. For example, the supported
11636// LoadCombine pattern for vector loads is as follows:
11637// t1
11638// or
11639// / \
11640// t2 t3
11641// zext shl
11642// | | \
11643// t4 t5 16
11644// or anyext
11645// / \ |
11646// t6 t7 t8
11647// srl shl or
11648// / | / \ / \
11649// t9 t10 t11 t12 t13 t14
11650// trunc* 8 trunc* 8 and and
11651// | | / | | \
11652// t15 t16 t17 t18 t19 t20
11653// trunc* 255 srl -256
11654// | / \
11655// t15 t15 16
11656//
11657// *In this example, the truncs are from i32->i16
11658//
11659// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11660// respectively. calculateSrcByte would find (given node) -> ultimate src &
11661// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11662// After finding the mapping, we can combine the tree into vperm t15, t16,
11663// 0x05000407
11664
11665// Find the source and byte position from a node.
11666// \p DestByte is the byte position of the dest of the or that the src
11667 // ultimately provides. \p SrcIndex is the byte of the src that maps to this
11668 // byte of the dest of the or. \p Depth tracks how many recursive iterations we
11669 // have performed.
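// For example, given (srl X, 16) with SrcIndex == 0, the requested byte
// ultimately comes from byte 2 of X, so the result is X with SrcIndex == 2
// (DestByte is passed through unchanged).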
11670static const std::optional<ByteProvider<SDValue>>
11671calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11672 unsigned Depth = 0) {
11673 // We may need to recursively traverse a series of SRLs
11674 if (Depth >= 6)
11675 return std::nullopt;
11676
11677 if (Op.getValueSizeInBits() < 8)
11678 return std::nullopt;
11679
11680 if (Op.getValueType().isVector())
11681 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11682
11683 switch (Op->getOpcode()) {
11684 case ISD::TRUNCATE: {
11685 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11686 }
11687
11688 case ISD::SIGN_EXTEND:
11689 case ISD::ZERO_EXTEND:
11690 case ISD::SIGN_EXTEND_INREG: {
11691 SDValue NarrowOp = Op->getOperand(0);
11692 auto NarrowVT = NarrowOp.getValueType();
11693 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11694 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11695 NarrowVT = VTSign->getVT();
11696 }
11697 if (!NarrowVT.isByteSized())
11698 return std::nullopt;
11699 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11700
11701 if (SrcIndex >= NarrowByteWidth)
11702 return std::nullopt;
11703 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11704 }
11705
11706 case ISD::SRA:
11707 case ISD::SRL: {
11708 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11709 if (!ShiftOp)
11710 return std::nullopt;
11711
11712 uint64_t BitShift = ShiftOp->getZExtValue();
11713
11714 if (BitShift % 8 != 0)
11715 return std::nullopt;
11716
11717 SrcIndex += BitShift / 8;
11718
11719 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11720 }
11721
11722 default: {
11723 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11724 }
11725 }
11726 llvm_unreachable("fully handled switch");
11727}
11728
11729// For a byte position in the result of an Or, traverse the tree and find the
11730// node (and the byte of the node) which ultimately provides this {Or,
11731// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11732 // the byte position of the Op that corresponds with the originally requested
11733 // byte of the Or. \p Depth tracks how many recursive iterations we have
11734 // performed. \p StartingIndex is the originally requested byte of the Or.
11735static const std::optional<ByteProvider<SDValue>>
11736calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11737 unsigned StartingIndex = 0) {
11738 // Finding Src tree of RHS of or typically requires at least 1 additional
11739 // depth
11740 if (Depth > 6)
11741 return std::nullopt;
11742
11743 unsigned BitWidth = Op.getScalarValueSizeInBits();
11744 if (BitWidth % 8 != 0)
11745 return std::nullopt;
11746 if (Index > BitWidth / 8 - 1)
11747 return std::nullopt;
11748
11749 bool IsVec = Op.getValueType().isVector();
11750 switch (Op.getOpcode()) {
11751 case ISD::OR: {
11752 if (IsVec)
11753 return std::nullopt;
11754
11755 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11756 StartingIndex);
11757 if (!RHS)
11758 return std::nullopt;
11759 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11760 StartingIndex);
11761 if (!LHS)
11762 return std::nullopt;
11763 // A well formed Or will have two ByteProviders for each byte, one of which
11764 // is constant zero
11765 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11766 return std::nullopt;
11767 if (!LHS || LHS->isConstantZero())
11768 return RHS;
11769 if (!RHS || RHS->isConstantZero())
11770 return LHS;
11771 return std::nullopt;
11772 }
11773
11774 case ISD::AND: {
11775 if (IsVec)
11776 return std::nullopt;
11777
11778 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11779 if (!BitMaskOp)
11780 return std::nullopt;
11781
11782 uint32_t BitMask = BitMaskOp->getZExtValue();
11783 // Bits we expect for our StartingIndex
11784 uint32_t IndexMask = 0xFF << (Index * 8);
11785
11786 if ((IndexMask & BitMask) != IndexMask) {
11787 // If the result of the and partially provides the byte, then it
11788 // is not well formatted
11789 if (IndexMask & BitMask)
11790 return std::nullopt;
11791 return ByteProvider<SDValue>::getConstantZero();
11792 }
11793
11794 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11795 }
11796
11797 case ISD::FSHR: {
11798 if (IsVec)
11799 return std::nullopt;
11800
11801 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11802 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11803 if (!ShiftOp || Op.getValueType().isVector())
11804 return std::nullopt;
11805
11806 uint64_t BitsProvided = Op.getValueSizeInBits();
11807 if (BitsProvided % 8 != 0)
11808 return std::nullopt;
11809
11810 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11811 if (BitShift % 8)
11812 return std::nullopt;
11813
11814 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11815 uint64_t ByteShift = BitShift / 8;
11816
11817 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11818 uint64_t BytesProvided = BitsProvided / 8;
11819 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11820 NewIndex %= BytesProvided;
11821 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11822 }
11823
11824 case ISD::SRA:
11825 case ISD::SRL: {
11826 if (IsVec)
11827 return std::nullopt;
11828
11829 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11830 if (!ShiftOp)
11831 return std::nullopt;
11832
11833 uint64_t BitShift = ShiftOp->getZExtValue();
11834 if (BitShift % 8)
11835 return std::nullopt;
11836
11837 auto BitsProvided = Op.getScalarValueSizeInBits();
11838 if (BitsProvided % 8 != 0)
11839 return std::nullopt;
11840
11841 uint64_t BytesProvided = BitsProvided / 8;
11842 uint64_t ByteShift = BitShift / 8;
11843 // The dest of the shift has valid bytes in [0, BytesProvided - ByteShift).
11844 // If the byte we are trying to provide (as tracked by Index) falls in this
11845 // range, then the SRL provides the byte; the byte of interest in the src of
11846 // the SRL is Index + ByteShift.
11847 return BytesProvided - ByteShift > Index
11848 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11849 Index + ByteShift)
11850 : ByteProvider<SDValue>::getConstantZero();
11851 }
11852
11853 case ISD::SHL: {
11854 if (IsVec)
11855 return std::nullopt;
11856
11857 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11858 if (!ShiftOp)
11859 return std::nullopt;
11860
11861 uint64_t BitShift = ShiftOp->getZExtValue();
11862 if (BitShift % 8 != 0)
11863 return std::nullopt;
11864 uint64_t ByteShift = BitShift / 8;
11865
11866 // If we are shifting by an amount greater than (or equal to) the index
11867 // we are trying to provide, then that byte is a constant 0. If not, the
11868 // byte is not definitively 0, and the corresponding byte of interest is
11869 // byte Index - ByteShift of the src.
11870 return Index < ByteShift
11871 ? ByteProvider<SDValue>::getConstantZero()
11872 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11873 Depth + 1, StartingIndex);
11874 }
11875 case ISD::ANY_EXTEND:
11876 case ISD::SIGN_EXTEND:
11877 case ISD::ZERO_EXTEND:
11878 case ISD::SIGN_EXTEND_INREG:
11879 case ISD::AssertZext:
11880 case ISD::AssertSext: {
11881 if (IsVec)
11882 return std::nullopt;
11883
11884 SDValue NarrowOp = Op->getOperand(0);
11885 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11886 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11887 Op->getOpcode() == ISD::AssertZext ||
11888 Op->getOpcode() == ISD::AssertSext) {
11889 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11890 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11891 }
11892 if (NarrowBitWidth % 8 != 0)
11893 return std::nullopt;
11894 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11895
11896 if (Index >= NarrowByteWidth)
11897 return Op.getOpcode() == ISD::ZERO_EXTEND
11898 ? std::optional<ByteProvider<SDValue>>(
11899 ByteProvider<SDValue>::getConstantZero())
11900 : std::nullopt;
11901 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11902 }
11903
11904 case ISD::TRUNCATE: {
11905 if (IsVec)
11906 return std::nullopt;
11907
11908 uint64_t NarrowByteWidth = BitWidth / 8;
11909
11910 if (NarrowByteWidth >= Index) {
11911 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11912 StartingIndex);
11913 }
11914
11915 return std::nullopt;
11916 }
11917
11918 case ISD::CopyFromReg: {
11919 if (BitWidth / 8 > Index)
11920 return calculateSrcByte(Op, StartingIndex, Index);
11921
11922 return std::nullopt;
11923 }
11924
11925 case ISD::LOAD: {
11926 auto L = cast<LoadSDNode>(Op.getNode());
11927
11928 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11929 if (NarrowBitWidth % 8 != 0)
11930 return std::nullopt;
11931 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11932
11933 // If the width of the load does not reach the byte we are trying to
11934 // provide and it is not a ZEXTLOAD, then the load does not provide the
11935 // byte in question.
11936 if (Index >= NarrowByteWidth) {
11937 return L->getExtensionType() == ISD::ZEXTLOAD
11938 ? std::optional<ByteProvider<SDValue>>(
11939 ByteProvider<SDValue>::getConstantZero())
11940 : std::nullopt;
11941 }
11942
11943 if (NarrowByteWidth > Index) {
11944 return calculateSrcByte(Op, StartingIndex, Index);
11945 }
11946
11947 return std::nullopt;
11948 }
11949
11950 case ISD::BSWAP: {
11951 if (IsVec)
11952 return std::nullopt;
11953
11954 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
11955 Depth + 1, StartingIndex);
11956 }
11957
11958 case ISD::EXTRACT_VECTOR_ELT: {
11959 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11960 if (!IdxOp)
11961 return std::nullopt;
11962 auto VecIdx = IdxOp->getZExtValue();
11963 auto ScalarSize = Op.getScalarValueSizeInBits();
11964 if (ScalarSize != 32) {
11965 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
11966 }
11967
11968 return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
11969 StartingIndex, Index);
11970 }
11971
11972 case AMDGPUISD::PERM: {
11973 if (IsVec)
11974 return std::nullopt;
11975
11976 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11977 if (!PermMask)
11978 return std::nullopt;
11979
11980 auto IdxMask =
11981 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
11982 if (IdxMask > 0x07 && IdxMask != 0x0c)
11983 return std::nullopt;
11984
11985 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
11986 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
11987
11988 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
11989 : ByteProvider<SDValue>::getConstantZero();
11990 
11991 }
11992
11993 default: {
11994 return std::nullopt;
11995 }
11996 }
11997
11998 llvm_unreachable("fully handled switch");
11999}
12000
12001 // Returns true if the Operand is a scalar that is (or is extended from) 16 bits
12002static bool isExtendedFrom16Bits(SDValue &Operand) {
12003
12004 switch (Operand.getOpcode()) {
12005 case ISD::ANY_EXTEND:
12006 case ISD::SIGN_EXTEND:
12007 case ISD::ZERO_EXTEND: {
12008 auto OpVT = Operand.getOperand(0).getValueType();
12009 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12010 }
12011 case ISD::LOAD: {
12012 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12013 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12014 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12015 ExtType == ISD::EXTLOAD) {
12016 auto MemVT = L->getMemoryVT();
12017 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12018 }
12019 return L->getMemoryVT().getSizeInBits() == 16;
12020 }
12021 default:
12022 return false;
12023 }
12024}
12025
12026 // Returns true if the mask selects consecutive bytes and the first byte
12027 // begins at an even (16-bit aligned) offset from the 0th byte.
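// For example, 0x0504 (consecutive bytes 4 and 5 starting at an even byte)
// qualifies, while 0x0605 (odd start) and 0x0300 (not consecutive) do not.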
12028static bool addresses16Bits(int Mask) {
12029 int Low8 = Mask & 0xff;
12030 int Hi8 = (Mask & 0xff00) >> 8;
12031
12032 assert(Low8 < 8 && Hi8 < 8);
12033 // Are the bytes contiguous in the order of increasing addresses.
12034 bool IsConsecutive = (Hi8 - Low8 == 1);
12035 // Is the first byte at a location that is aligned for 16-bit instructions?
12036 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
12037 // In this case, we would still need code to extract the 16-bit operand, so
12038 // it is better to use the i8 v_perm.
12039 bool Is16Aligned = !(Low8 % 2);
12040
12041 return IsConsecutive && Is16Aligned;
12042}
12043
12044// Do not lower into v_perm if the operands are actually 16 bit
12045// and the selected bits (based on PermMask) correspond with two
12046// easily addressable 16 bit operands.
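// For example, with PermMask == 0x07060100 and two operands that are each
// extended from 16-bit values, Low16 == 0x0100 and Hi16 == 0x0706 both
// address whole 16-bit halves, so this returns false and the caller skips
// the v_perm in favor of the existing 16-bit operations.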
12047 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12048 SDValue &OtherOp) {
12049 int Low16 = PermMask & 0xffff;
12050 int Hi16 = (PermMask & 0xffff0000) >> 16;
12051
12052 auto TempOp = peekThroughBitcasts(Op);
12053 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12054
12055 auto OpIs16Bit =
12056 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12057 if (!OpIs16Bit)
12058 return true;
12059
12060 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12061 isExtendedFrom16Bits(TempOtherOp);
12062 if (!OtherOpIs16Bit)
12063 return true;
12064
12065 // Do we cleanly address both
12066 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12067}
12068
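// getDWordFromOffset returns the 32-bit chunk of \p Src at dword offset
// \p DWordOffset. For example, for a v8i16 source and DWordOffset == 1 it
// extracts elements 2 and 3 (the second dword) and bitcasts them to i32; for
// a scalar i64 source and DWordOffset == 1 it shifts right by 32 and truncates.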
12069 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12070 unsigned DWordOffset) {
12071 SDValue Ret;
12072
12073 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12074 // ByteProvider must be at least 8 bits
12075 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12076
12077 if (TypeSize <= 32)
12078 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12079
12080 if (Src.getValueType().isVector()) {
12081 auto ScalarTySize = Src.getScalarValueSizeInBits();
12082 auto ScalarTy = Src.getValueType().getScalarType();
12083 if (ScalarTySize == 32) {
12084 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12085 DAG.getConstant(DWordOffset, SL, MVT::i32));
12086 }
12087 if (ScalarTySize > 32) {
12088 Ret = DAG.getNode(
12089 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12090 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12091 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12092 if (ShiftVal)
12093 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12094 DAG.getConstant(ShiftVal, SL, MVT::i32));
12095 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12096 }
12097
12098 assert(ScalarTySize < 32);
12099 auto NumElements = TypeSize / ScalarTySize;
12100 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12101 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12102 auto NumElementsIn32 = 32 / ScalarTySize;
12103 auto NumAvailElements = DWordOffset < Trunc32Elements
12104 ? NumElementsIn32
12105 : NumElements - NormalizedTrunc;
12106
12107 SmallVector<SDValue, 4> VecSrcs;
12108 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12109 NumAvailElements);
12110
12111 Ret = DAG.getBuildVector(
12112 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12113 VecSrcs);
12114 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12115 }
12116
12117 /// Scalar Type
12118 auto ShiftVal = 32 * DWordOffset;
12119 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12120 DAG.getConstant(ShiftVal, SL, MVT::i32));
12121 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12122}
12123
12124 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12125 SelectionDAG &DAG = DCI.DAG;
12126 [[maybe_unused]] EVT VT = N->getValueType(0);
12127 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12128 
12129 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12130 assert(VT == MVT::i32);
12131 for (int i = 0; i < 4; i++) {
12132 // Find the ByteProvider that provides the ith byte of the result of OR
12133 std::optional<ByteProvider<SDValue>> P =
12134 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12135 // TODO support constantZero
12136 if (!P || P->isConstantZero())
12137 return SDValue();
12138
12139 PermNodes.push_back(*P);
12140 }
12141 if (PermNodes.size() != 4)
12142 return SDValue();
12143
12144 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12145 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12146 uint64_t PermMask = 0x00000000;
12147 for (size_t i = 0; i < PermNodes.size(); i++) {
12148 auto PermOp = PermNodes[i];
12149 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12150 // by sizeof(Src2) = 4
12151 int SrcByteAdjust = 4;
12152
12153 // If the Src uses a byte from a different DWORD, then it corresponds
12154 // with a different source.
12155 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12156 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12157 if (SecondSrc)
12158 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12159 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12160 return SDValue();
12161
12162 // Set the index of the second distinct Src node
12163 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12164 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12165 SrcByteAdjust = 0;
12166 }
12167 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12169 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12170 }
12171 SDLoc DL(N);
12172 SDValue Op = *PermNodes[FirstSrc.first].Src;
12173 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12174 assert(Op.getValueSizeInBits() == 32);
12175
12176 // Check that we are not just extracting the bytes in order from an op
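// For example, with a single source a PermMask of 0x07060504 selects Op's
// bytes 0-3 in their original order, so the or is just Op and we return its
// bitcast instead of emitting a v_perm.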
12177 if (!SecondSrc) {
12178 int Low16 = PermMask & 0xffff;
12179 int Hi16 = (PermMask & 0xffff0000) >> 16;
12180
12181 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12182 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12183
12184 // The perm op would really just produce Op. So combine into Op
12185 if (WellFormedLow && WellFormedHi)
12186 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12187 }
12188
12189 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12190
12191 if (SecondSrc) {
12192 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12193 assert(OtherOp.getValueSizeInBits() == 32);
12194 }
12195
12196 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12197
12198 assert(Op.getValueType().isByteSized() &&
12199 OtherOp.getValueType().isByteSized());
12200
12201 // If the ultimate src is less than 32 bits, then we will only be
12202 // using bytes 0 through (Op.getValueSizeInBits() / 8) - 1 of it in the or.
12203 // CalculateByteProvider would not have returned Op as source if we
12204 // used a byte that is outside its ValueType. Thus, we are free to
12205 // ANY_EXTEND as the extended bits are don't-cares.
12206 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12207 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12208
12209 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12210 DAG.getConstant(PermMask, DL, MVT::i32));
12211 }
12212 return SDValue();
12213}
12214
12215SDValue SITargetLowering::performOrCombine(SDNode *N,
12216 DAGCombinerInfo &DCI) const {
12217 SelectionDAG &DAG = DCI.DAG;
12218 SDValue LHS = N->getOperand(0);
12219 SDValue RHS = N->getOperand(1);
12220
12221 EVT VT = N->getValueType(0);
12222 if (VT == MVT::i1) {
12223 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12224 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12225 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12226 SDValue Src = LHS.getOperand(0);
12227 if (Src != RHS.getOperand(0))
12228 return SDValue();
12229
12230 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12231 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12232 if (!CLHS || !CRHS)
12233 return SDValue();
12234
12235 // Only 10 bits are used.
12236 static const uint32_t MaxMask = 0x3ff;
12237
12238 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12239 SDLoc DL(N);
12240 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12241 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12242 }
12243
12244 return SDValue();
12245 }
12246
12247 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12248 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12249 LHS.getOpcode() == AMDGPUISD::PERM &&
12250 isa<ConstantSDNode>(LHS.getOperand(2))) {
12251 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12252 if (!Sel)
12253 return SDValue();
12254
12255 Sel |= LHS.getConstantOperandVal(2);
12256 SDLoc DL(N);
12257 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12258 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12259 }
12260
12261 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12262 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12263 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12264 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12265
12266 // If all the uses of an or need to extract the individual elements, do not
12267 // attempt to lower into v_perm
12268 auto usesCombinedOperand = [](SDNode *OrUse) {
12269 // If we have any non-vectorized use, then it is a candidate for v_perm
12270 if (OrUse->getOpcode() != ISD::BITCAST ||
12271 !OrUse->getValueType(0).isVector())
12272 return true;
12273
12274 // If the bitcast has any non-vectorized use, then the or is a candidate for v_perm
12275 for (auto VUse : OrUse->uses()) {
12276 if (!VUse->getValueType(0).isVector())
12277 return true;
12278
12279 // If the use of a vector is a store, then combining via a v_perm
12280 // is beneficial.
12281 // TODO -- whitelist more uses
12282 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12283 if (VUse->getOpcode() == VectorwiseOp)
12284 return true;
12285 }
12286 return false;
12287 };
12288
12289 if (!any_of(N->uses(), usesCombinedOperand))
12290 return SDValue();
12291
12292 uint32_t LHSMask = getPermuteMask(LHS);
12293 uint32_t RHSMask = getPermuteMask(RHS);
12294
12295 if (LHSMask != ~0u && RHSMask != ~0u) {
12296 // Canonicalize the expression in an attempt to have fewer unique masks
12297 // and therefore fewer registers used to hold the masks.
12298 if (LHSMask > RHSMask) {
12299 std::swap(LHSMask, RHSMask);
12300 std::swap(LHS, RHS);
12301 }
12302
12303 // Select 0xc for each lane used from a source operand. A zero byte has 0xc
12304 // set in the mask, a 0xff byte has 0xff, and actual lanes are in the 0-3 range.
12305 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12306 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12307
12308 // Check if we need to combine values from two sources within a byte.
12309 if (!(LHSUsedLanes & RHSUsedLanes) &&
12310 // If we select the high word and the low word, keep it for SDWA.
12311 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12312 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12313 // Kill zero bytes selected by other mask. Zero value is 0xc.
12314 LHSMask &= ~RHSUsedLanes;
12315 RHSMask &= ~LHSUsedLanes;
12316 // Add 4 to each active LHS lane
12317 LHSMask |= LHSUsedLanes & 0x04040404;
12318 // Combine masks
12319 uint32_t Sel = LHSMask | RHSMask;
12320 SDLoc DL(N);
12321
12322 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12323 LHS.getOperand(0), RHS.getOperand(0),
12324 DAG.getConstant(Sel, DL, MVT::i32));
12325 }
12326 }
12327 if (LHSMask == ~0u || RHSMask == ~0u) {
12328 if (SDValue Perm = matchPERM(N, DCI))
12329 return Perm;
12330 }
12331 }
12332
12333 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12334 return SDValue();
12335
12336 // TODO: This could be a generic combine with a predicate for extracting the
12337 // high half of an integer being free.
12338
12339 // (or i64:x, (zero_extend i32:y)) ->
12340 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12341 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12342 RHS.getOpcode() != ISD::ZERO_EXTEND)
12343 std::swap(LHS, RHS);
12344
12345 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12346 SDValue ExtSrc = RHS.getOperand(0);
12347 EVT SrcVT = ExtSrc.getValueType();
12348 if (SrcVT == MVT::i32) {
12349 SDLoc SL(N);
12350 SDValue LowLHS, HiBits;
12351 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12352 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12353
12354 DCI.AddToWorklist(LowOr.getNode());
12355 DCI.AddToWorklist(HiBits.getNode());
12356
12357 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12358 LowOr, HiBits);
12359 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12360 }
12361 }
12362
12363 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12364 if (CRHS) {
12365 if (SDValue Split
12366 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12367 N->getOperand(0), CRHS))
12368 return Split;
12369 }
12370
12371 return SDValue();
12372}
12373
12374SDValue SITargetLowering::performXorCombine(SDNode *N,
12375 DAGCombinerInfo &DCI) const {
12376 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12377 return RV;
12378
12379 SDValue LHS = N->getOperand(0);
12380 SDValue RHS = N->getOperand(1);
12381
12382 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12383 SelectionDAG &DAG = DCI.DAG;
12384
12385 EVT VT = N->getValueType(0);
12386 if (CRHS && VT == MVT::i64) {
12387 if (SDValue Split
12388 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12389 return Split;
12390 }
12391
12392 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12393 // fneg-like xors into 64-bit select.
12394 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12395 // This looks like an fneg, try to fold as a source modifier.
12396 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12397 shouldFoldFNegIntoSrc(N, LHS)) {
12398 // xor (select c, a, b), 0x80000000 ->
12399 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12400 SDLoc DL(N);
12401 SDValue CastLHS =
12402 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12403 SDValue CastRHS =
12404 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12405 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12406 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12407 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12408 LHS->getOperand(0), FNegLHS, FNegRHS);
12409 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12410 }
12411 }
12412
12413 return SDValue();
12414}
12415
12416SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12417 DAGCombinerInfo &DCI) const {
12418 if (!Subtarget->has16BitInsts() ||
12419 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12420 return SDValue();
12421
12422 EVT VT = N->getValueType(0);
12423 if (VT != MVT::i32)
12424 return SDValue();
12425
12426 SDValue Src = N->getOperand(0);
12427 if (Src.getValueType() != MVT::i16)
12428 return SDValue();
12429
12430 return SDValue();
12431}
12432
12433SDValue
12434SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12435 DAGCombinerInfo &DCI) const {
12436 SDValue Src = N->getOperand(0);
12437 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12438
12439 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12440 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12441 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12442 VTSign->getVT() == MVT::i8) ||
12443 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12444 VTSign->getVT() == MVT::i16))) {
12445 assert(Subtarget->hasScalarSubwordLoads() &&
12446 "s_buffer_load_{u8, i8} are supported "
12447 "in GFX12 (or newer) architectures.");
12448 EVT VT = Src.getValueType();
12449 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12450 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12451 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12452 SDLoc DL(N);
12453 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12454 SDValue Ops[] = {
12455 Src.getOperand(0), // source register
12456 Src.getOperand(1), // offset
12457 Src.getOperand(2) // cachePolicy
12458 };
12459 auto *M = cast<MemSDNode>(Src);
12460 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12461 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12462 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12463 return LoadVal;
12464 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12465 VTSign->getVT() == MVT::i8) ||
12466 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12467 VTSign->getVT() == MVT::i16)) &&
12468 Src.hasOneUse()) {
12469 auto *M = cast<MemSDNode>(Src);
12470 SDValue Ops[] = {
12471 Src.getOperand(0), // Chain
12472 Src.getOperand(1), // rsrc
12473 Src.getOperand(2), // vindex
12474 Src.getOperand(3), // voffset
12475 Src.getOperand(4), // soffset
12476 Src.getOperand(5), // offset
12477 Src.getOperand(6),
12478 Src.getOperand(7)
12479 };
12480 // replace with BUFFER_LOAD_BYTE/SHORT
12481 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12482 Src.getOperand(0).getValueType());
12483 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12484 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12485 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12486 ResList,
12487 Ops, M->getMemoryVT(),
12488 M->getMemOperand());
12489 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12490 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12491 }
12492 return SDValue();
12493}
12494
12495SDValue SITargetLowering::performClassCombine(SDNode *N,
12496 DAGCombinerInfo &DCI) const {
12497 SelectionDAG &DAG = DCI.DAG;
12498 SDValue Mask = N->getOperand(1);
12499
12500 // fp_class x, 0 -> false
12501 if (isNullConstant(Mask))
12502 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12503
12504 if (N->getOperand(0).isUndef())
12505 return DAG.getUNDEF(MVT::i1);
12506
12507 return SDValue();
12508}
12509
12510SDValue SITargetLowering::performRcpCombine(SDNode *N,
12511 DAGCombinerInfo &DCI) const {
12512 EVT VT = N->getValueType(0);
12513 SDValue N0 = N->getOperand(0);
12514
12515 if (N0.isUndef()) {
12516 return DCI.DAG.getConstantFP(
12517 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12518 VT);
12519 }
12520
12521 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12522 N0.getOpcode() == ISD::SINT_TO_FP)) {
12523 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12524 N->getFlags());
12525 }
12526
12527 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12528 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12529 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12530 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12531 N0.getOperand(0), N->getFlags());
12532 }
12533
12533 
12534 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12535}
12536
12537 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12538 unsigned MaxDepth) const {
12539 unsigned Opcode = Op.getOpcode();
12540 if (Opcode == ISD::FCANONICALIZE)
12541 return true;
12542
12543 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12544 const auto &F = CFP->getValueAPF();
12545 if (F.isNaN() && F.isSignaling())
12546 return false;
12547 if (!F.isDenormal())
12548 return true;
12549
12550 DenormalMode Mode =
12551 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12552 return Mode == DenormalMode::getIEEE();
12553 }
12554
12555 // If source is a result of another standard FP operation it is already in
12556 // canonical form.
12557 if (MaxDepth == 0)
12558 return false;
12559
12560 switch (Opcode) {
12561 // These will flush denorms if required.
12562 case ISD::FADD:
12563 case ISD::FSUB:
12564 case ISD::FMUL:
12565 case ISD::FCEIL:
12566 case ISD::FFLOOR:
12567 case ISD::FMA:
12568 case ISD::FMAD:
12569 case ISD::FSQRT:
12570 case ISD::FDIV:
12571 case ISD::FREM:
12572 case ISD::FP_ROUND:
12573 case ISD::FP_EXTEND:
12574 case ISD::FP16_TO_FP:
12575 case ISD::FP_TO_FP16:
12576 case ISD::BF16_TO_FP:
12577 case ISD::FP_TO_BF16:
12578 case ISD::FLDEXP:
12581 case AMDGPUISD::RCP:
12582 case AMDGPUISD::RSQ:
12586 case AMDGPUISD::LOG:
12587 case AMDGPUISD::EXP:
12591 case AMDGPUISD::FRACT:
12598 case AMDGPUISD::SIN_HW:
12599 case AMDGPUISD::COS_HW:
12600 return true;
12601
12602 // It can/will be lowered or combined as a bit operation.
12603 // Need to check their input recursively to handle.
12604 case ISD::FNEG:
12605 case ISD::FABS:
12606 case ISD::FCOPYSIGN:
12607 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12608
12609 case ISD::AND:
12610 if (Op.getValueType() == MVT::i32) {
12611 // Be careful as we only know it is a bitcast floating point type. It
12612 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12613 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12614 // is valid to optimize for all types.
12615 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12616 if (RHS->getZExtValue() == 0xffff0000) {
12617 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12618 }
12619 }
12620 }
12621 break;
12622
12623 case ISD::FSIN:
12624 case ISD::FCOS:
12625 case ISD::FSINCOS:
12626 return Op.getValueType().getScalarType() != MVT::f16;
12627
12628 case ISD::FMINNUM:
12629 case ISD::FMAXNUM:
12630 case ISD::FMINNUM_IEEE:
12631 case ISD::FMAXNUM_IEEE:
12632 case ISD::FMINIMUM:
12633 case ISD::FMAXIMUM:
12634 case AMDGPUISD::CLAMP:
12635 case AMDGPUISD::FMED3:
12636 case AMDGPUISD::FMAX3:
12637 case AMDGPUISD::FMIN3:
12638 case AMDGPUISD::FMAXIMUM3:
12639 case AMDGPUISD::FMINIMUM3: {
12640 // FIXME: Shouldn't treat the generic operations differently based on these.
12641 // However, we aren't really required to flush the result from
12642 // minnum/maxnum..
12643
12644 // snans will be quieted, so we only need to worry about denormals.
12645 if (Subtarget->supportsMinMaxDenormModes() ||
12646 // FIXME: denormalsEnabledForType is broken for dynamic
12647 denormalsEnabledForType(DAG, Op.getValueType()))
12648 return true;
12649
12650 // Flushing may be required.
12651 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12652 // targets need to check their input recursively.
12653
12654 // FIXME: Does this apply with clamp? It's implemented with max.
12655 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12656 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12657 return false;
12658 }
12659
12660 return true;
12661 }
12662 case ISD::SELECT: {
12663 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12664 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12665 }
12666 case ISD::BUILD_VECTOR: {
12667 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12668 SDValue SrcOp = Op.getOperand(i);
12669 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12670 return false;
12671 }
12672
12673 return true;
12674 }
12677 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12678 }
12680 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12681 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12682 }
12683 case ISD::UNDEF:
12684 // Could be anything.
12685 return false;
12686
12687 case ISD::BITCAST:
12688 // TODO: This is incorrect as it loses track of the operand's type. We may
12689 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12690 // same bits that are canonicalized in one type need not be in the other.
12691 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12692 case ISD::TRUNCATE: {
12693 // Hack around the mess we make when legalizing extract_vector_elt.
12694 if (Op.getValueType() == MVT::i16) {
12695 SDValue TruncSrc = Op.getOperand(0);
12696 if (TruncSrc.getValueType() == MVT::i32 &&
12697 TruncSrc.getOpcode() == ISD::BITCAST &&
12698 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12699 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12700 }
12701 }
12702 return false;
12703 }
12704 case ISD::INTRINSIC_WO_CHAIN: {
12705 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12706 // TODO: Handle more intrinsics
12707 switch (IntrinsicID) {
12708 case Intrinsic::amdgcn_cvt_pkrtz:
12709 case Intrinsic::amdgcn_cubeid:
12710 case Intrinsic::amdgcn_frexp_mant:
12711 case Intrinsic::amdgcn_fdot2:
12712 case Intrinsic::amdgcn_rcp:
12713 case Intrinsic::amdgcn_rsq:
12714 case Intrinsic::amdgcn_rsq_clamp:
12715 case Intrinsic::amdgcn_rcp_legacy:
12716 case Intrinsic::amdgcn_rsq_legacy:
12717 case Intrinsic::amdgcn_trig_preop:
12718 case Intrinsic::amdgcn_log:
12719 case Intrinsic::amdgcn_exp2:
12720 case Intrinsic::amdgcn_sqrt:
12721 return true;
12722 default:
12723 break;
12724 }
12725
12726 break;
12727 }
12728 default:
12729 break;
12730 }
12731
12732 // FIXME: denormalsEnabledForType is broken for dynamic
12733 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12734 DAG.isKnownNeverSNaN(Op);
12735}
12736
12737 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12738 unsigned MaxDepth) const {
12739 const MachineRegisterInfo &MRI = MF.getRegInfo();
12740 MachineInstr *MI = MRI.getVRegDef(Reg);
12741 unsigned Opcode = MI->getOpcode();
12742
12743 if (Opcode == AMDGPU::G_FCANONICALIZE)
12744 return true;
12745
12746 std::optional<FPValueAndVReg> FCR;
12747 // Constant splat (can be padded with undef) or scalar constant.
12748 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12749 if (FCR->Value.isSignaling())
12750 return false;
12751 if (!FCR->Value.isDenormal())
12752 return true;
12753
12754 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12755 return Mode == DenormalMode::getIEEE();
12756 }
12757
12758 if (MaxDepth == 0)
12759 return false;
12760
12761 switch (Opcode) {
12762 case AMDGPU::G_FADD:
12763 case AMDGPU::G_FSUB:
12764 case AMDGPU::G_FMUL:
12765 case AMDGPU::G_FCEIL:
12766 case AMDGPU::G_FFLOOR:
12767 case AMDGPU::G_FRINT:
12768 case AMDGPU::G_FNEARBYINT:
12769 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12770 case AMDGPU::G_INTRINSIC_TRUNC:
12771 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12772 case AMDGPU::G_FMA:
12773 case AMDGPU::G_FMAD:
12774 case AMDGPU::G_FSQRT:
12775 case AMDGPU::G_FDIV:
12776 case AMDGPU::G_FREM:
12777 case AMDGPU::G_FPOW:
12778 case AMDGPU::G_FPEXT:
12779 case AMDGPU::G_FLOG:
12780 case AMDGPU::G_FLOG2:
12781 case AMDGPU::G_FLOG10:
12782 case AMDGPU::G_FPTRUNC:
12783 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12784 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12785 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12786 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12787 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12788 return true;
12789 case AMDGPU::G_FNEG:
12790 case AMDGPU::G_FABS:
12791 case AMDGPU::G_FCOPYSIGN:
12792 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12793 case AMDGPU::G_FMINNUM:
12794 case AMDGPU::G_FMAXNUM:
12795 case AMDGPU::G_FMINNUM_IEEE:
12796 case AMDGPU::G_FMAXNUM_IEEE:
12797 case AMDGPU::G_FMINIMUM:
12798 case AMDGPU::G_FMAXIMUM: {
12799 if (Subtarget->supportsMinMaxDenormModes() ||
12800 // FIXME: denormalsEnabledForType is broken for dynamic
12801 denormalsEnabledForType(MRI.getType(Reg), MF))
12802 return true;
12803
12804 [[fallthrough]];
12805 }
12806 case AMDGPU::G_BUILD_VECTOR:
12807 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12808 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12809 return false;
12810 return true;
12811 case AMDGPU::G_INTRINSIC:
12812 case AMDGPU::G_INTRINSIC_CONVERGENT:
12813 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12814 case Intrinsic::amdgcn_fmul_legacy:
12815 case Intrinsic::amdgcn_fmad_ftz:
12816 case Intrinsic::amdgcn_sqrt:
12817 case Intrinsic::amdgcn_fmed3:
12818 case Intrinsic::amdgcn_sin:
12819 case Intrinsic::amdgcn_cos:
12820 case Intrinsic::amdgcn_log:
12821 case Intrinsic::amdgcn_exp2:
12822 case Intrinsic::amdgcn_log_clamp:
12823 case Intrinsic::amdgcn_rcp:
12824 case Intrinsic::amdgcn_rcp_legacy:
12825 case Intrinsic::amdgcn_rsq:
12826 case Intrinsic::amdgcn_rsq_clamp:
12827 case Intrinsic::amdgcn_rsq_legacy:
12828 case Intrinsic::amdgcn_div_scale:
12829 case Intrinsic::amdgcn_div_fmas:
12830 case Intrinsic::amdgcn_div_fixup:
12831 case Intrinsic::amdgcn_fract:
12832 case Intrinsic::amdgcn_cvt_pkrtz:
12833 case Intrinsic::amdgcn_cubeid:
12834 case Intrinsic::amdgcn_cubema:
12835 case Intrinsic::amdgcn_cubesc:
12836 case Intrinsic::amdgcn_cubetc:
12837 case Intrinsic::amdgcn_frexp_mant:
12838 case Intrinsic::amdgcn_fdot2:
12839 case Intrinsic::amdgcn_trig_preop:
12840 return true;
12841 default:
12842 break;
12843 }
12844
12845 [[fallthrough]];
12846 default:
12847 return false;
12848 }
12849
12850 llvm_unreachable("invalid operation");
12851}
12852
12853// Constant fold canonicalize.
12854SDValue SITargetLowering::getCanonicalConstantFP(
12855 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12856 // Flush denormals to 0 if not enabled.
12857 if (C.isDenormal()) {
12858 DenormalMode Mode =
12859 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12860 if (Mode == DenormalMode::getPreserveSign()) {
12861 return DAG.getConstantFP(
12862 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12863 }
12864
12865 if (Mode != DenormalMode::getIEEE())
12866 return SDValue();
12867 }
12868
12869 if (C.isNaN()) {
12870 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12871 if (C.isSignaling()) {
12872 // Quiet a signaling NaN.
12873 // FIXME: Is this supposed to preserve payload bits?
12874 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12875 }
12876
12877 // Make sure it is the canonical NaN bitpattern.
12878 //
12879 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12880 // immediate?
12881 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12882 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12883 }
12884
12885 // Already canonical.
12886 return DAG.getConstantFP(C, SL, VT);
12887}
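// For example, with the PreserveSign denormal mode the smallest positive f32
// denormal constant is folded to +0.0 here, and a signaling NaN constant is
// replaced with the canonical quiet NaN of its semantics.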
12888
12890 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12891}
12892
12893SDValue SITargetLowering::performFCanonicalizeCombine(
12894 SDNode *N,
12895 DAGCombinerInfo &DCI) const {
12896 SelectionDAG &DAG = DCI.DAG;
12897 SDValue N0 = N->getOperand(0);
12898 EVT VT = N->getValueType(0);
12899
12900 // fcanonicalize undef -> qnan
12901 if (N0.isUndef()) {
12902 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12903 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12904 }
12905
12906 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12907 EVT VT = N->getValueType(0);
12908 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12909 }
12910
12911 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12912 // (fcanonicalize k)
12913 //
12914 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12915
12916 // TODO: This could be better with wider vectors that will be split to v2f16,
12917 // and to consider uses since there aren't that many packed operations.
12918 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12919 isTypeLegal(MVT::v2f16)) {
12920 SDLoc SL(N);
12921 SDValue NewElts[2];
12922 SDValue Lo = N0.getOperand(0);
12923 SDValue Hi = N0.getOperand(1);
12924 EVT EltVT = Lo.getValueType();
12925
12927 for (unsigned I = 0; I != 2; ++I) {
12928 SDValue Op = N0.getOperand(I);
12929 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12930 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12931 CFP->getValueAPF());
12932 } else if (Op.isUndef()) {
12933 // Handled below based on what the other operand is.
12934 NewElts[I] = Op;
12935 } else {
12936 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12937 }
12938 }
12939
12940 // If one half is undef, and one is constant, prefer a splat vector rather
12941 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12942 // cheaper to use and may be free with a packed operation.
12943 if (NewElts[0].isUndef()) {
12944 if (isa<ConstantFPSDNode>(NewElts[1]))
12945 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
12946 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
12947 }
12948
12949 if (NewElts[1].isUndef()) {
12950 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
12951 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
12952 }
12953
12954 return DAG.getBuildVector(VT, SL, NewElts);
12955 }
12956 }
12957
12958 return SDValue();
12959}
12960
12961static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
12962 switch (Opc) {
12963 case ISD::FMAXNUM:
12964 case ISD::FMAXNUM_IEEE:
12965 return AMDGPUISD::FMAX3;
12966 case ISD::FMAXIMUM:
12967 return AMDGPUISD::FMAXIMUM3;
12968 case ISD::SMAX:
12969 return AMDGPUISD::SMAX3;
12970 case ISD::UMAX:
12971 return AMDGPUISD::UMAX3;
12972 case ISD::FMINNUM:
12973 case ISD::FMINNUM_IEEE:
12974 return AMDGPUISD::FMIN3;
12975 case ISD::FMINIMUM:
12976 return AMDGPUISD::FMINIMUM3;
12977 case ISD::SMIN:
12978 return AMDGPUISD::SMIN3;
12979 case ISD::UMIN:
12980 return AMDGPUISD::UMIN3;
12981 default:
12982 llvm_unreachable("Not a min/max opcode");
12983 }
12984}
12985
12986SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
12987 const SDLoc &SL, SDValue Src,
12988 SDValue MinVal,
12989 SDValue MaxVal,
12990 bool Signed) const {
12991
12992 // med3 comes from
12993 // min(max(x, K0), K1), K0 < K1
12994 // max(min(x, K0), K1), K1 < K0
12995 //
12996 // "MinVal" and "MaxVal" respectively refer to the rhs of the
12997 // min/max op.
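// For example, smin(smax(x, -4), 17) reaches this helper as Src = x,
// MaxVal = -4 and MinVal = 17, and is turned into smed3(x, -4, 17) below.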
12998 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
12999 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13000
13001 if (!MinK || !MaxK)
13002 return SDValue();
13003
13004 if (Signed) {
13005 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13006 return SDValue();
13007 } else {
13008 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13009 return SDValue();
13010 }
13011
13012 EVT VT = MinK->getValueType(0);
13013 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13014 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13015 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13016
13017 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13018 // not available, but this is unlikely to be profitable as constants
13019 // will often need to be materialized & extended, especially on
13020 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13021 return SDValue();
13022}
13023
13024 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13025 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13026 return C;
13027
13028 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13029 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13030 return C;
13031 }
13032
13033 return nullptr;
13034}
13035
13036SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13037 const SDLoc &SL,
13038 SDValue Op0,
13039 SDValue Op1) const {
13040 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13041 if (!K1)
13042 return SDValue();
13043
13044 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13045 if (!K0)
13046 return SDValue();
13047
13048 // Ordered >= (although NaN inputs should have folded away by now).
13049 if (K0->getValueAPF() > K1->getValueAPF())
13050 return SDValue();
13051
13052 const MachineFunction &MF = DAG.getMachineFunction();
13053 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13054 
13055 // TODO: Check IEEE bit enabled?
13056 EVT VT = Op0.getValueType();
13057 if (Info->getMode().DX10Clamp) {
13058 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13059 // hardware fmed3 behavior converting to a min.
13060 // FIXME: Should this be allowing -0.0?
13061 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13062 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13063 }
13064
13065 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13066 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13067 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13068 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13069 // then give the other result, which is different from med3 with a NaN
13070 // input.
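// For example, fminnum_ieee(fmaxnum_ieee(x, 2.0), 4.0) with x known not to be
// a signaling NaN becomes fmed3(x, 2.0, 4.0) here (2.0 and 4.0 are inline
// constants, so the one-use restriction does not apply to them).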
13071 SDValue Var = Op0.getOperand(0);
13072 if (!DAG.isKnownNeverSNaN(Var))
13073 return SDValue();
13074
13075 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13076 
13077 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13078 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13079 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13080 Var, SDValue(K0, 0), SDValue(K1, 0));
13081 }
13082 }
13083
13084 return SDValue();
13085}
13086
13087SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13088 DAGCombinerInfo &DCI) const {
13089 SelectionDAG &DAG = DCI.DAG;
13090
13091 EVT VT = N->getValueType(0);
13092 unsigned Opc = N->getOpcode();
13093 SDValue Op0 = N->getOperand(0);
13094 SDValue Op1 = N->getOperand(1);
13095
13096 // Only do this if the inner op has one use since this will just increase
13097 // register pressure for no benefit.
13098
13099 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
13100 !VT.isVector() &&
13101 (VT == MVT::i32 || VT == MVT::f32 ||
13102 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
13103 // max(max(a, b), c) -> max3(a, b, c)
13104 // min(min(a, b), c) -> min3(a, b, c)
13105 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13106 SDLoc DL(N);
13107 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13108 DL,
13109 N->getValueType(0),
13110 Op0.getOperand(0),
13111 Op0.getOperand(1),
13112 Op1);
13113 }
13114
13115 // Try commuted.
13116 // max(a, max(b, c)) -> max3(a, b, c)
13117 // min(a, min(b, c)) -> min3(a, b, c)
13118 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13119 SDLoc DL(N);
13120 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13121 DL,
13122 N->getValueType(0),
13123 Op0,
13124 Op1.getOperand(0),
13125 Op1.getOperand(1));
13126 }
13127 }
13128
13129 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13130 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13131 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13132 if (SDValue Med3 = performIntMed3ImmCombine(
13133 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13134 return Med3;
13135 }
13136 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13137 if (SDValue Med3 = performIntMed3ImmCombine(
13138 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13139 return Med3;
13140 }
13141
13142 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13143 if (SDValue Med3 = performIntMed3ImmCombine(
13144 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13145 return Med3;
13146 }
13147 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13148 if (SDValue Med3 = performIntMed3ImmCombine(
13149 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13150 return Med3;
13151 }
13152
13153 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13154 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13155 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13156 (Opc == AMDGPUISD::FMIN_LEGACY &&
13157 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13158 (VT == MVT::f32 || VT == MVT::f64 ||
13159 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13160 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13161 Op0.hasOneUse()) {
13162 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13163 return Res;
13164 }
13165
13166 return SDValue();
13167}
13168
13170 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13171 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13172 // FIXME: Should this be allowing -0.0?
13173 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13174 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13175 }
13176 }
13177
13178 return false;
13179}
13180
13181// FIXME: Should only worry about snans for version with chain.
13182SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13183 DAGCombinerInfo &DCI) const {
13184 EVT VT = N->getValueType(0);
13185 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13186 // NaNs. With a NaN input, the order of the operands may change the result.
13187
13188 SelectionDAG &DAG = DCI.DAG;
13189 SDLoc SL(N);
13190
13191 SDValue Src0 = N->getOperand(0);
13192 SDValue Src1 = N->getOperand(1);
13193 SDValue Src2 = N->getOperand(2);
13194
13195 if (isClampZeroToOne(Src0, Src1)) {
13196 // const_a, const_b, x -> clamp is safe in all cases including signaling
13197 // nans.
13198 // FIXME: Should this be allowing -0.0?
13199 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13200 }
13201
13202 const MachineFunction &MF = DAG.getMachineFunction();
13203 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13204 
13205 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13206 // handling no dx10-clamp?
13207 if (Info->getMode().DX10Clamp) {
13208 // If NaN is clamped to 0, we are free to reorder the inputs.
13209
13210 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13211 std::swap(Src0, Src1);
13212
13213 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13214 std::swap(Src1, Src2);
13215
13216 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13217 std::swap(Src0, Src1);
13218
13219 if (isClampZeroToOne(Src1, Src2))
13220 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13221 }
13222
13223 return SDValue();
13224}
13225
13226SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13227 DAGCombinerInfo &DCI) const {
13228 SDValue Src0 = N->getOperand(0);
13229 SDValue Src1 = N->getOperand(1);
13230 if (Src0.isUndef() && Src1.isUndef())
13231 return DCI.DAG.getUNDEF(N->getValueType(0));
13232 return SDValue();
13233}
13234
13235// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13236// expanded into a set of cmp/select instructions.
13237 static bool shouldExpandVectorDynExt(unsigned EltSize,
13238 unsigned NumElem,
13239 bool IsDivergentIdx,
13240 const GCNSubtarget *Subtarget) {
13241 if (UseDivergentRegisterIndexing)
13242 return false;
13243
13244 unsigned VecSize = EltSize * NumElem;
13245
13246 // Sub-dword vectors of two dwords or less have a better implementation.
13247 if (VecSize <= 64 && EltSize < 32)
13248 return false;
13249
13250 // Always expand the rest of sub-dword instructions, otherwise it will be
13251 // lowered via memory.
13252 if (EltSize < 32)
13253 return true;
13254
13255 // Always do this if var-idx is divergent, otherwise it will become a loop.
13256 if (IsDivergentIdx)
13257 return true;
13258
13259 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13260 unsigned NumInsts = NumElem /* Number of compares */ +
13261 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13262
13263 // On some architectures (GFX9) movrel is not available and it's better
13264 // to expand.
13265 if (!Subtarget->hasMovrel())
13266 return NumInsts <= 16;
13267
13268 // If movrel is available, use it instead of expanding for vector of 8
13269 // elements.
13270 return NumInsts <= 15;
13271}
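// For example, with a uniform index into 8 x 32-bit elements the estimate is
// 8 compares + 8 v_cndmask_b32 = 16 instructions, so targets with movrel keep
// indirect addressing (16 > 15) while targets without it expand (16 <= 16).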
13272
13273 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13274 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13275 if (isa<ConstantSDNode>(Idx))
13276 return false;
13277
13278 SDValue Vec = N->getOperand(0);
13279 EVT VecVT = Vec.getValueType();
13280 EVT EltVT = VecVT.getVectorElementType();
13281 unsigned EltSize = EltVT.getSizeInBits();
13282 unsigned NumElem = VecVT.getVectorNumElements();
13283
13284 return SITargetLowering::shouldExpandVectorDynExt(
13285 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13286}
13287
13288SDValue SITargetLowering::performExtractVectorEltCombine(
13289 SDNode *N, DAGCombinerInfo &DCI) const {
13290 SDValue Vec = N->getOperand(0);
13291 SelectionDAG &DAG = DCI.DAG;
13292
13293 EVT VecVT = Vec.getValueType();
13294 EVT VecEltVT = VecVT.getVectorElementType();
13295 EVT ResVT = N->getValueType(0);
13296
13297 unsigned VecSize = VecVT.getSizeInBits();
13298 unsigned VecEltSize = VecEltVT.getSizeInBits();
13299
13300 if ((Vec.getOpcode() == ISD::FNEG ||
13301 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13302 SDLoc SL(N);
13303 SDValue Idx = N->getOperand(1);
13304 SDValue Elt =
13305 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13306 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13307 }
13308
13309 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13310 // =>
13311 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13312 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13313 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13314 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13315 SDLoc SL(N);
13316 SDValue Idx = N->getOperand(1);
13317 unsigned Opc = Vec.getOpcode();
13318
13319 switch(Opc) {
13320 default:
13321 break;
13322 // TODO: Support other binary operations.
13323 case ISD::FADD:
13324 case ISD::FSUB:
13325 case ISD::FMUL:
13326 case ISD::ADD:
13327 case ISD::UMIN:
13328 case ISD::UMAX:
13329 case ISD::SMIN:
13330 case ISD::SMAX:
13331 case ISD::FMAXNUM:
13332 case ISD::FMINNUM:
13333 case ISD::FMAXNUM_IEEE:
13334 case ISD::FMINNUM_IEEE:
13335 case ISD::FMAXIMUM:
13336 case ISD::FMINIMUM: {
13337 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13338 Vec.getOperand(0), Idx);
13339 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13340 Vec.getOperand(1), Idx);
13341
13342 DCI.AddToWorklist(Elt0.getNode());
13343 DCI.AddToWorklist(Elt1.getNode());
13344 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13345 }
13346 }
13347 }
13348
13349 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13350 if (shouldExpandVectorDynExt(N)) {
13351 SDLoc SL(N);
13352 SDValue Idx = N->getOperand(1);
13353 SDValue V;
13354 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13355 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13356 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13357 if (I == 0)
13358 V = Elt;
13359 else
13360 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13361 }
13362 return V;
13363 }
13364
13365 if (!DCI.isBeforeLegalize())
13366 return SDValue();
13367
13368 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13369 // elements. This exposes more load reduction opportunities by replacing
13370 // multiple small extract_vector_elements with a single 32-bit extract.
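  // For example, (i8 (extract_vector_elt (v8i8 (load ...)), 5)) becomes
  //   (trunc (srl (i32 (extract_vector_elt (v2i32 (bitcast load)), 1)), 8)),
  // so only one 32-bit lane of the wide value is actually consumed.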
13371 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13372 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13373 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13374 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13375
13376 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13377 unsigned EltIdx = BitIndex / 32;
13378 unsigned LeftoverBitIdx = BitIndex % 32;
13379 SDLoc SL(N);
13380
13381 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13382 DCI.AddToWorklist(Cast.getNode());
13383
13384 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13385 DAG.getConstant(EltIdx, SL, MVT::i32));
13386 DCI.AddToWorklist(Elt.getNode());
13387 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13388 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13389 DCI.AddToWorklist(Srl.getNode());
13390
13391 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13392 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13393 DCI.AddToWorklist(Trunc.getNode());
13394
13395 if (VecEltVT == ResVT) {
13396 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13397 }
13398
13399 assert(ResVT.isScalarInteger());
13400 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13401 }
13402
13403 return SDValue();
13404}
13405
13406SDValue
13407SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13408 DAGCombinerInfo &DCI) const {
13409 SDValue Vec = N->getOperand(0);
13410 SDValue Idx = N->getOperand(2);
13411 EVT VecVT = Vec.getValueType();
13412 EVT EltVT = VecVT.getVectorElementType();
13413
13414 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13415 // => BUILD_VECTOR n x select (e, const-idx)
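  // e.g. (insert_vector_elt <4 x i32> %vec, %val, %idx) becomes a BUILD_VECTOR
  // whose I-th operand is (%idx == I) ? %val : (extract_vector_elt %vec, I).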
13416 if (!shouldExpandVectorDynExt(N))
13417 return SDValue();
13418
13419 SelectionDAG &DAG = DCI.DAG;
13420 SDLoc SL(N);
13421 SDValue Ins = N->getOperand(1);
13422 EVT IdxVT = Idx.getValueType();
13423
13424 SmallVector<SDValue, 16> Ops;
13425 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13426 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13427 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13428 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13429 Ops.push_back(V);
13430 }
13431
13432 return DAG.getBuildVector(VecVT, SL, Ops);
13433}
13434
13435/// Return the source of an fp_extend from f16 to f32, or a converted FP
13436/// constant.
13437static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13438 if (Src.getOpcode() == ISD::FP_EXTEND &&
13439 Src.getOperand(0).getValueType() == MVT::f16) {
13440 return Src.getOperand(0);
13441 }
13442
13443 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13444 APFloat Val = CFP->getValueAPF();
13445 bool LosesInfo = true;
13446 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13447 if (!LosesInfo)
13448 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13449 }
13450
13451 return SDValue();
13452}
13453
13454SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13455 DAGCombinerInfo &DCI) const {
13456 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13457 "combine only useful on gfx8");
13458
13459 SDValue TruncSrc = N->getOperand(0);
13460 EVT VT = N->getValueType(0);
13461 if (VT != MVT::f16)
13462 return SDValue();
13463
13464 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13465 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13466 return SDValue();
13467
13468 SelectionDAG &DAG = DCI.DAG;
13469 SDLoc SL(N);
13470
13471 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13472 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13473 // casting back.
13474
13475 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13476 // fmin(fmax(a, b), fmax(fmin(a, b), c))
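  // For example, with a = 1.0, b = 3.0, c = 2.0:
  //   fmin(fmax(1, 3), fmax(fmin(1, 3), 2)) = fmin(3, 2) = 2, the median.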
13477 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13478 if (!A)
13479 return SDValue();
13480
13481 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13482 if (!B)
13483 return SDValue();
13484
13485 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13486 if (!C)
13487 return SDValue();
13488
13489 // This changes signaling nan behavior. If an input is a signaling nan, it
13490 // would have been quieted by the fpext originally. We don't care because
13491 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13492 // we would be worse off than just doing the promotion.
13493 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13494 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13495 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13496 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13497}
13498
13499unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13500 const SDNode *N0,
13501 const SDNode *N1) const {
13502 EVT VT = N0->getValueType(0);
13503
13504 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13505 // support denormals ever.
13506 if (((VT == MVT::f32 &&
13507 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13508 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13509 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13510 isOperationLegal(ISD::FMAD, VT))
13511 return ISD::FMAD;
13512
13513 const TargetOptions &Options = DAG.getTarget().Options;
13514 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13515 (N0->getFlags().hasAllowContract() &&
13516 N1->getFlags().hasAllowContract())) &&
13517 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13518 return ISD::FMA;
13519 }
13520
13521 return 0;
13522}
13523
13524// For a reassociatable opcode perform:
13525// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
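// Grouping the two uniform operands first allows that inner op to be selected
// to a scalar (SALU) instruction, so only the op consuming the divergent value
// needs to run on the VALU.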
13526SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13527 SelectionDAG &DAG) const {
13528 EVT VT = N->getValueType(0);
13529 if (VT != MVT::i32 && VT != MVT::i64)
13530 return SDValue();
13531
13532 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13533 return SDValue();
13534
13535 unsigned Opc = N->getOpcode();
13536 SDValue Op0 = N->getOperand(0);
13537 SDValue Op1 = N->getOperand(1);
13538
13539 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13540 return SDValue();
13541
13542 if (Op0->isDivergent())
13543 std::swap(Op0, Op1);
13544
13545 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13546 return SDValue();
13547
13548 SDValue Op2 = Op1.getOperand(1);
13549 Op1 = Op1.getOperand(0);
13550 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13551 return SDValue();
13552
13553 if (Op1->isDivergent())
13554 std::swap(Op1, Op2);
13555
13556 SDLoc SL(N);
13557 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13558 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13559}
13560
13561static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13562 EVT VT,
13563 SDValue N0, SDValue N1, SDValue N2,
13564 bool Signed) {
13565 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13566 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13567 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13568 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13569}
13570
13571// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13572// multiplies, if any.
13573//
13574// Full 64-bit multiplies that feed into an addition are lowered here instead
13575// of using the generic expansion. The generic expansion ends up with
13576// a tree of ADD nodes that prevents us from using the "add" part of the
13577// MAD instruction. The expansion produced here results in a chain of ADDs
13578// instead of a tree.
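// For example, (add (mul i64 %a, %b), %c) becomes a single mad_u64_u32 when
// both factors are provably 32-bit, and otherwise a mad_64_32 of the low
// halves plus one or two 32-bit mul/add pairs that fix up the high half.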
13579SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13580 DAGCombinerInfo &DCI) const {
13581 assert(N->getOpcode() == ISD::ADD);
13582
13583 SelectionDAG &DAG = DCI.DAG;
13584 EVT VT = N->getValueType(0);
13585 SDLoc SL(N);
13586 SDValue LHS = N->getOperand(0);
13587 SDValue RHS = N->getOperand(1);
13588
13589 if (VT.isVector())
13590 return SDValue();
13591
13592 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13593 // result in scalar registers for uniform values.
13594 if (!N->isDivergent() && Subtarget->hasSMulHi())
13595 return SDValue();
13596
13597 unsigned NumBits = VT.getScalarSizeInBits();
13598 if (NumBits <= 32 || NumBits > 64)
13599 return SDValue();
13600
13601 if (LHS.getOpcode() != ISD::MUL) {
13602 assert(RHS.getOpcode() == ISD::MUL);
13603 std::swap(LHS, RHS);
13604 }
13605
13606 // Avoid the fold if it would unduly increase the number of multiplies due to
13607 // multiple uses, except on hardware with full-rate multiply-add (which is
13608 // part of full-rate 64-bit ops).
13609 if (!Subtarget->hasFullRate64Ops()) {
13610 unsigned NumUsers = 0;
13611 for (SDNode *Use : LHS->uses()) {
13612 // There is a use that does not feed into addition, so the multiply can't
13613 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13614 if (Use->getOpcode() != ISD::ADD)
13615 return SDValue();
13616
13617 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13618 // MUL + 3xADD + 3xADDC over 3xMAD.
13619 ++NumUsers;
13620 if (NumUsers >= 3)
13621 return SDValue();
13622 }
13623 }
13624
13625 SDValue MulLHS = LHS.getOperand(0);
13626 SDValue MulRHS = LHS.getOperand(1);
13627 SDValue AddRHS = RHS;
13628
13629 // Always check whether operands are small unsigned values, since that
13630 // knowledge is useful in more cases. Check for small signed values only if
13631 // doing so can unlock a shorter code sequence.
13632 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13633 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13634
13635 bool MulSignedLo = false;
13636 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13637 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13638 numBitsSigned(MulRHS, DAG) <= 32;
13639 }
13640
13641 // The operands and final result all have the same number of bits. If
13642 // operands need to be extended, they can be extended with garbage. The
13643 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13644 // truncated away in the end.
13645 if (VT != MVT::i64) {
13646 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13647 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13648 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13649 }
13650
13651 // The basic code generated is conceptually straightforward. Pseudo code:
13652 //
13653 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13654 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13655 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13656 //
13657 // The second and third lines are optional, depending on whether the factors
13658 // are {sign,zero}-extended or not.
13659 //
13660 // The actual DAG is noisier than the pseudo code, but only due to
13661 // instructions that disassemble values into low and high parts, and
13662 // assemble the final result.
13663 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13664
13665 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13666 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13667 SDValue Accum =
13668 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13669
13670 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13671 SDValue AccumLo, AccumHi;
13672 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13673
13674 if (!MulLHSUnsigned32) {
13675 auto MulLHSHi =
13676 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13677 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13678 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13679 }
13680
13681 if (!MulRHSUnsigned32) {
13682 auto MulRHSHi =
13683 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13684 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13685 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13686 }
13687
13688 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13689 Accum = DAG.getBitcast(MVT::i64, Accum);
13690 }
13691
13692 if (VT != MVT::i64)
13693 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13694 return Accum;
13695}
13696
13697// Collect the ultimate src of each of the mul node's operands, and confirm
13698// each operand is at most 8 bits (i.e. a single byte of the source value).
13699static std::optional<ByteProvider<SDValue>>
13700handleMulOperand(const SDValue &MulOperand) {
13701 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13702 if (!Byte0 || Byte0->isConstantZero()) {
13703 return std::nullopt;
13704 }
13705 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13706 if (Byte1 && !Byte1->isConstantZero()) {
13707 return std::nullopt;
13708 }
13709 return Byte0;
13710}
13711
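// Merge two v_perm byte-select masks, where a selector byte of 0x0c denotes a
// constant zero byte. For each byte position the non-0x0c selector wins, and a
// position that is 0x0c in both masks stays 0x0c.
// For example, addPermMasks(0x0c0c0c04, 0x05060c0c) == 0x05060c04.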
13712static unsigned addPermMasks(unsigned First, unsigned Second) {
13713 unsigned FirstCs = First & 0x0c0c0c0c;
13714 unsigned SecondCs = Second & 0x0c0c0c0c;
13715 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13716 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13717
13718 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13719 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13720 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13721 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13722
13723 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13724}
13725
13726struct DotSrc {
13727 SDValue SrcOp;
13728 int64_t PermMask;
13729 int64_t DWordOffset;
13730};
13731
13732static void placeSources(ByteProvider<SDValue> &Src0,
13733 ByteProvider<SDValue> &Src1,
13734 SmallVectorImpl<DotSrc> &Src0s,
13735 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13736
13737 assert(Src0.Src.has_value() && Src1.Src.has_value());
13738 // Src0s and Src1s are empty, just place arbitrarily.
13739 if (Step == 0) {
13740 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13741 Src0.SrcOffset / 4});
13742 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13743 Src1.SrcOffset / 4});
13744 return;
13745 }
13746
13747 for (int BPI = 0; BPI < 2; BPI++) {
13748 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13749 if (BPI == 1) {
13750 BPP = {Src1, Src0};
13751 }
13752 unsigned ZeroMask = 0x0c0c0c0c;
13753 unsigned FMask = 0xFF << (8 * (3 - Step));
13754
13755 unsigned FirstMask =
13756 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13757 unsigned SecondMask =
13758 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13759 // Attempt to find a Src vector which contains our SDValue; if found, add
13760 // our perm mask to the existing one. If we are unable to find a match for
13761 // the first SDValue, attempt to find a match for the second.
13762 int FirstGroup = -1;
13763 for (int I = 0; I < 2; I++) {
13764 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13765 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13766 return IterElt.SrcOp == *BPP.first.Src &&
13767 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13768 };
13769
13770 auto Match = llvm::find_if(Srcs, MatchesFirst);
13771 if (Match != Srcs.end()) {
13772 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13773 FirstGroup = I;
13774 break;
13775 }
13776 }
13777 if (FirstGroup != -1) {
13778 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13779 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13780 return IterElt.SrcOp == *BPP.second.Src &&
13781 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13782 };
13783 auto Match = llvm::find_if(Srcs, MatchesSecond);
13784 if (Match != Srcs.end()) {
13785 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13786 } else
13787 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13788 return;
13789 }
13790 }
13791
13792 // If we have made it here, then we could not find a match in Src0s or Src1s
13793 // for either Src0 or Src1, so just place them arbitrarily.
13794
13795 unsigned ZeroMask = 0x0c0c0c0c;
13796 unsigned FMask = 0xFF << (8 * (3 - Step));
13797
13798 Src0s.push_back(
13799 {*Src0.Src,
13800 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13801 Src0.SrcOffset / 4});
13802 Src1s.push_back(
13803 {*Src1.Src,
13804 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13805 Src1.SrcOffset / 4});
13806
13807 return;
13808}
13809
13810static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13811 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13812 bool IsAny) {
13813
13814 // If we just have one source, just permute it accordingly.
13815 if (Srcs.size() == 1) {
13816 auto Elt = Srcs.begin();
13817 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13818
13819 // v_perm will produce the original value
13820 if (Elt->PermMask == 0x3020100)
13821 return EltOp;
13822
13823 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13824 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13825 }
13826
13827 auto FirstElt = Srcs.begin();
13828 auto SecondElt = std::next(FirstElt);
13829
13830 SmallVector<SDValue, 2> Perms;
13831
13832 // If we have multiple sources in the chain, combine them via perms (using
13833 // calculated perm mask) and Ors.
13834 while (true) {
13835 auto FirstMask = FirstElt->PermMask;
13836 auto SecondMask = SecondElt->PermMask;
13837
13838 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13839 unsigned FirstPlusFour = FirstMask | 0x04040404;
13840 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
13841 // original 0x0c.
13842 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
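    // Adding 4 to the surviving selectors makes them address FirstVal, which is
    // passed as the first source of the combined v_perm below (selector values
    // 4-7 pick bytes from the first source, 0-3 from the second).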
13843
13844 auto PermMask = addPermMasks(FirstMask, SecondMask);
13845 auto FirstVal =
13846 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13847 auto SecondVal =
13848 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13849
13850 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13851 SecondVal,
13852 DAG.getConstant(PermMask, SL, MVT::i32)));
13853
13854 FirstElt = std::next(SecondElt);
13855 if (FirstElt == Srcs.end())
13856 break;
13857
13858 SecondElt = std::next(FirstElt);
13859 // If we only have a FirstElt, then just combine that into the cumulative
13860 // source node.
13861 if (SecondElt == Srcs.end()) {
13862 auto EltOp =
13863 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13864
13865 Perms.push_back(
13866 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13867 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13868 break;
13869 }
13870 }
13871
13872 assert(Perms.size() == 1 || Perms.size() == 2);
13873 return Perms.size() == 2
13874 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13875 : Perms[0];
13876}
13877
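// For example, with ChainLength == 2 a mask of 0x02030c0c becomes
//   (0x02030c0c >> 16) + 0x0c0c0000 == 0x0c0c0203,
// moving the two live selectors into the low bytes and turning the unused
// high bytes into constant-zero (0x0c) selectors.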
13878static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13879 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13880 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13881 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13882 EntryMask += ZeroMask;
13883 }
13884}
13885
13886static bool isMul(const SDValue Op) {
13887 auto Opcode = Op.getOpcode();
13888
13889 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13890 Opcode == AMDGPUISD::MUL_I24);
13891}
13892
13893static std::optional<bool>
13894checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13895 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13896 const SDValue &S1Op, const SelectionDAG &DAG) {
13897 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13898 // of the dot4 is irrelevant.
13899 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13900 return false;
13901
13902 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13903 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13904 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13905 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13906 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13907 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13908
13909 assert(!(S0IsUnsigned && S0IsSigned));
13910 assert(!(S1IsUnsigned && S1IsSigned));
13911
13912 // There are 9 possible permutations of
13913 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13914
13915 // In two permutations, the sign bits are known to be the same for both Ops,
13916 // so simply return Signed / Unsigned corresponding to the MSB
13917
13918 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13919 return S0IsSigned;
13920
13921 // In another two permutations, the sign bits are known to be opposite. In
13922 // this case return std::nullopt to indicate a bad match.
13923
13924 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13925 return std::nullopt;
13926
13927 // In the remaining five permutations, we don't know the value of the sign
13928 // bit for at least one Op. Since we have a valid ByteProvider, we know that
13929 // the upper bits must be extension bits. Thus, the only way for the sign
13930 // bit to be unknown is if it was sign extended from an unknown value, or if
13931 // it was any extended. In either case, it is correct to use the signed
13932 // version of the dot4 signedness semantics.
13933
13934 // In two such permutations, we know the sign bit is set for
13935 // one op, and the other is unknown. It is okay to use the signed version of
13936 // dot4.
13937 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
13938 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
13939 return true;
13940
13941 // In one such permutation, we don't know either of the sign bits. It is okay
13942 // to use the signed version of dot4.
13943 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
13944 return true;
13945
13946 // In two such permutations, we know the sign bit is unset for
13947 // one op, and the other is unknown. Return std::nullopt to indicate a
13948 // bad match.
13949 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
13950 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
13951 return std::nullopt;
13952
13953 llvm_unreachable("Fully covered condition");
13954}
13955
13956SDValue SITargetLowering::performAddCombine(SDNode *N,
13957 DAGCombinerInfo &DCI) const {
13958 SelectionDAG &DAG = DCI.DAG;
13959 EVT VT = N->getValueType(0);
13960 SDLoc SL(N);
13961 SDValue LHS = N->getOperand(0);
13962 SDValue RHS = N->getOperand(1);
13963
13964 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
13965 if (Subtarget->hasMad64_32()) {
13966 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
13967 return Folded;
13968 }
13969 }
13970
13971 if (SDValue V = reassociateScalarOps(N, DAG)) {
13972 return V;
13973 }
13974
13975 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
13976 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
13977 SDValue TempNode(N, 0);
13978 std::optional<bool> IsSigned;
13979 SmallVector<DotSrc, 4> Src0s;
13980 SmallVector<DotSrc, 4> Src1s;
13981 SmallVector<SDValue, 4> Src2s;
13982
13983 // Match the v_dot4 tree, while collecting src nodes.
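    // The expected shape is a chain of the form
    //   add (mul a0,b0), (add (mul a1,b1), (add (mul a2,b2), accum-or-mul))
    // where each mul operand provides a single byte of some 32-bit source.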
13984 int ChainLength = 0;
13985 for (int I = 0; I < 4; I++) {
13986 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
13987 if (MulIdx == -1)
13988 break;
13989 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
13990 if (!Src0)
13991 break;
13992 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
13993 if (!Src1)
13994 break;
13995
13996 auto IterIsSigned = checkDot4MulSignedness(
13997 TempNode->getOperand(MulIdx), *Src0, *Src1,
13998 TempNode->getOperand(MulIdx)->getOperand(0),
13999 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14000 if (!IterIsSigned)
14001 break;
14002 if (!IsSigned)
14003 IsSigned = *IterIsSigned;
14004 if (*IterIsSigned != *IsSigned)
14005 break;
14006 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14007 auto AddIdx = 1 - MulIdx;
14008 // Allow the special case where add (add (mul24, 0), mul24) has already
14009 // been folded into add (mul24, mul24).
14010 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14011 Src2s.push_back(TempNode->getOperand(AddIdx));
14012 auto Src0 =
14013 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14014 if (!Src0)
14015 break;
14016 auto Src1 =
14017 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14018 if (!Src1)
14019 break;
14020 auto IterIsSigned = checkDot4MulSignedness(
14021 TempNode->getOperand(AddIdx), *Src0, *Src1,
14022 TempNode->getOperand(AddIdx)->getOperand(0),
14023 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14024 if (!IterIsSigned)
14025 break;
14026 assert(IsSigned);
14027 if (*IterIsSigned != *IsSigned)
14028 break;
14029 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14030 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14031 ChainLength = I + 2;
14032 break;
14033 }
14034
14035 TempNode = TempNode->getOperand(AddIdx);
14036 Src2s.push_back(TempNode);
14037 ChainLength = I + 1;
14038 if (TempNode->getNumOperands() < 2)
14039 break;
14040 LHS = TempNode->getOperand(0);
14041 RHS = TempNode->getOperand(1);
14042 }
14043
14044 if (ChainLength < 2)
14045 return SDValue();
14046
14047 // Masks were constructed with the assumption that we would find a chain of
14048 // length 4. If not, then we need to zero out the unused MSB bytes (via a
14049 // perm mask of 0x0c) so they do not affect the dot calculation.
14050 if (ChainLength < 4) {
14051 fixMasks(Src0s, ChainLength);
14052 fixMasks(Src1s, ChainLength);
14053 }
14054
14055 SDValue Src0, Src1;
14056
14057 // If we are just using a single source for both, and have permuted the
14058 // bytes consistently, we can just use the sources without permuting
14059 // (commutation).
14060 bool UseOriginalSrc = false;
14061 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14062 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14063 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14064 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14065 SmallVector<unsigned, 4> SrcBytes;
14066 auto Src0Mask = Src0s.begin()->PermMask;
14067 SrcBytes.push_back(Src0Mask & 0xFF000000);
14068 bool UniqueEntries = true;
14069 for (auto I = 1; I < 4; I++) {
14070 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14071
14072 if (is_contained(SrcBytes, NextByte)) {
14073 UniqueEntries = false;
14074 break;
14075 }
14076 SrcBytes.push_back(NextByte);
14077 }
14078
14079 if (UniqueEntries) {
14080 UseOriginalSrc = true;
14081
14082 auto FirstElt = Src0s.begin();
14083 auto FirstEltOp =
14084 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14085
14086 auto SecondElt = Src1s.begin();
14087 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14088 SecondElt->DWordOffset);
14089
14090 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14091 MVT::getIntegerVT(32));
14092 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14093 MVT::getIntegerVT(32));
14094 }
14095 }
14096
14097 if (!UseOriginalSrc) {
14098 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14099 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14100 }
14101
14102 assert(IsSigned);
14103 SDValue Src2 =
14104 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14105
14106 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14107 : Intrinsic::amdgcn_udot4,
14108 SL, MVT::i64);
14109
14110 assert(!VT.isVector());
14111 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14112 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14113
14114 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14115 }
14116
14117 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14118 return SDValue();
14119
14120 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14121 // add x, sext (setcc) => usubo_carry x, 0, setcc
14122 unsigned Opc = LHS.getOpcode();
14123 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14124 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14125 std::swap(RHS, LHS);
14126
14127 Opc = RHS.getOpcode();
14128 switch (Opc) {
14129 default: break;
14130 case ISD::ZERO_EXTEND:
14131 case ISD::SIGN_EXTEND:
14132 case ISD::ANY_EXTEND: {
14133 auto Cond = RHS.getOperand(0);
14134 // If this won't be a real VOPC output, we would still need to insert an
14135 // extra instruction anyway.
14136 if (!isBoolSGPR(Cond))
14137 break;
14138 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14139 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14140 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14141 return DAG.getNode(Opc, SL, VTList, Args);
14142 }
14143 case ISD::UADDO_CARRY: {
14144 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14145 if (!isNullConstant(RHS.getOperand(1)))
14146 break;
14147 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14148 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14149 }
14150 }
14151 return SDValue();
14152}
14153
14154SDValue SITargetLowering::performSubCombine(SDNode *N,
14155 DAGCombinerInfo &DCI) const {
14156 SelectionDAG &DAG = DCI.DAG;
14157 EVT VT = N->getValueType(0);
14158
14159 if (VT != MVT::i32)
14160 return SDValue();
14161
14162 SDLoc SL(N);
14163 SDValue LHS = N->getOperand(0);
14164 SDValue RHS = N->getOperand(1);
14165
14166 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14167 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14168 unsigned Opc = RHS.getOpcode();
14169 switch (Opc) {
14170 default: break;
14171 case ISD::ZERO_EXTEND:
14172 case ISD::SIGN_EXTEND:
14173 case ISD::ANY_EXTEND: {
14174 auto Cond = RHS.getOperand(0);
14175 // If this won't be a real VOPC output, we would still need to insert an
14176 // extra instruction anyway.
14177 if (!isBoolSGPR(Cond))
14178 break;
14179 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14180 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14181 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14182 return DAG.getNode(Opc, SL, VTList, Args);
14183 }
14184 }
14185
14186 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14187 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14188 if (!isNullConstant(LHS.getOperand(1)))
14189 return SDValue();
14190 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14191 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14192 }
14193 return SDValue();
14194}
14195
14196SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14197 DAGCombinerInfo &DCI) const {
14198
14199 if (N->getValueType(0) != MVT::i32)
14200 return SDValue();
14201
14202 if (!isNullConstant(N->getOperand(1)))
14203 return SDValue();
14204
14205 SelectionDAG &DAG = DCI.DAG;
14206 SDValue LHS = N->getOperand(0);
14207
14208 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14209 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14210 unsigned LHSOpc = LHS.getOpcode();
14211 unsigned Opc = N->getOpcode();
14212 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14213 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14214 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14215 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14216 }
14217 return SDValue();
14218}
14219
14220SDValue SITargetLowering::performFAddCombine(SDNode *N,
14221 DAGCombinerInfo &DCI) const {
14222 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14223 return SDValue();
14224
14225 SelectionDAG &DAG = DCI.DAG;
14226 EVT VT = N->getValueType(0);
14227
14228 SDLoc SL(N);
14229 SDValue LHS = N->getOperand(0);
14230 SDValue RHS = N->getOperand(1);
14231
14232 // These should really be instruction patterns, but writing patterns with
14233 // source modifiers is a pain.
14234
14235 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14236 if (LHS.getOpcode() == ISD::FADD) {
14237 SDValue A = LHS.getOperand(0);
14238 if (A == LHS.getOperand(1)) {
14239 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14240 if (FusedOp != 0) {
14241 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14242 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14243 }
14244 }
14245 }
14246
14247 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14248 if (RHS.getOpcode() == ISD::FADD) {
14249 SDValue A = RHS.getOperand(0);
14250 if (A == RHS.getOperand(1)) {
14251 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14252 if (FusedOp != 0) {
14253 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14254 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14255 }
14256 }
14257 }
14258
14259 return SDValue();
14260}
14261
14262SDValue SITargetLowering::performFSubCombine(SDNode *N,
14263 DAGCombinerInfo &DCI) const {
14264 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14265 return SDValue();
14266
14267 SelectionDAG &DAG = DCI.DAG;
14268 SDLoc SL(N);
14269 EVT VT = N->getValueType(0);
14270 assert(!VT.isVector());
14271
14272 // Try to get the fneg to fold into the source modifier. This undoes generic
14273 // DAG combines and folds them into the mad.
14274 //
14275 // Only do this if we are not trying to support denormals. v_mad_f32 does
14276 // not support denormals ever.
14277 SDValue LHS = N->getOperand(0);
14278 SDValue RHS = N->getOperand(1);
14279 if (LHS.getOpcode() == ISD::FADD) {
14280 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14281 SDValue A = LHS.getOperand(0);
14282 if (A == LHS.getOperand(1)) {
14283 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14284 if (FusedOp != 0) {
14285 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14286 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14287
14288 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14289 }
14290 }
14291 }
14292
14293 if (RHS.getOpcode() == ISD::FADD) {
14294 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14295
14296 SDValue A = RHS.getOperand(0);
14297 if (A == RHS.getOperand(1)) {
14298 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14299 if (FusedOp != 0) {
14300 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14301 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14302 }
14303 }
14304 }
14305
14306 return SDValue();
14307}
14308
14309SDValue SITargetLowering::performFDivCombine(SDNode *N,
14310 DAGCombinerInfo &DCI) const {
14311 SelectionDAG &DAG = DCI.DAG;
14312 SDLoc SL(N);
14313 EVT VT = N->getValueType(0);
14314 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14315 return SDValue();
14316
14317 SDValue LHS = N->getOperand(0);
14318 SDValue RHS = N->getOperand(1);
14319
14320 SDNodeFlags Flags = N->getFlags();
14321 SDNodeFlags RHSFlags = RHS->getFlags();
14322 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14323 !RHS->hasOneUse())
14324 return SDValue();
14325
14326 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14327 bool IsNegative = false;
14328 if (CLHS->isExactlyValue(1.0) ||
14329 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14330 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14331 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14332 if (RHS.getOpcode() == ISD::FSQRT) {
14333 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14334 SDValue Rsq =
14335 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14336 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14337 }
14338 }
14339 }
14340
14341 return SDValue();
14342}
14343
14344SDValue SITargetLowering::performFMACombine(SDNode *N,
14345 DAGCombinerInfo &DCI) const {
14346 SelectionDAG &DAG = DCI.DAG;
14347 EVT VT = N->getValueType(0);
14348 SDLoc SL(N);
14349
14350 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14351 return SDValue();
14352
14353 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14354 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14355 SDValue Op1 = N->getOperand(0);
14356 SDValue Op2 = N->getOperand(1);
14357 SDValue FMA = N->getOperand(2);
14358
14359 if (FMA.getOpcode() != ISD::FMA ||
14360 Op1.getOpcode() != ISD::FP_EXTEND ||
14361 Op2.getOpcode() != ISD::FP_EXTEND)
14362 return SDValue();
14363
14364 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14365 // regardless of the denorm mode setting. Therefore,
14366 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14367 const TargetOptions &Options = DAG.getTarget().Options;
14368 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14369 (N->getFlags().hasAllowContract() &&
14370 FMA->getFlags().hasAllowContract())) {
14371 Op1 = Op1.getOperand(0);
14372 Op2 = Op2.getOperand(0);
14373 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14374 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14375 return SDValue();
14376
14377 SDValue Vec1 = Op1.getOperand(0);
14378 SDValue Idx1 = Op1.getOperand(1);
14379 SDValue Vec2 = Op2.getOperand(0);
14380
14381 SDValue FMAOp1 = FMA.getOperand(0);
14382 SDValue FMAOp2 = FMA.getOperand(1);
14383 SDValue FMAAcc = FMA.getOperand(2);
14384
14385 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14386 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14387 return SDValue();
14388
14389 FMAOp1 = FMAOp1.getOperand(0);
14390 FMAOp2 = FMAOp2.getOperand(0);
14391 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14392 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14393 return SDValue();
14394
14395 SDValue Vec3 = FMAOp1.getOperand(0);
14396 SDValue Vec4 = FMAOp2.getOperand(0);
14397 SDValue Idx2 = FMAOp1.getOperand(1);
14398
14399 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14400 // Idx1 and Idx2 cannot be the same.
14401 Idx1 == Idx2)
14402 return SDValue();
14403
14404 if (Vec1 == Vec2 || Vec3 == Vec4)
14405 return SDValue();
14406
14407 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14408 return SDValue();
14409
14410 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14411 (Vec1 == Vec4 && Vec2 == Vec3)) {
14412 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14413 DAG.getTargetConstant(0, SL, MVT::i1));
14414 }
14415 }
14416 return SDValue();
14417}
14418
14419SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14420 DAGCombinerInfo &DCI) const {
14421 SelectionDAG &DAG = DCI.DAG;
14422 SDLoc SL(N);
14423
14424 SDValue LHS = N->getOperand(0);
14425 SDValue RHS = N->getOperand(1);
14426 EVT VT = LHS.getValueType();
14427 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14428
14429 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14430 if (!CRHS) {
14431 CRHS = dyn_cast<ConstantSDNode>(LHS);
14432 if (CRHS) {
14433 std::swap(LHS, RHS);
14434 CC = getSetCCSwappedOperands(CC);
14435 }
14436 }
14437
14438 if (CRHS) {
14439 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14440 isBoolSGPR(LHS.getOperand(0))) {
14441 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14442 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14443 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14444 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14445 if ((CRHS->isAllOnes() &&
14446 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14447 (CRHS->isZero() &&
14448 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14449 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14450 DAG.getConstant(-1, SL, MVT::i1));
14451 if ((CRHS->isAllOnes() &&
14452 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14453 (CRHS->isZero() &&
14454 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14455 return LHS.getOperand(0);
14456 }
14457
14458 const APInt &CRHSVal = CRHS->getAPIntValue();
14459 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14460 LHS.getOpcode() == ISD::SELECT &&
14461 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14462 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14463 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14464 isBoolSGPR(LHS.getOperand(0))) {
14465 // Given CT != FT:
14466 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14467 // setcc (select cc, CT, CF), CF, ne => cc
14468 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14469 // setcc (select cc, CT, CF), CT, eq => cc
14470 const APInt &CT = LHS.getConstantOperandAPInt(1);
14471 const APInt &CF = LHS.getConstantOperandAPInt(2);
14472
14473 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14474 (CT == CRHSVal && CC == ISD::SETNE))
14475 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14476 DAG.getConstant(-1, SL, MVT::i1));
14477 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14478 (CT == CRHSVal && CC == ISD::SETEQ))
14479 return LHS.getOperand(0);
14480 }
14481 }
14482
14483 if (VT != MVT::f32 && VT != MVT::f64 &&
14484 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14485 return SDValue();
14486
14487 // Match isinf/isfinite pattern
14488 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14489 // (fcmp one (fabs x), inf) -> (fp_class x,
14490 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14491 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14492 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14493 if (!CRHS)
14494 return SDValue();
14495
14496 const APFloat &APF = CRHS->getValueAPF();
14497 if (APF.isInfinity() && !APF.isNegative()) {
14498 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14499 SIInstrFlags::N_INFINITY;
14500 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14501 SIInstrFlags::P_ZERO |
14502 SIInstrFlags::N_NORMAL |
14503 SIInstrFlags::P_NORMAL |
14504 SIInstrFlags::N_SUBNORMAL |
14505 SIInstrFlags::P_SUBNORMAL;
14506 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14507 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14508 DAG.getConstant(Mask, SL, MVT::i32));
14509 }
14510 }
14511
14512 return SDValue();
14513}
14514
14515SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14516 DAGCombinerInfo &DCI) const {
14517 SelectionDAG &DAG = DCI.DAG;
14518 SDLoc SL(N);
14519 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14520
14521 SDValue Src = N->getOperand(0);
14522 SDValue Shift = N->getOperand(0);
14523
14524 // TODO: Extend type shouldn't matter (assuming legal types).
14525 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14526 Shift = Shift.getOperand(0);
14527
14528 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14529 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14530 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14531 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14532 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14533 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14534 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14535 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14536 SDLoc(Shift.getOperand(0)), MVT::i32);
14537
14538 unsigned ShiftOffset = 8 * Offset;
14539 if (Shift.getOpcode() == ISD::SHL)
14540 ShiftOffset -= C->getZExtValue();
14541 else
14542 ShiftOffset += C->getZExtValue();
14543
14544 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14545 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14546 MVT::f32, Shifted);
14547 }
14548 }
14549 }
14550
14551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14552 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14553 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14554 // We simplified Src. If this node is not dead, visit it again so it is
14555 // folded properly.
14556 if (N->getOpcode() != ISD::DELETED_NODE)
14557 DCI.AddToWorklist(N);
14558 return SDValue(N, 0);
14559 }
14560
14561 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14562 if (SDValue DemandedSrc =
14563 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14564 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14565
14566 return SDValue();
14567}
14568
14569SDValue SITargetLowering::performClampCombine(SDNode *N,
14570 DAGCombinerInfo &DCI) const {
14571 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14572 if (!CSrc)
14573 return SDValue();
14574
14575 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14576 const APFloat &F = CSrc->getValueAPF();
14577 APFloat Zero = APFloat::getZero(F.getSemantics());
14578 if (F < Zero ||
14579 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14580 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14581 }
14582
14583 APFloat One(F.getSemantics(), "1.0");
14584 if (F > One)
14585 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14586
14587 return SDValue(CSrc, 0);
14588}
14589
14590
14591SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14592 DAGCombinerInfo &DCI) const {
14593 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14594 return SDValue();
14595 switch (N->getOpcode()) {
14596 case ISD::ADD:
14597 return performAddCombine(N, DCI);
14598 case ISD::SUB:
14599 return performSubCombine(N, DCI);
14600 case ISD::UADDO_CARRY:
14601 case ISD::USUBO_CARRY:
14602 return performAddCarrySubCarryCombine(N, DCI);
14603 case ISD::FADD:
14604 return performFAddCombine(N, DCI);
14605 case ISD::FSUB:
14606 return performFSubCombine(N, DCI);
14607 case ISD::FDIV:
14608 return performFDivCombine(N, DCI);
14609 case ISD::SETCC:
14610 return performSetCCCombine(N, DCI);
14611 case ISD::FMAXNUM:
14612 case ISD::FMINNUM:
14613 case ISD::FMAXNUM_IEEE:
14614 case ISD::FMINNUM_IEEE:
14615 case ISD::FMAXIMUM:
14616 case ISD::FMINIMUM:
14617 case ISD::SMAX:
14618 case ISD::SMIN:
14619 case ISD::UMAX:
14620 case ISD::UMIN:
14623 return performMinMaxCombine(N, DCI);
14624 case ISD::FMA:
14625 return performFMACombine(N, DCI);
14626 case ISD::AND:
14627 return performAndCombine(N, DCI);
14628 case ISD::OR:
14629 return performOrCombine(N, DCI);
14630 case ISD::FSHR: {
14631 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14632 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14633 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14634 return matchPERM(N, DCI);
14635 }
14636 break;
14637 }
14638 case ISD::XOR:
14639 return performXorCombine(N, DCI);
14640 case ISD::ZERO_EXTEND:
14641 return performZeroExtendCombine(N, DCI);
14642 case ISD::SIGN_EXTEND_INREG:
14643 return performSignExtendInRegCombine(N, DCI);
14644 case AMDGPUISD::FP_CLASS:
14645 return performClassCombine(N, DCI);
14646 case ISD::FCANONICALIZE:
14647 return performFCanonicalizeCombine(N, DCI);
14648 case AMDGPUISD::RCP:
14649 return performRcpCombine(N, DCI);
14650 case ISD::FLDEXP:
14651 case AMDGPUISD::FRACT:
14652 case AMDGPUISD::RSQ:
14655 case AMDGPUISD::RSQ_CLAMP: {
14656 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14657 SDValue Src = N->getOperand(0);
14658 if (Src.isUndef())
14659 return Src;
14660 break;
14661 }
14662 case ISD::SINT_TO_FP:
14663 case ISD::UINT_TO_FP:
14664 return performUCharToFloatCombine(N, DCI);
14665 case ISD::FCOPYSIGN:
14666 return performFCopySignCombine(N, DCI);
14667 case AMDGPUISD::CVT_F32_UBYTE0:
14668 case AMDGPUISD::CVT_F32_UBYTE1:
14669 case AMDGPUISD::CVT_F32_UBYTE2:
14670 case AMDGPUISD::CVT_F32_UBYTE3:
14671 return performCvtF32UByteNCombine(N, DCI);
14672 case AMDGPUISD::FMED3:
14673 return performFMed3Combine(N, DCI);
14674 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14675 return performCvtPkRTZCombine(N, DCI);
14676 case AMDGPUISD::CLAMP:
14677 return performClampCombine(N, DCI);
14678 case ISD::SCALAR_TO_VECTOR: {
14679 SelectionDAG &DAG = DCI.DAG;
14680 EVT VT = N->getValueType(0);
14681
14682 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14683 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14684 SDLoc SL(N);
14685 SDValue Src = N->getOperand(0);
14686 EVT EltVT = Src.getValueType();
14687 if (EltVT != MVT::i16)
14688 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14689
14690 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14691 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14692 }
14693
14694 break;
14695 }
14696 case ISD::EXTRACT_VECTOR_ELT:
14697 return performExtractVectorEltCombine(N, DCI);
14698 case ISD::INSERT_VECTOR_ELT:
14699 return performInsertVectorEltCombine(N, DCI);
14700 case ISD::FP_ROUND:
14701 return performFPRoundCombine(N, DCI);
14702 case ISD::LOAD: {
14703 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14704 return Widened;
14705 [[fallthrough]];
14706 }
14707 default: {
14708 if (!DCI.isBeforeLegalize()) {
14709 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14710 return performMemSDNodeCombine(MemNode, DCI);
14711 }
14712
14713 break;
14714 }
14715 }
14716
14717 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14718}
14719
14720/// Helper function for adjustWritemask
14721static unsigned SubIdx2Lane(unsigned Idx) {
14722 switch (Idx) {
14723 default: return ~0u;
14724 case AMDGPU::sub0: return 0;
14725 case AMDGPU::sub1: return 1;
14726 case AMDGPU::sub2: return 2;
14727 case AMDGPU::sub3: return 3;
14728 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14729 }
14730}
14731
14732/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14733SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14734 SelectionDAG &DAG) const {
14735 unsigned Opcode = Node->getMachineOpcode();
14736
14737 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14738 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14739 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14740 return Node; // not implemented for D16
14741
14742 SDNode *Users[5] = { nullptr };
14743 unsigned Lane = 0;
14744 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14745 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14746 unsigned NewDmask = 0;
14747 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14748 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14749 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14750 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14751 ? true
14752 : false;
14753 unsigned TFCLane = 0;
14754 bool HasChain = Node->getNumValues() > 1;
14755
14756 if (OldDmask == 0) {
14757 // These are folded out, but on the chance it happens don't assert.
14758 return Node;
14759 }
14760
14761 unsigned OldBitsSet = llvm::popcount(OldDmask);
14762 // Work out which is the TFE/LWE lane if that is enabled.
14763 if (UsesTFC) {
14764 TFCLane = OldBitsSet;
14765 }
14766
14767 // Try to figure out the used register components
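  // For example, an image sample with dmask = 0xf whose users only extract
  // sub0 and sub2 can be rewritten with dmask = 0x5, so only two data VGPRs
  // are written.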
14768 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14769 I != E; ++I) {
14770
14771 // Don't look at users of the chain.
14772 if (I.getUse().getResNo() != 0)
14773 continue;
14774
14775 // Abort if we can't understand the usage
14776 if (!I->isMachineOpcode() ||
14777 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14778 return Node;
14779
14780 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14781 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14782 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14783 // set, etc.
14784 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14785 if (Lane == ~0u)
14786 return Node;
14787
14788 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14789 if (UsesTFC && Lane == TFCLane) {
14790 Users[Lane] = *I;
14791 } else {
14792 // Set which texture component corresponds to the lane.
14793 unsigned Comp;
14794 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14795 Comp = llvm::countr_zero(Dmask);
14796 Dmask &= ~(1 << Comp);
14797 }
14798
14799 // Abort if we have more than one user per component.
14800 if (Users[Lane])
14801 return Node;
14802
14803 Users[Lane] = *I;
14804 NewDmask |= 1 << Comp;
14805 }
14806 }
14807
14808 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14809 bool NoChannels = !NewDmask;
14810 if (NoChannels) {
14811 if (!UsesTFC) {
14812 // No uses of the result and not using TFC. Then do nothing.
14813 return Node;
14814 }
14815 // If the original dmask has one channel - then nothing to do
14816 if (OldBitsSet == 1)
14817 return Node;
14818 // Use an arbitrary dmask - required for the instruction to work
14819 NewDmask = 1;
14820 }
14821 // Abort if there's no change
14822 if (NewDmask == OldDmask)
14823 return Node;
14824
14825 unsigned BitsSet = llvm::popcount(NewDmask);
14826
14827 // Check for TFE or LWE - increase the number of channels by one to account
14828 // for the extra return value
14829 // This will need adjustment for D16 if this is also included in
14830 // adjustWriteMask (this function), but at present D16 is excluded.
14831 unsigned NewChannels = BitsSet + UsesTFC;
14832
14833 int NewOpcode =
14834 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14835 assert(NewOpcode != -1 &&
14836 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14837 "failed to find equivalent MIMG op");
14838
14839 // Adjust the writemask in the node
14841 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14842 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14843 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14844
14845 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14846
14847 MVT ResultVT = NewChannels == 1 ?
14848 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14849 NewChannels == 5 ? 8 : NewChannels);
14850 SDVTList NewVTList = HasChain ?
14851 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14852
14853
14854 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14855 NewVTList, Ops);
14856
14857 if (HasChain) {
14858 // Update chain.
14859 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14860 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14861 }
14862
14863 if (NewChannels == 1) {
14864 assert(Node->hasNUsesOfValue(1, 0));
14865 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14866 SDLoc(Node), Users[Lane]->getValueType(0),
14867 SDValue(NewNode, 0));
14868 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14869 return nullptr;
14870 }
14871
14872 // Update the users of the node with the new indices
14873 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14874 SDNode *User = Users[i];
14875 if (!User) {
14876 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14877 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14878 if (i || !NoChannels)
14879 continue;
14880 } else {
14881 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14882 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14883 if (NewUser != User) {
14884 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14885 DAG.RemoveDeadNode(User);
14886 }
14887 }
14888
14889 switch (Idx) {
14890 default: break;
14891 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14892 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14893 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14894 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14895 }
14896 }
14897
14898 DAG.RemoveDeadNode(Node);
14899 return nullptr;
14900}
14901
14902static bool isFrameIndexOp(SDValue Op) {
14903 if (Op.getOpcode() == ISD::AssertZext)
14904 Op = Op.getOperand(0);
14905
14906 return isa<FrameIndexSDNode>(Op);
14907}
14908
14909/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14910/// with frame index operands.
14911/// LLVM assumes that inputs to these instructions are registers.
14912SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14913 SelectionDAG &DAG) const {
14914 if (Node->getOpcode() == ISD::CopyToReg) {
14915 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14916 SDValue SrcVal = Node->getOperand(2);
14917
14918 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14919 // to try understanding copies to physical registers.
14920 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
14921 SDLoc SL(Node);
14922 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14923 SDValue VReg = DAG.getRegister(
14924 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14925
14926 SDNode *Glued = Node->getGluedNode();
14927 SDValue ToVReg
14928 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
14929 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
14930 SDValue ToResultReg
14931 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
14932 VReg, ToVReg.getValue(1));
14933 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
14934 DAG.RemoveDeadNode(Node);
14935 return ToResultReg.getNode();
14936 }
14937 }
14938
14940 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
14941 if (!isFrameIndexOp(Node->getOperand(i))) {
14942 Ops.push_back(Node->getOperand(i));
14943 continue;
14944 }
14945
14946 SDLoc DL(Node);
14947 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
14948 Node->getOperand(i).getValueType(),
14949 Node->getOperand(i)), 0));
14950 }
14951
14952 return DAG.UpdateNodeOperands(Node, Ops);
14953}
14954
14955/// Fold the instructions after selecting them.
14956/// Returns null if users were already updated.
14957SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
14958 SelectionDAG &DAG) const {
14959 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14960 unsigned Opcode = Node->getMachineOpcode();
14961
14962 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
14963 !TII->isGather4(Opcode) &&
14964 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
14965 return adjustWritemask(Node, DAG);
14966 }
14967
14968 if (Opcode == AMDGPU::INSERT_SUBREG ||
14969 Opcode == AMDGPU::REG_SEQUENCE) {
14970 legalizeTargetIndependentNode(Node, DAG);
14971 return Node;
14972 }
14973
14974 switch (Opcode) {
14975 case AMDGPU::V_DIV_SCALE_F32_e64:
14976 case AMDGPU::V_DIV_SCALE_F64_e64: {
14977 // Satisfy the operand register constraint when one of the inputs is
14978 // undefined. Ordinarily each undef value will have its own implicit_def of
14979 // a vreg, so force these to use a single register.
14980 SDValue Src0 = Node->getOperand(1);
14981 SDValue Src1 = Node->getOperand(3);
14982 SDValue Src2 = Node->getOperand(5);
14983
14984 if ((Src0.isMachineOpcode() &&
14985 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
14986 (Src0 == Src1 || Src0 == Src2))
14987 break;
14988
14989 MVT VT = Src0.getValueType().getSimpleVT();
14990 const TargetRegisterClass *RC =
14991 getRegClassFor(VT, Src0.getNode()->isDivergent());
14992
14993 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14994 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
14995
14996 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
14997 UndefReg, Src0, SDValue());
14998
14999 // src0 must be the same register as src1 or src2, even if the value is
15000 // undefined, so make sure we don't violate this constraint.
15001 if (Src0.isMachineOpcode() &&
15002 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15003 if (Src1.isMachineOpcode() &&
15004 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15005 Src0 = Src1;
15006 else if (Src2.isMachineOpcode() &&
15007 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15008 Src0 = Src2;
15009 else {
15010 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15011 Src0 = UndefReg;
15012 Src1 = UndefReg;
15013 }
15014 } else
15015 break;
15016
15017 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15018 Ops[1] = Src0;
15019 Ops[3] = Src1;
15020 Ops[5] = Src2;
15021 Ops.push_back(ImpDef.getValue(1));
15022 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15023 }
15024 default:
15025 break;
15026 }
15027
15028 return Node;
15029}
15030
15031// Any MIMG instructions that use tfe or lwe require an initialization of the
15032// result register that will be written in the case of a memory access failure.
15033// The required code is also added to tie this init code to the result of the
15034// image instruction.
15035void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
15036 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15037 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15038 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15039 MachineBasicBlock &MBB = *MI.getParent();
15040
15041 int DstIdx =
15042 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15043 unsigned InitIdx = 0;
15044
15045 if (TII->isImage(MI)) {
15046 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15047 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15048 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15049
15050 if (!TFE && !LWE) // intersect_ray
15051 return;
15052
15053 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15054 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15055 unsigned D16Val = D16 ? D16->getImm() : 0;
15056
15057 if (!TFEVal && !LWEVal)
15058 return;
15059
15060 // At least one of TFE or LWE is non-zero.
15061 // We have to insert a suitable initialization of the result value and
15062 // tie this to the dest of the image instruction.
15063
15064 // Calculate which dword we have to initialize to 0.
15065 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15066
15067 // check that dmask operand is found.
15068 assert(MO_Dmask && "Expected dmask operand in instruction");
15069
15070 unsigned dmask = MO_Dmask->getImm();
15071 // Determine the number of active lanes taking into account the
15072 // Gather4 special case
15073 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15074
15075 bool Packed = !Subtarget->hasUnpackedD16VMem();
15076
15077 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15078
15079 // Abandon attempt if the dst size isn't large enough
15080 // - this is in fact an error but this is picked up elsewhere and
15081 // reported correctly.
15082 uint32_t DstSize =
15083 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15084 if (DstSize < InitIdx)
15085 return;
15086 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15087 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15088 } else {
15089 return;
15090 }
15091
15092 const DebugLoc &DL = MI.getDebugLoc();
15093
15094 // Create a register for the initialization value.
15095 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15096 unsigned NewDst = 0; // Final initialized value will be in here
15097
15098 // If PRTStrictNull feature is enabled (the default) then initialize
15099 // all the result registers to 0, otherwise just the error indication
15100 // register (VGPRn+1)
15101 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15102 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15103
15104 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15105 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15106 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15107 // Initialize dword
15108 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15109 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15110 .addImm(0);
15111 // Insert into the super-reg
15112 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15113 .addReg(PrevDst)
15114 .addReg(SubReg)
15116
15117 PrevDst = NewDst;
15118 }
15119
15120 // Add as an implicit operand
15121 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15122
15123 // Tie the just added implicit operand to the dst
15124 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15125}
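
// Illustrative sketch (not part of this file): the InitIdx computation above,
// restated as a standalone helper so the number of initialized dwords is easy
// to check by hand. The helper name and parameters are hypothetical;
// std::popcount needs C++20.
#include <bit>

static unsigned numInitDwordsSketch(unsigned DMask, bool IsGather4, bool D16,
                                    bool PackedD16) {
  unsigned ActiveLanes = IsGather4 ? 4 : std::popcount(DMask);
  // One extra dword receives the TFE/LWE error code; packed D16 stores two
  // lanes per dword.
  return D16 && PackedD16 ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}
// For example, numInitDwordsSketch(0xF, false, false, false) == 5:
// four data dwords plus the TFE/LWE status dword.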
15126
15127/// Assign the register class depending on the number of
15128/// bits set in the writemask
15129void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15130 SDNode *Node) const {
15131 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15132
15133 MachineFunction *MF = MI.getParent()->getParent();
15134 MachineRegisterInfo &MRI = MF->getRegInfo();
15135 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15136
15137 if (TII->isVOP3(MI.getOpcode())) {
15138 // Make sure constant bus requirements are respected.
15139 TII->legalizeOperandsVOP3(MRI, MI);
15140
15141 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15142 // This saves a chain-copy of registers and better balances register
15143 // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
15144 if (!MI.getDesc().operands().empty()) {
15145 unsigned Opc = MI.getOpcode();
15146 bool HasAGPRs = Info->mayNeedAGPRs();
15147 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15148 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15149 for (auto I :
15150 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15151 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15152 if (I == -1)
15153 break;
15154 if ((I == Src2Idx) && (HasAGPRs))
15155 break;
15156 MachineOperand &Op = MI.getOperand(I);
15157 if (!Op.isReg() || !Op.getReg().isVirtual())
15158 continue;
15159 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15160 if (!TRI->hasAGPRs(RC))
15161 continue;
15162 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15163 if (!Src || !Src->isCopy() ||
15164 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15165 continue;
15166 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15167 // All uses of agpr64 and agpr32 can also accept vgpr except for
15168 // v_accvgpr_read, but we do not produce agpr reads during selection,
15169 // so no use checks are needed.
15170 MRI.setRegClass(Op.getReg(), NewRC);
15171 }
15172
15173 if (!HasAGPRs)
15174 return;
15175
15176 // Resolve the rest of AV operands to AGPRs.
15177 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15178 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15179 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15180 if (TRI->isVectorSuperClass(RC)) {
15181 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15182 MRI.setRegClass(Src2->getReg(), NewRC);
15183 if (Src2->isTied())
15184 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15185 }
15186 }
15187 }
15188 }
15189
15190 return;
15191 }
15192
15193 if (TII->isImage(MI))
15194 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15195}
15196
15197static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15198 uint64_t Val) {
15199 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15200 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15201}
15202
15203MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15204 const SDLoc &DL,
15205 SDValue Ptr) const {
15206 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15207
15208 // Build the subregister half that holds the constants before building the
15209 // full 128-bit register. If we are building multiple resource descriptors,
15210 // this will allow CSEing of the 2-component register.
15211 const SDValue Ops0[] = {
15212 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15213 buildSMovImm32(DAG, DL, 0),
15214 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15215 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15216 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15217 };
15218
15219 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15220 MVT::v2i32, Ops0), 0);
15221
15222 // Combine the constants and the pointer.
15223 const SDValue Ops1[] = {
15224 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15225 Ptr,
15226 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15227 SubRegHi,
15228 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15229 };
15230
15231 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15232}
15233
15234/// Return a resource descriptor with the 'Add TID' bit enabled
15235/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15236/// of the resource descriptor) to create an offset, which is added to
15237/// the resource pointer.
15238MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15239 SDValue Ptr, uint32_t RsrcDword1,
15240 uint64_t RsrcDword2And3) const {
15241 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15242 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15243 if (RsrcDword1) {
15244 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15245 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15246 0);
15247 }
15248
15249 SDValue DataLo = buildSMovImm32(DAG, DL,
15250 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15251 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15252
15253 const SDValue Ops[] = {
15254 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15255 PtrLo,
15256 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15257 PtrHi,
15258 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15259 DataLo,
15260 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15261 DataHi,
15262 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15263 };
15264
15265 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15266}
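
// Illustrative sketch (not part of this file): how the four descriptor dwords
// assembled by the REG_SEQUENCE above relate to the inputs, using plain
// integers. The helper is hypothetical and only mirrors the lo/hi splitting
// and the OR of RsrcDword1 into the pointer's high half.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> buildRsrcWordsSketch(uint64_t Ptr,
                                                    uint32_t RsrcDword1,
                                                    uint64_t RsrcDword2And3) {
  return {static_cast<uint32_t>(Ptr),                    // sub0: pointer lo
          static_cast<uint32_t>(Ptr >> 32) | RsrcDword1, // sub1: pointer hi | stride / 'Add TID' bits
          static_cast<uint32_t>(RsrcDword2And3),         // sub2: dword2
          static_cast<uint32_t>(RsrcDword2And3 >> 32)};  // sub3: dword3
}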
15267
15268//===----------------------------------------------------------------------===//
15269// SI Inline Assembly Support
15270//===----------------------------------------------------------------------===//
15271
15272std::pair<unsigned, const TargetRegisterClass *>
15273SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15274 StringRef Constraint,
15275 MVT VT) const {
15276 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15277
15278 const TargetRegisterClass *RC = nullptr;
15279 if (Constraint.size() == 1) {
15280 const unsigned BitWidth = VT.getSizeInBits();
15281 switch (Constraint[0]) {
15282 default:
15283 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15284 case 's':
15285 case 'r':
15286 switch (BitWidth) {
15287 case 16:
15288 RC = &AMDGPU::SReg_32RegClass;
15289 break;
15290 case 64:
15291 RC = &AMDGPU::SGPR_64RegClass;
15292 break;
15293 default:
15294 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15295 if (!RC)
15296 return std::pair(0U, nullptr);
15297 break;
15298 }
15299 break;
15300 case 'v':
15301 switch (BitWidth) {
15302 case 16:
15303 RC = &AMDGPU::VGPR_32RegClass;
15304 break;
15305 default:
15306 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15307 if (!RC)
15308 return std::pair(0U, nullptr);
15309 break;
15310 }
15311 break;
15312 case 'a':
15313 if (!Subtarget->hasMAIInsts())
15314 break;
15315 switch (BitWidth) {
15316 case 16:
15317 RC = &AMDGPU::AGPR_32RegClass;
15318 break;
15319 default:
15320 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15321 if (!RC)
15322 return std::pair(0U, nullptr);
15323 break;
15324 }
15325 break;
15326 }
15327 // We actually support i128, i16 and f16 as inline parameters
15328 // even if they are not reported as legal
15329 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15330 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15331 return std::pair(0U, RC);
15332 }
15333
15334 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15335 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15336 if (RegName.consume_front("v")) {
15337 RC = &AMDGPU::VGPR_32RegClass;
15338 } else if (RegName.consume_front("s")) {
15339 RC = &AMDGPU::SGPR_32RegClass;
15340 } else if (RegName.consume_front("a")) {
15341 RC = &AMDGPU::AGPR_32RegClass;
15342 }
15343
15344 if (RC) {
15345 uint32_t Idx;
15346 if (RegName.consume_front("[")) {
15347 uint32_t End;
15348 bool Failed = RegName.consumeInteger(10, Idx);
15349 Failed |= !RegName.consume_front(":");
15350 Failed |= RegName.consumeInteger(10, End);
15351 Failed |= !RegName.consume_back("]");
15352 if (!Failed) {
15353 uint32_t Width = (End - Idx + 1) * 32;
15354 MCRegister Reg = RC->getRegister(Idx);
15355 if (SIRegisterInfo::isVGPRClass(RC))
15356 RC = TRI->getVGPRClassForBitWidth(Width);
15357 else if (SIRegisterInfo::isSGPRClass(RC))
15358 RC = TRI->getSGPRClassForBitWidth(Width);
15359 else if (SIRegisterInfo::isAGPRClass(RC))
15360 RC = TRI->getAGPRClassForBitWidth(Width);
15361 if (RC) {
15362 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15363 return std::pair(Reg, RC);
15364 }
15365 }
15366 } else {
15367 bool Failed = RegName.getAsInteger(10, Idx);
15368 if (!Failed && Idx < RC->getNumRegs())
15369 return std::pair(RC->getRegister(Idx), RC);
15370 }
15371 }
15372 }
15373
15374 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15375 if (Ret.first)
15376 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15377
15378 return Ret;
15379}
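
// Illustrative sketch (not part of this file): the two spellings of physical
// register constraints handled above, "{v5}" and "{v[0:3]}", parsed with the
// standard library only. The struct and helper names are made up.
#include <cstdio>
#include <optional>
#include <string>

struct RegConstraintSketch {
  char Bank;      // 'v', 's' or 'a'
  unsigned First; // first 32-bit register in the tuple
  unsigned Count; // number of 32-bit registers (Count * 32 bits wide)
};

static std::optional<RegConstraintSketch>
parseRegConstraintSketch(const std::string &C) {
  char Bank;
  unsigned First, Last;
  if (std::sscanf(C.c_str(), "{%c[%u:%u]}", &Bank, &First, &Last) == 3 &&
      Last >= First)
    return RegConstraintSketch{Bank, First, Last - First + 1};
  if (std::sscanf(C.c_str(), "{%c%u}", &Bank, &First) == 2)
    return RegConstraintSketch{Bank, First, 1};
  return std::nullopt;
}
// parseRegConstraintSketch("{v[0:3]}") yields bank 'v', first 0, count 4,
// i.e. a 128-bit VGPR tuple starting at v0.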
15380
15381static bool isImmConstraint(StringRef Constraint) {
15382 if (Constraint.size() == 1) {
15383 switch (Constraint[0]) {
15384 default: break;
15385 case 'I':
15386 case 'J':
15387 case 'A':
15388 case 'B':
15389 case 'C':
15390 return true;
15391 }
15392 } else if (Constraint == "DA" ||
15393 Constraint == "DB") {
15394 return true;
15395 }
15396 return false;
15397}
15398
15399SITargetLowering::ConstraintType
15400SITargetLowering::getConstraintType(StringRef Constraint) const {
15401 if (Constraint.size() == 1) {
15402 switch (Constraint[0]) {
15403 default: break;
15404 case 's':
15405 case 'v':
15406 case 'a':
15407 return C_RegisterClass;
15408 }
15409 }
15410 if (isImmConstraint(Constraint)) {
15411 return C_Other;
15412 }
15413 return TargetLowering::getConstraintType(Constraint);
15414}
15415
15416static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15418 Val = Val & maskTrailingOnes<uint64_t>(Size);
15419 }
15420 return Val;
15421}
15422
15423void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15424 StringRef Constraint,
15425 std::vector<SDValue> &Ops,
15426 SelectionDAG &DAG) const {
15427 if (isImmConstraint(Constraint)) {
15428 uint64_t Val;
15429 if (getAsmOperandConstVal(Op, Val) &&
15430 checkAsmConstraintVal(Op, Constraint, Val)) {
15431 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15432 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15433 }
15434 } else {
15435 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15436 }
15437}
15438
15439bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15440 unsigned Size = Op.getScalarValueSizeInBits();
15441 if (Size > 64)
15442 return false;
15443
15444 if (Size == 16 && !Subtarget->has16BitInsts())
15445 return false;
15446
15447 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15448 Val = C->getSExtValue();
15449 return true;
15450 }
15451 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15452 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15453 return true;
15454 }
15455 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15456 if (Size != 16 || Op.getNumOperands() != 2)
15457 return false;
15458 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15459 return false;
15460 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15461 Val = C->getSExtValue();
15462 return true;
15463 }
15464 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15465 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15466 return true;
15467 }
15468 }
15469
15470 return false;
15471}
15472
15473bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15474 uint64_t Val) const {
15475 if (Constraint.size() == 1) {
15476 switch (Constraint[0]) {
15477 case 'I':
15479 case 'J':
15480 return isInt<16>(Val);
15481 case 'A':
15482 return checkAsmConstraintValA(Op, Val);
15483 case 'B':
15484 return isInt<32>(Val);
15485 case 'C':
15486 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15488 default:
15489 break;
15490 }
15491 } else if (Constraint.size() == 2) {
15492 if (Constraint == "DA") {
15493 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15494 int64_t LoBits = static_cast<int32_t>(Val);
15495 return checkAsmConstraintValA(Op, HiBits, 32) &&
15496 checkAsmConstraintValA(Op, LoBits, 32);
15497 }
15498 if (Constraint == "DB") {
15499 return true;
15500 }
15501 }
15502 llvm_unreachable("Invalid asm constraint");
15503}
15504
15505bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15506 unsigned MaxSize) const {
15507 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15508 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15509 if (Size == 16) {
15510 MVT VT = Op.getSimpleValueType();
15511 switch (VT.SimpleTy) {
15512 default:
15513 return false;
15514 case MVT::i16:
15515 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15516 case MVT::f16:
15517 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15518 case MVT::bf16:
15519 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15520 case MVT::v2i16:
15521 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15522 case MVT::v2f16:
15523 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15524 case MVT::v2bf16:
15525 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15526 }
15527 }
15528 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15529 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15530 return true;
15531 return false;
15532}
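
// Illustrative sketch (not part of this file): a rough model of the 32-bit
// inline-immediate test behind the 'A' constraint, assuming the usual AMDGPU
// inline constant set (small integers, a few FP values, and 1/(2*pi) when the
// subtarget has it). This is an approximation for illustration only.
#include <cstdint>
#include <cstring>

static bool isInlineImm32Sketch(uint32_t Bits, bool HasInv2Pi) {
  int32_t S = static_cast<int32_t>(Bits);
  if (S >= -16 && S <= 64)           // inline integer constants
    return true;
  float F;
  std::memcpy(&F, &Bits, sizeof(F)); // reinterpret the bits as IEEE single
  if (F == 0.5f || F == -0.5f || F == 1.0f || F == -1.0f || F == 2.0f ||
      F == -2.0f || F == 4.0f || F == -4.0f)
    return true;
  return HasInv2Pi && Bits == 0x3E22F983u; // bit pattern of 1/(2*pi)
}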
15533
15534static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15535 switch (UnalignedClassID) {
15536 case AMDGPU::VReg_64RegClassID:
15537 return AMDGPU::VReg_64_Align2RegClassID;
15538 case AMDGPU::VReg_96RegClassID:
15539 return AMDGPU::VReg_96_Align2RegClassID;
15540 case AMDGPU::VReg_128RegClassID:
15541 return AMDGPU::VReg_128_Align2RegClassID;
15542 case AMDGPU::VReg_160RegClassID:
15543 return AMDGPU::VReg_160_Align2RegClassID;
15544 case AMDGPU::VReg_192RegClassID:
15545 return AMDGPU::VReg_192_Align2RegClassID;
15546 case AMDGPU::VReg_224RegClassID:
15547 return AMDGPU::VReg_224_Align2RegClassID;
15548 case AMDGPU::VReg_256RegClassID:
15549 return AMDGPU::VReg_256_Align2RegClassID;
15550 case AMDGPU::VReg_288RegClassID:
15551 return AMDGPU::VReg_288_Align2RegClassID;
15552 case AMDGPU::VReg_320RegClassID:
15553 return AMDGPU::VReg_320_Align2RegClassID;
15554 case AMDGPU::VReg_352RegClassID:
15555 return AMDGPU::VReg_352_Align2RegClassID;
15556 case AMDGPU::VReg_384RegClassID:
15557 return AMDGPU::VReg_384_Align2RegClassID;
15558 case AMDGPU::VReg_512RegClassID:
15559 return AMDGPU::VReg_512_Align2RegClassID;
15560 case AMDGPU::VReg_1024RegClassID:
15561 return AMDGPU::VReg_1024_Align2RegClassID;
15562 case AMDGPU::AReg_64RegClassID:
15563 return AMDGPU::AReg_64_Align2RegClassID;
15564 case AMDGPU::AReg_96RegClassID:
15565 return AMDGPU::AReg_96_Align2RegClassID;
15566 case AMDGPU::AReg_128RegClassID:
15567 return AMDGPU::AReg_128_Align2RegClassID;
15568 case AMDGPU::AReg_160RegClassID:
15569 return AMDGPU::AReg_160_Align2RegClassID;
15570 case AMDGPU::AReg_192RegClassID:
15571 return AMDGPU::AReg_192_Align2RegClassID;
15572 case AMDGPU::AReg_256RegClassID:
15573 return AMDGPU::AReg_256_Align2RegClassID;
15574 case AMDGPU::AReg_512RegClassID:
15575 return AMDGPU::AReg_512_Align2RegClassID;
15576 case AMDGPU::AReg_1024RegClassID:
15577 return AMDGPU::AReg_1024_Align2RegClassID;
15578 default:
15579 return -1;
15580 }
15581}
15582
15583// Figure out which registers should be reserved for stack access. Only after
15584// the function is legalized do we know all of the non-spill stack objects or if
15585// calls are present.
15586void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15587 MachineRegisterInfo &MRI = MF.getRegInfo();
15588 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15589 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15590 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15591 const SIInstrInfo *TII = ST.getInstrInfo();
15592
15593 if (Info->isEntryFunction()) {
15594 // Callable functions have fixed registers used for stack access.
15595 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15596 }
15597
15598 // TODO: Move this logic to getReservedRegs()
15599 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15600 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15601 Register SReg = ST.isWave32()
15602 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15603 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15604 &AMDGPU::SGPR_64RegClass);
15605 Info->setSGPRForEXECCopy(SReg);
15606
15607 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15608 Info->getStackPtrOffsetReg()));
15609 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15610 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15611
15612 // We need to worry about replacing the default register with itself in case
15613 // of MIR testcases missing the MFI.
15614 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15615 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15616
15617 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15618 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15619
15620 Info->limitOccupancy(MF);
15621
15622 if (ST.isWave32() && !MF.empty()) {
15623 for (auto &MBB : MF) {
15624 for (auto &MI : MBB) {
15625 TII->fixImplicitOperands(MI);
15626 }
15627 }
15628 }
15629
15630 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15631 // classes if required. Ideally the register class constraints would differ
15632 // per-subtarget, but there's no easy way to achieve that right now. This is
15633 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15634 // from using them as the register class for legal types.
15635 if (ST.needsAlignedVGPRs()) {
15636 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15637 const Register Reg = Register::index2VirtReg(I);
15638 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15639 if (!RC)
15640 continue;
15641 int NewClassID = getAlignedAGPRClassID(RC->getID());
15642 if (NewClassID != -1)
15643 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15644 }
15645 }
15646
15647 TargetLoweringBase::finalizeLowering(MF);
15648}
15649
15650void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15651 KnownBits &Known,
15652 const APInt &DemandedElts,
15653 const SelectionDAG &DAG,
15654 unsigned Depth) const {
15655 Known.resetAll();
15656 unsigned Opc = Op.getOpcode();
15657 switch (Opc) {
15658 case ISD::INTRINSIC_WO_CHAIN: {
15659 unsigned IID = Op.getConstantOperandVal(0);
15660 switch (IID) {
15661 case Intrinsic::amdgcn_mbcnt_lo:
15662 case Intrinsic::amdgcn_mbcnt_hi: {
15663 const GCNSubtarget &ST =
15664 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15665 // These return at most the (wavefront size - 1) + src1.
15666 // As long as src1 is an immediate we can calculate the known bits.
15667 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15668 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15669 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15670 // Cater for potential carry
15671 MaxActiveBits += Src1ValBits ? 1 : 0;
15672 unsigned Size = Op.getValueType().getSizeInBits();
15673 if (MaxActiveBits < Size)
15674 Known.Zero.setHighBits(Size - MaxActiveBits);
15675 return;
15676 }
15677 }
15678 break;
15679 }
15680 }
15681 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15682 Op, Known, DemandedElts, DAG, Depth);
15683}
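
// Illustrative sketch (not part of this file): the mbcnt known-bits argument
// above in plain arithmetic. mbcnt returns at most (wavefront size - 1) plus
// src1, so the result needs at most max(bits(src1), log2(wave size)) + 1 bits
// and everything above that is known zero. Names are hypothetical.
#include <algorithm>

static unsigned knownZeroHighBitsForMbcntSketch(unsigned Src1MaxActiveBits,
                                                unsigned WavefrontSizeLog2,
                                                unsigned ResultBits = 32) {
  unsigned MaxActiveBits = std::max(Src1MaxActiveBits, WavefrontSizeLog2);
  MaxActiveBits += Src1MaxActiveBits ? 1 : 0; // room for the carry
  return MaxActiveBits < ResultBits ? ResultBits - MaxActiveBits : 0;
}
// Wave64 with src1 == 0: the result is at most 63, so the top 26 bits of the
// 32-bit result are known to be zero.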
15684
15685void SITargetLowering::computeKnownBitsForFrameIndex(
15686 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15687 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15688
15689 // Set the high bits to zero based on the maximum allowed scratch size per
15690 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15691 // calculation won't overflow, so assume the sign bit is never set.
15692 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15693}
15694
15695static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15696 KnownBits &Known, unsigned Dim) {
15697 unsigned MaxValue =
15698 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15699 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15700}
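
// Illustrative sketch (not part of this file): the countl_zero reasoning above
// on concrete numbers. With the common 1024-thread workgroup limit the maximum
// workitem id is 1023, so countl_zero(1023u) == 22 high bits of the 32-bit id
// are known to be zero. The helper name is made up.
#include <bit>
#include <cstdint>

static unsigned knownZeroHighBitsForWorkitemIDSketch(uint32_t MaxWorkitemID) {
  return std::countl_zero(MaxWorkitemID); // e.g. 1023 -> 22
}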
15701
15702void SITargetLowering::computeKnownBitsForTargetInstr(
15703 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15704 const MachineRegisterInfo &MRI, unsigned Depth) const {
15705 const MachineInstr *MI = MRI.getVRegDef(R);
15706 switch (MI->getOpcode()) {
15707 case AMDGPU::G_INTRINSIC:
15708 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15709 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15710 case Intrinsic::amdgcn_workitem_id_x:
15711 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15712 break;
15713 case Intrinsic::amdgcn_workitem_id_y:
15714 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15715 break;
15716 case Intrinsic::amdgcn_workitem_id_z:
15717 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15718 break;
15719 case Intrinsic::amdgcn_mbcnt_lo:
15720 case Intrinsic::amdgcn_mbcnt_hi: {
15721 // These return at most the wavefront size - 1.
15722 unsigned Size = MRI.getType(R).getSizeInBits();
15723 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15724 break;
15725 }
15726 case Intrinsic::amdgcn_groupstaticsize: {
15727 // We can report everything over the maximum size as 0. We can't report
15728 // based on the actual size because we don't know if it's accurate or not
15729 // at any given point.
15730 Known.Zero.setHighBits(
15731 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15732 break;
15733 }
15734 }
15735 break;
15736 }
15737 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15738 Known.Zero.setHighBits(24);
15739 break;
15740 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15741 Known.Zero.setHighBits(16);
15742 break;
15743 case AMDGPU::G_AMDGPU_SMED3:
15744 case AMDGPU::G_AMDGPU_UMED3: {
15745 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15746
15747 KnownBits Known2;
15748 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15749 if (Known2.isUnknown())
15750 break;
15751
15752 KnownBits Known1;
15753 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15754 if (Known1.isUnknown())
15755 break;
15756
15757 KnownBits Known0;
15758 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15759 if (Known0.isUnknown())
15760 break;
15761
15762 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15763 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15764 Known.One = Known0.One & Known1.One & Known2.One;
15765 break;
15766 }
15767 }
15768}
15769
15772 unsigned Depth) const {
15773 const MachineInstr *MI = MRI.getVRegDef(R);
15774 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15775 // FIXME: Can this move to generic code? What about the case where the call
15776 // site specifies a lower alignment?
15777 Intrinsic::ID IID = GI->getIntrinsicID();
15779 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15780 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15781 return *RetAlign;
15782 }
15783 return Align(1);
15784}
15785
15786Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15787 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15788 const Align CacheLineAlign = Align(64);
15789
15790 // Pre-GFX10 targets did not benefit from loop alignment.
15791 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15792 getSubtarget()->hasInstFwdPrefetchBug())
15793 return PrefAlign;
15794
15795 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
15796 // By default the prefetcher keeps one cache line behind and reads two ahead.
15797 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15798 // behind and one ahead.
15799 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15800 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15801 // does not need alignment.
15802 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch;
15803 // else, if the loop is at most 192 bytes, we need two lines behind.
15804
15805 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15806 const MachineBasicBlock *Header = ML->getHeader();
15807 if (Header->getAlignment() != PrefAlign)
15808 return Header->getAlignment(); // Already processed.
15809
15810 unsigned LoopSize = 0;
15811 for (const MachineBasicBlock *MBB : ML->blocks()) {
15812 // If an inner loop block is aligned, assume on average half of the alignment
15813 // size is added as nops.
15814 if (MBB != Header)
15815 LoopSize += MBB->getAlignment().value() / 2;
15816
15817 for (const MachineInstr &MI : *MBB) {
15818 LoopSize += TII->getInstSizeInBytes(MI);
15819 if (LoopSize > 192)
15820 return PrefAlign;
15821 }
15822 }
15823
15824 if (LoopSize <= 64)
15825 return PrefAlign;
15826
15827 if (LoopSize <= 128)
15828 return CacheLineAlign;
15829
15830 // If any of the parent loops is surrounded by prefetch instructions, do not
15831 // insert new ones for the inner loop, as that would reset the parent's settings.
15832 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15833 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15834 auto I = Exit->getFirstNonDebugInstr();
15835 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15836 return CacheLineAlign;
15837 }
15838 }
15839
15840 MachineBasicBlock *Pre = ML->getLoopPreheader();
15841 MachineBasicBlock *Exit = ML->getExitBlock();
15842
15843 if (Pre && Exit) {
15844 auto PreTerm = Pre->getFirstTerminator();
15845 if (PreTerm == Pre->begin() ||
15846 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15847 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15848 .addImm(1); // prefetch 2 lines behind PC
15849
15850 auto ExitHead = Exit->getFirstNonDebugInstr();
15851 if (ExitHead == Exit->end() ||
15852 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15853 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15854 .addImm(2); // prefetch 1 line behind PC
15855 }
15856
15857 return CacheLineAlign;
15858}
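
// Illustrative sketch (not part of this file): the loop-size thresholds used
// above, reduced to plain numbers. "AdjustPrefetch" stands for bracketing the
// loop with S_INST_PREFETCH as done in the preheader/exit blocks; the struct
// and names are made up for the example.
struct LoopAlignChoiceSketch {
  unsigned AlignBytes; // requested loop-header alignment
  bool AdjustPrefetch; // switch to two lines behind / one ahead
};

static LoopAlignChoiceSketch chooseLoopAlignmentSketch(unsigned LoopSizeBytes,
                                                       unsigned DefaultAlign) {
  if (LoopSizeBytes <= 64 || LoopSizeBytes > 192)
    return {DefaultAlign, false}; // tiny loops don't need it; big loops don't fit
  if (LoopSizeBytes <= 128)
    return {64, false};           // cache-line align, default prefetch suffices
  return {64, true};              // up to 192 bytes: also keep two lines behind
}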
15859
15861static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15862 assert(N->getOpcode() == ISD::CopyFromReg);
15863 do {
15864 // Follow the chain until we find an INLINEASM node.
15865 N = N->getOperand(0).getNode();
15866 if (N->getOpcode() == ISD::INLINEASM ||
15867 N->getOpcode() == ISD::INLINEASM_BR)
15868 return true;
15869 } while (N->getOpcode() == ISD::CopyFromReg);
15870 return false;
15871}
15872
15873bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15874 FunctionLoweringInfo *FLI,
15875 UniformityInfo *UA) const {
15876 switch (N->getOpcode()) {
15877 case ISD::CopyFromReg: {
15878 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15879 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15880 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15881 Register Reg = R->getReg();
15882
15883 // FIXME: Why does this need to consider isLiveIn?
15884 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15885 return !TRI->isSGPRReg(MRI, Reg);
15886
15887 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15888 return UA->isDivergent(V);
15889
15891 return !TRI->isSGPRReg(MRI, Reg);
15892 }
15893 case ISD::LOAD: {
15894 const LoadSDNode *L = cast<LoadSDNode>(N);
15895 unsigned AS = L->getAddressSpace();
15896 // A flat load may access private memory.
15898 }
15899 case ISD::CALLSEQ_END:
15900 return true;
15901 case ISD::INTRINSIC_WO_CHAIN:
15902 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15903 case ISD::INTRINSIC_W_CHAIN:
15904 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15926 // Target-specific read-modify-write atomics are sources of divergence.
15927 return true;
15928 default:
15929 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
15930 // Generic read-modify-write atomics are sources of divergence.
15931 return A->readMem() && A->writeMem();
15932 }
15933 return false;
15934 }
15935}
15936
15937bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
15938 EVT VT) const {
15939 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
15940 case MVT::f32:
15941 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
15942 case MVT::f64:
15943 case MVT::f16:
15944 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
15945 default:
15946 return false;
15947 }
15948}
15949
15950bool SITargetLowering::denormalsEnabledForType(
15951 LLT Ty, const MachineFunction &MF) const {
15952 switch (Ty.getScalarSizeInBits()) {
15953 case 32:
15954 return !denormalModeIsFlushAllF32(MF);
15955 case 64:
15956 case 16:
15957 return !denormalModeIsFlushAllF64F16(MF);
15958 default:
15959 return false;
15960 }
15961}
15962
15963bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
15964 const SelectionDAG &DAG,
15965 bool SNaN,
15966 unsigned Depth) const {
15967 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
15968 const MachineFunction &MF = DAG.getMachineFunction();
15970
15971 if (Info->getMode().DX10Clamp)
15972 return true; // Clamped to 0.
15973 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
15974 }
15975
15976 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
15977 SNaN, Depth);
15978}
15979
15980// Global FP atomic instructions have a hardcoded FP mode: they do not support
15981// FP32 denormals and only support v2f16 denormals.
15982static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
15983 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
15984 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
15985 if (&Flt == &APFloat::IEEEsingle())
15986 return DenormMode == DenormalMode::getPreserveSign();
15987 return DenormMode == DenormalMode::getIEEE();
15988}
15989
15990// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
15991// floating point atomic instructions. May generate more efficient code,
15992// but may not respect rounding and denormal modes, and may give incorrect
15993// results for certain memory destinations.
15994bool unsafeFPAtomicsDisabled(Function *F) {
15995 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
15996 "true";
15997}
15998
15999static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16000 LLVMContext &Ctx = RMW->getContext();
16001 SmallVector<StringRef> SSNs;
16002 Ctx.getSyncScopeNames(SSNs);
16003 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16004 ? "system"
16005 : SSNs[RMW->getSyncScopeID()];
16006
16007 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16008 << "Hardware instruction generated for atomic "
16009 << RMW->getOperationName(RMW->getOperation())
16010 << " operation at memory scope " << MemScope;
16011}
16012
16013TargetLowering::AtomicExpansionKind
16014SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16015 unsigned AS = RMW->getPointerAddressSpace();
16016 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16018
16019 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16021 ORE.emit([=]() {
16022 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16023 });
16024 return Kind;
16025 };
16026
16027 auto SSID = RMW->getSyncScopeID();
16028 bool HasSystemScope =
16029 SSID == SyncScope::System ||
16030 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16031
16032 switch (RMW->getOperation()) {
16033 case AtomicRMWInst::Sub:
16034 case AtomicRMWInst::Or:
16035 case AtomicRMWInst::Xor: {
16036 // Atomic sub/or/xor do not work over PCI express, but atomic add does.
16037 // InstCombine transforms these operations with 0 into or, so undo that.
16038 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16039 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16040 ConstVal && ConstVal->isNullValue())
16042 }
16043
16044 break;
16045 }
16046 case AtomicRMWInst::FAdd: {
16047 Type *Ty = RMW->getType();
16048
16049 if (Ty->isHalfTy())
16051
16052 if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
16054
16057 Subtarget->hasAtomicFaddNoRtnInsts()) {
16058 if (Subtarget->hasGFX940Insts())
16060
16063
16064 // Always expand system scope fp atomics.
16065 if (HasSystemScope)
16067
16070 Ty->isFloatTy()) {
16071 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16072 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16073 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16074 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16075 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16076 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16077 }
16078
16079 // flat atomic fadd f32: gfx940, gfx11+.
16080 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16081 Subtarget->hasFlatAtomicFaddF32Inst())
16082 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16083
16084 // global and flat atomic fadd f64: gfx90a, gfx940.
16085 if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
16086 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16087
16088 // If it is in the flat address space and the type is float, we will try to
16089 // expand it if the target supports global and LDS atomic fadd. The
16090 // reason we need this is that the expansion emits an address-space check:
16091 // if the address is in the global address space, we emit the global atomic
16092 // fadd; if it is in the shared address space, we emit the LDS atomic fadd.
16093 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16094 Subtarget->hasLDSFPAtomicAdd()) {
16095 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16097 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16099 }
16100
16102 }
16103
16104 // DS FP atomics do respect the denormal mode, but the rounding mode is
16105 // fixed to round-to-nearest-even.
16106 // The only exception is DS_ADD_F64 which never flushes regardless of mode.
16107 if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
16108 if (!Ty->isDoubleTy())
16110
16113
16114 return RMW->getFunction()
16115 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16116 .getValueAsString() == "true"
16117 ? ReportUnsafeHWInst(AtomicExpansionKind::None)
16119 }
16120
16122 }
16125 case AtomicRMWInst::Min:
16126 case AtomicRMWInst::Max:
16128 case AtomicRMWInst::UMax: {
16131 if (RMW->getType()->isFloatTy() &&
16134
16135 // Always expand system scope min/max atomics.
16136 if (HasSystemScope)
16138 }
16139 break;
16140 }
16141 default:
16142 break;
16143 }
16144
16146}
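
// Illustrative sketch (not part of this file): the function attribute that the
// unsafe-FP-atomic paths above key off of. Setting it on a function opts that
// function into the hardware instructions reported by ReportUnsafeHWInst; the
// helper itself is made up for the example.
#include "llvm/IR/Function.h"

static void allowUnsafeFPAtomicsSketch(llvm::Function &F) {
  F.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
}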
16147
16153}
16154
16157 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16160}
16161
16167}
16168
16169const TargetRegisterClass *
16170SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16172 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16173 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16174 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16175 : &AMDGPU::SReg_32RegClass;
16176 if (!TRI->isSGPRClass(RC) && !isDivergent)
16177 return TRI->getEquivalentSGPRClass(RC);
16178 else if (TRI->isSGPRClass(RC) && isDivergent)
16179 return TRI->getEquivalentVGPRClass(RC);
16180
16181 return RC;
16182}
16183
16184// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16185// uniform values (as produced by the mask results of control flow intrinsics)
16186// used outside of divergent blocks. The phi users need to also be treated as
16187// always uniform.
16188//
16189// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16190static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16191 unsigned WaveSize) {
16192 // FIXME: We assume we never cast the mask results of a control flow
16193 // intrinsic.
16194 // Early exit if the type won't be consistent as a compile time hack.
16195 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16196 if (!IT || IT->getBitWidth() != WaveSize)
16197 return false;
16198
16199 if (!isa<Instruction>(V))
16200 return false;
16201 if (!Visited.insert(V).second)
16202 return false;
16203 bool Result = false;
16204 for (const auto *U : V->users()) {
16205 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16206 if (V == U->getOperand(1)) {
16207 switch (Intrinsic->getIntrinsicID()) {
16208 default:
16209 Result = false;
16210 break;
16211 case Intrinsic::amdgcn_if_break:
16212 case Intrinsic::amdgcn_if:
16213 case Intrinsic::amdgcn_else:
16214 Result = true;
16215 break;
16216 }
16217 }
16218 if (V == U->getOperand(0)) {
16219 switch (Intrinsic->getIntrinsicID()) {
16220 default:
16221 Result = false;
16222 break;
16223 case Intrinsic::amdgcn_end_cf:
16224 case Intrinsic::amdgcn_loop:
16225 Result = true;
16226 break;
16227 }
16228 }
16229 } else {
16230 Result = hasCFUser(U, Visited, WaveSize);
16231 }
16232 if (Result)
16233 break;
16234 }
16235 return Result;
16236}
16237
16239 const Value *V) const {
16240 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16241 if (CI->isInlineAsm()) {
16242 // FIXME: This cannot give a correct answer. This should only trigger in
16243 // the case where inline asm returns mixed SGPR and VGPR results, used
16244 // outside the defining block. We don't have a specific result to
16245 // consider, so this assumes that if any value is an SGPR, the overall
16246 // register also needs to be an SGPR.
16247 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16249 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16250 for (auto &TC : TargetConstraints) {
16251 if (TC.Type == InlineAsm::isOutput) {
16254 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16255 if (RC && SIRI->isSGPRClass(RC))
16256 return true;
16257 }
16258 }
16259 }
16260 }
16262 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16263}
16264
16265bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16266 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16267 for (; I != E; ++I) {
16268 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16269 if (getBasePtrIndex(M) == I.getOperandNo())
16270 return true;
16271 }
16272 }
16273 return false;
16274}
16275
16276bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16277 SDValue N1) const {
16278 if (!N0.hasOneUse())
16279 return false;
16280 // Take the opportunity to keep N0 uniform.
16281 if (N0->isDivergent() || !N1->isDivergent())
16282 return true;
16283 // Check if we have a good chance to form the memory access pattern with the
16284 // base and offset
16285 return (DAG.isBaseWithConstantOffset(N0) &&
16286 hasMemSDNodeUser(*N0->use_begin()));
16287}
16288
16289bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16290 Register N0, Register N1) const {
16291 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16292}
16293
16294MachineMemOperand::Flags
16295SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16296 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16297 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16298 if (I.getMetadata("amdgpu.noclobber"))
16299 Flags |= MONoClobber;
16300 if (I.getMetadata("amdgpu.last.use"))
16301 Flags |= MOLastUse;
16302 return Flags;
16303}
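
// Illustrative sketch (not part of this file): how an IR pass such as
// AMDGPUAnnotateUniformValues would tag a load so the flag propagation above
// picks it up. The metadata payload is an empty node; only its presence
// matters. The helper name is made up.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

static void markNoClobberSketch(llvm::LoadInst &LI) {
  llvm::LLVMContext &Ctx = LI.getContext();
  LI.setMetadata("amdgpu.noclobber", llvm::MDNode::get(Ctx, {}));
}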
16304
16305bool SITargetLowering::checkForPhysRegDependency(
16306 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16307 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16308 if (User->getOpcode() != ISD::CopyToReg)
16309 return false;
16310 if (!Def->isMachineOpcode())
16311 return false;
16312 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16313 if (!MDef)
16314 return false;
16315
16316 unsigned ResNo = User->getOperand(Op).getResNo();
16317 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16318 return false;
16319 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16320 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16321 PhysReg = AMDGPU::SCC;
16322 const TargetRegisterClass *RC =
16323 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16324 Cost = RC->getCopyCost();
16325 return true;
16326 }
16327 return false;
16328}
16329
16330void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16331 AtomicRMWInst::BinOp Op = AI->getOperation();
16332
16333 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16334 Op == AtomicRMWInst::Xor) {
16335 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16336 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16337 "this cannot be replaced with add");
16338 AI->setOperation(AtomicRMWInst::Add);
16339 return;
16340 }
16341
16342 assert(Subtarget->hasAtomicFaddInsts() &&
16343 "target should have atomic fadd instructions");
16344 assert(AI->getType()->isFloatTy() &&
16346 "generic atomicrmw expansion only supports FP32 operand in flat "
16347 "address space");
16348 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16349
16350 // Given: atomicrmw fadd ptr %addr, float %val ordering
16351 //
16352 // With this expansion we produce the following code:
16353 // [...]
16354 // br label %atomicrmw.check.shared
16355 //
16356 // atomicrmw.check.shared:
16357 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16358 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16359 //
16360 // atomicrmw.shared:
16361 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16362 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16363 // float %val ordering
16364 // br label %atomicrmw.phi
16365 //
16366 // atomicrmw.check.private:
16367 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16368 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16369 //
16370 // atomicrmw.private:
16371 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16372 // %loaded.private = load float, ptr addrspace(5) %cast.private
16373 // %val.new = fadd float %loaded.private, %val
16374 // store float %val.new, ptr addrspace(5) %cast.private
16375 // br label %atomicrmw.phi
16376 //
16377 // atomicrmw.global:
16378 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16379 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16380 // float %val ordering
16381 // br label %atomicrmw.phi
16382 //
16383 // atomicrmw.phi:
16384 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16385 // [ %loaded.private, %atomicrmw.private ],
16386 // [ %loaded.global, %atomicrmw.global ]
16387 // br label %atomicrmw.end
16388 //
16389 // atomicrmw.end:
16390 // [...]
16391
16392 IRBuilder<> Builder(AI);
16393 LLVMContext &Ctx = Builder.getContext();
16394
16395 BasicBlock *BB = Builder.GetInsertBlock();
16396 Function *F = BB->getParent();
16397 BasicBlock *ExitBB =
16398 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16399 BasicBlock *CheckSharedBB =
16400 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16401 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16402 BasicBlock *CheckPrivateBB =
16403 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16404 BasicBlock *PrivateBB =
16405 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16406 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16407 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16408
16409 Value *Val = AI->getValOperand();
16410 Type *ValTy = Val->getType();
16411 Value *Addr = AI->getPointerOperand();
16412
16413 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16414 Value *Val) -> Value * {
16415 AtomicRMWInst *OldVal =
16416 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16417 AI->getOrdering(), AI->getSyncScopeID());
16419 AI->getAllMetadata(MDs);
16420 for (auto &P : MDs)
16421 OldVal->setMetadata(P.first, P.second);
16422 return OldVal;
16423 };
16424
16425 std::prev(BB->end())->eraseFromParent();
16426 Builder.SetInsertPoint(BB);
16427 Builder.CreateBr(CheckSharedBB);
16428
16429 Builder.SetInsertPoint(CheckSharedBB);
16430 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16431 {Addr}, nullptr, "is.shared");
16432 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16433
16434 Builder.SetInsertPoint(SharedBB);
16435 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16437 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16438 Builder.CreateBr(PhiBB);
16439
16440 Builder.SetInsertPoint(CheckPrivateBB);
16441 CallInst *IsPrivate = Builder.CreateIntrinsic(
16442 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16443 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16444
16445 Builder.SetInsertPoint(PrivateBB);
16446 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16448 Value *LoadedPrivate =
16449 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16450 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16451 Builder.CreateStore(NewVal, CastToPrivate);
16452 Builder.CreateBr(PhiBB);
16453
16454 Builder.SetInsertPoint(GlobalBB);
16455 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16457 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16458 Builder.CreateBr(PhiBB);
16459
16460 Builder.SetInsertPoint(PhiBB);
16461 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16462 Loaded->addIncoming(LoadedShared, SharedBB);
16463 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16464 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16465 Builder.CreateBr(ExitBB);
16466
16467 AI->replaceAllUsesWith(Loaded);
16468 AI->eraseFromParent();
16469}
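
// Illustrative sketch (not part of this file): building the kind of operation
// the expansion above rewrites - an atomicrmw fadd on a generic (flat,
// address space 0) pointer. The helper is hypothetical; it only shows the
// IRBuilder call that produces such an instruction.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

static llvm::AtomicRMWInst *emitFlatFAddSketch(llvm::IRBuilder<> &B,
                                               llvm::Value *FlatPtr,
                                               llvm::Value *FloatVal) {
  return B.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, FlatPtr, FloatVal,
                           llvm::MaybeAlign(4),
                           llvm::AtomicOrdering::SequentiallyConsistent);
}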
16470
16471LoadInst *
16472SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16473 IRBuilder<> Builder(AI);
16474 auto Order = AI->getOrdering();
16475
16476 // The optimization removes the store aspect of the atomicrmw. Therefore, the
16477 // cache must be flushed if the atomic ordering had release semantics. This
16478 // does not necessarily require a fence; a release fence just happens to do
16479 // that flush. Avoid replacing an atomicrmw that has release semantics.
16480 if (isReleaseOrStronger(Order))
16481 return nullptr;
16482
16483 LoadInst *LI = Builder.CreateAlignedLoad(
16484 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16485 LI->setAtomic(Order, AI->getSyncScopeID());
16486 LI->copyMetadata(*AI);
16487 LI->takeName(AI);
16488 AI->replaceAllUsesWith(LI);
16489 AI->eraseFromParent();
16490 return LI;
16491}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:203
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1174
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1171
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which need to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
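As a quick illustration of the STATISTIC macro indexed above (a hedged sketch; the pass name and counter name are hypothetical), a counter is declared once at file scope after DEBUG_TYPE is defined and then incremented like an integer:
  #include "llvm/ADT/Statistic.h"
  #define DEBUG_TYPE "my-pass"                          // hypothetical debug type
  STATISTIC(NumNodesFolded, "Number of nodes folded");  // hypothetical counter
  static void noteFold() { ++NumNodesFolded; }          // counters behave like integers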
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:988
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
bool isNegative() const
Definition: APFloat.h:1295
APInt bitcastToAPInt() const
Definition: APFloat.h:1210
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
bool isInfinity() const
Definition: APFloat.h:1292
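A minimal sketch of the APFloat factory methods listed above (illustrative only; not code from this file):
  #include "llvm/ADT/APFloat.h"
  #include <cassert>
  using namespace llvm;
  APFloat QNaN   = APFloat::getQNaN(APFloat::IEEEsingle());
  APFloat NegInf = APFloat::getInf(APFloat::IEEEsingle(), /*Negative=*/true);
  APFloat Big    = APFloat::getLargest(APFloat::IEEEsingle());
  assert(NegInf.isInfinity() && NegInf.isNegative());
  APInt Bits = NegInf.bitcastToAPInt();   // 0xFF800000 for -inf in IEEE single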
Class for arbitrary precision integers.
Definition: APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value equals the value returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
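The APInt helpers above compose as in this small illustrative sketch (not code from this file):
  #include "llvm/ADT/APInt.h"
  #include <cassert>
  using namespace llvm;
  APInt HighByte = APInt::getHighBitsSet(32, 8);   // 0xFF000000
  APInt Block    = APInt::getBitsSet(32, 4, 12);   // bits [4, 12) set: 0x00000FF0
  assert(Block.countr_zero() == 4);
  APInt Sign(32, 0x80000000u);
  assert(Sign.isSignMask());
  assert(HighByte.uge(Block));                     // unsigned comparison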
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:684
An instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
Value * getPointerOperand()
Definition: Instructions.h:910
void setOperation(BinOp Operation)
Definition: Instructions.h:861
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:918
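Tying the AtomicRMWInst pieces above together, a hedged sketch of creating and inspecting an atomicrmw through IRBuilder (B, Ptr, and Val are assumed to exist in the caller; see the IRBuilder entries further below):
  // Assumes: IRBuilder<> B; Value *Ptr; Value *Val; all hypothetical here.
  AtomicRMWInst *RMW = B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val,
                                         MaybeAlign(4),
                                         AtomicOrdering::SequentiallyConsistent);
  AtomicRMWInst::BinOp Op = RMW->getOperation();        // AtomicRMWInst::FAdd
  unsigned AS = RMW->getPointerAddressSpace();
  StringRef Name = AtomicRMWInst::getOperationName(Op); // "fadd"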
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:570
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
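A minimal sketch of the usual CCState/CCValAssign flow behind the entries above (CallConv, IsVarArg, MF, Ins, and AssignFn are assumed to be in scope; this is not code from this file):
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);      // fills ArgLocs
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();        // argument was assigned a register
    else if (VA.isMemLoc())
      (void)VA.getLocMemOffset();  // argument lives at a stack offset
  }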
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1828
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
bool isSigned() const
Definition: InstrTypes.h:1265
bool isFPPredicate() const
Definition: InstrTypes.h:1122
bool isIntPredicate() const
Definition: InstrTypes.h:1123
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:41
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
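For the DataLayout queries above, a brief illustration (DL and Ty are assumed to be in scope):
  Align ABIAlign     = DL.getABITypeAlign(Ty);    // minimum ABI-required alignment
  TypeSize AllocSize = DL.getTypeAllocSize(Ty);   // size including tail padding
  bool BigEndian     = DL.isBigEndian();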
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:201
iterator_range< arg_iterator > args()
Definition: Function.h:838
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:263
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:738
bool hasPrefetch() const
Definition: GCNSubtarget.h:890
bool hasD16Images() const
Definition: GCNSubtarget.h:685
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:463
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:454
bool hasDot7Insts() const
Definition: GCNSubtarget.h:784
bool hasApertureRegs() const
Definition: GCNSubtarget.h:583
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:613
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:754
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:397
bool hasMAIInsts() const
Definition: GCNSubtarget.h:804
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:665
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:513
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:571
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:252
bool hasDot1Insts() const
Definition: GCNSubtarget.h:760
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:826
Align getStackAlignment() const
Definition: GCNSubtarget.h:903
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:441
bool enableFlatScratch() const
Definition: GCNSubtarget.h:638
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:609
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:447
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:842
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:264
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:730
bool useDS128() const
Definition: GCNSubtarget.h:523
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:443
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:256
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:575
bool hasLDSFPAtomicAdd() const
Definition: GCNSubtarget.h:963
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:413
bool hasIntClamp() const
Definition: GCNSubtarget.h:343
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:993
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:363
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:587
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:617
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:916
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:719
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:322
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:870
bool hasFFBL() const
Definition: GCNSubtarget.h:401
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:935
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:545
bool hasMed3_16() const
Definition: GCNSubtarget.h:409
bool hasMovrel() const
Definition: GCNSubtarget.h:939
bool hasBFI() const
Definition: GCNSubtarget.h:389
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:563
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:330
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:508
bool hasFFBH() const
Definition: GCNSubtarget.h:405
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:822
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:828
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:953
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:533
bool hasDot8Insts() const
Definition: GCNSubtarget.h:788
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:528
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:517
Generation getGeneration() const
Definition: GCNSubtarget.h:303
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:717
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:721
bool hasAddr64() const
Definition: GCNSubtarget.h:367
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:417
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:713
bool hasFractBug() const
Definition: GCNSubtarget.h:381
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:385
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:700
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
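The GCNSubtarget predicates above are typically consulted before choosing a lowering; a hedged, illustrative pattern (Subtarget is assumed to be a const GCNSubtarget *):
  if (Subtarget->hasMad64_32() && !Subtarget->hasFullRate64Ops()) {
    // Prefer the 32-bit multiply-add form on this subtarget (illustrative branch only).
  }
  unsigned WaveSize = Subtarget->getWavefrontSize();   // 32 or 64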
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1533
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1120
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
LLVMContext & getContext() const
Definition: IRBuilder.h:176
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1854
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1114
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
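A small, self-contained sketch of the IRBuilder control-flow helpers listed above (Ctx, F, EntryBB, and Cond are assumed; illustrative only):
  IRBuilder<> B(Ctx);
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "loop", F);
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
  B.SetInsertPoint(EntryBB);
  B.CreateBr(LoopBB);
  B.SetInsertPoint(LoopBB);
  PHINode *IV = B.CreatePHI(B.getInt32Ty(), /*NumReservedValues=*/2, "iv");
  IV->addIncoming(B.getInt32(0), EntryBB);
  Value *Next = B.CreateAdd(IV, B.getInt32(1), "iv.next");
  IV->addIncoming(Next, LoopBB);
  B.CreateCondBr(Cond, LoopBB, ExitBB);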
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:341
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:86
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1636
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:377
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
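An illustrative use of the LLT constructors and queries above (not from this file):
  LLT S32 = LLT::scalar(32);
  LLT P1  = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
  LLT S16 = S32.changeElementSize(16);   // scalar in, scalar out: s16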
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
bool isCompare() const
Return true if this instruction is a comparison.
Definition: MCInstrDesc.h:341
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Definition: MCInstrDesc.cpp:32
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
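A brief illustration of the MVT helpers above:
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
  assert(V4I32.getScalarType() == MVT::i32);
  MVT I16 = MVT::getIntegerVT(16);
  assert(I16.isScalarInteger() && I16.getScalarSizeInBits() == 16);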
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
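A hedged sketch of the block-splitting and CFG-rewiring pattern these MachineBasicBlock helpers support, mirroring the splitBlockForLoop-style helpers indexed earlier (MBB and the split point I are assumed):
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *LoopBB      = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MF->insert(std::next(MBB.getIterator()), LoopBB);
  MF->insert(std::next(LoopBB->getIterator()), RemainderBB);
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  MBB.addSuccessor(LoopBB);
  LoopBB->addSuccessor(LoopBB);          // back edge
  LoopBB->addSuccessor(RemainderBB);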
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
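These MachineInstrBuilder helpers are normally chained off BuildMI; a hedged sketch (MBB, MI, DL, TII, DstReg, and TargetMBB are assumed to be in scope):
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);                           // materialize a constant
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
      .addMBB(TargetMBB);                   // unconditional branch to TargetMBB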
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
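Putting the MachineMemOperand flags above together with the MachineFunction::getMachineMemOperand factory indexed earlier, a hedged sketch (MF and PtrInfo are assumed):
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));
  assert(MMO->getFlags() & MachineMemOperand::MOLoad);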
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:721
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:953
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:550
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
Definition: SelectionDAG.h:469
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
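A minimal sketch (not taken from the LLVM sources) of how getSetCC, getSelect, and getConstant compose inside a lowering helper; the function name and the max-with-zero pattern are invented for illustration, and the setcc result type is obtained from the target as the entries above suggest.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: emit max(X, 0) for an integer value X that already
// lives in the DAG being lowered.
static SDValue emitMaxWithZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  // The setcc result type depends on the target's boolean contents.
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsNeg = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETLT);
  return DAG.getSelect(DL, VT, IsNeg, Zero, X);
}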
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
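An illustrative sketch of the split-then-recombine idiom that SplitVectorOperand supports; the helper name and the choice of a unary opcode are assumptions made for the example, not code from this file.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: apply a unary opcode to each half of N's first operand
// and glue the halves back together with CONCAT_VECTORS.
static SDValue splitUnaryVectorOp(SelectionDAG &DAG, SDNode *N, unsigned Opc) {
  SDLoc DL(N);
  auto [Lo, Hi] = DAG.SplitVectorOperand(N, 0);
  SDValue LoRes = DAG.getNode(Opc, DL, Lo.getValueType(), Lo);
  SDValue HiRes = DAG.getNode(Opc, DL, Hi.getValueType(), Hi);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), LoRes, HiRes);
}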
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:477
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:827
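A small sketch of getBuildVector combined with getConstant (both listed above); the helper name and the {0, 1, 2, 3} contents are arbitrary and only show the calling pattern.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative helper: build the <4 x i32> vector {0, 1, 2, 3} from scalar
// constants.
static SDValue buildIotaV4I32(SelectionDAG &DAG, const SDLoc &DL) {
  SmallVector<SDValue, 4> Elts;
  for (uint64_t I = 0; I != 4; ++I)
    Elts.push_back(DAG.getConstant(I, DL, MVT::i32));
  return DAG.getBuildVector(MVT::v4i32, DL, Elts);
}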
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:471
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
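A hedged sketch of chaining getLoad and getStore (signatures as listed above); Chain, Ptr, and PtrInfo are assumed to come from surrounding lowering code, and the alignment is chosen arbitrarily for the example.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: reload an i32 through Ptr and store it back, ordering
// the store after the load via the load's output chain.
static SDValue reloadAndStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                              SDValue Ptr, MachinePointerInfo PtrInfo) {
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo, Align(4));
  // Load.getValue(1) is the load's output chain.
  return DAG.getStore(Load.getValue(1), DL, Load, Ptr, PtrInfo, Align(4));
}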
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:472
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:772
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:675
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:468
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:798
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:844
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
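A short sketch of MaskedValueIsZero in the kind of fold it is typically used for; the helper name and the 0xFF mask are invented for illustration.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical query: would (and X, 0xFF) be a no-op for a 32-bit X?
// It is whenever the upper 24 bits are already known to be zero.
static bool andWithFFIsNoop(const SelectionDAG &DAG, SDValue X) {
  APInt UpperBits = APInt::getHighBitsSet(32, 24);
  return DAG.MaskedValueIsZero(X, UpperBits);
}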
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:484
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:559
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:553
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
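A minimal sketch that pairs SplitScalar with getMergeValues (both documented above); the helper is illustrative only and assumes V is a 64-bit scalar.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: split a 64-bit scalar into its i32 halves and return
// both pieces as the results of a single MERGE_VALUES node.
static SDValue splitI64IntoHalves(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  auto [Lo, Hi] = DAG.SplitScalar(V, DL, MVT::i32, MVT::i32);
  return DAG.getMergeValues({Lo, Hi}, DL);
}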
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
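A one-function sketch of the insert() contract listed above; the helper name is hypothetical.

#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;

// The second member of the returned pair is true only when the pointer was
// not already present, so this reports "first visit".
static bool markVisited(SmallPtrSet<const void *, 16> &Visited, const void *P) {
  return Visited.insert(P).second;
}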
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
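A brief usage sketch exercising the SmallVector operations listed above (push_back, append, insert, resize); the values are arbitrary.

#include "llvm/ADT/SmallVector.h"
#include <iterator>
using namespace llvm;

static SmallVector<int, 8> smallVectorDemo() {
  SmallVector<int, 8> Vals;                      // inline storage for up to 8 ints
  Vals.push_back(1);
  Vals.push_back(2);
  int More[] = {3, 4, 5};
  Vals.append(std::begin(More), std::end(More)); // {1, 2, 3, 4, 5}
  Vals.insert(Vals.begin(), 0);                  // {0, 1, 2, 3, 4, 5}
  Vals.resize(3);                                // {0, 1, 2}
  return Vals;
}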
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:849
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:271
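A short sketch of the StringRef prefix/suffix queries listed above; the intrinsic-style prefix and suffix strings are made up for illustration.

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Hypothetical predicate built from starts_with/ends_with.
static bool looksLikeAMDGCNIntrinsic(StringRef Name) {
  return Name.starts_with("llvm.amdgcn.") && !Name.ends_with(".legacy");
}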
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
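A minimal sketch of the Case/Default chain listed above; the constraint letters and category values are hypothetical, not the constraint handling used by this target.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

static int classifyConstraint(StringRef C) {
  return StringSwitch<int>(C)
      .Case("v", 0)   // e.g. a vector-register constraint
      .Case("s", 1)   // e.g. a scalar-register constraint
      .Default(-1);   // anything else is unrecognized
}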
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:381
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
constexpr bool isZero() const
Definition: TypeSize.h:156
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:271
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1128
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1004
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1277
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1247
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1278
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:985
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1037
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1260
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:913
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:480
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1273
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1274
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1406
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1280
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1194
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1053
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1227
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:994
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1083
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1276
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:507
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1243
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:208
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1022
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:999
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1271
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:984
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1217
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1254
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1279
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1047
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1103
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:922
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1285
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1269
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:990
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1270
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1188
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1214
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1268
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:944
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:414
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:907
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1100
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1076
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1284
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1529
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1509
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1023
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1522
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:219
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
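A tiny sketch of Lo_32/Hi_32 (listed above) splitting a 64-bit immediate into two 32-bit halves; the helper name is invented.

#include "llvm/Support/MathExtras.h"
#include <utility>
using namespace llvm;

// Returns {low half, high half} of a 64-bit immediate.
static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  return {Lo_32(Imm), Hi_32(Imm)};
}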
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
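A worked sketch of the rounding helpers listed above (alignTo, divideCeil, PowerOf2Ceil) applied to an arbitrary byte count of 100; the function exists only to show the arithmetic.

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static void roundingDemo() {
  uint64_t Size = 100;
  uint64_t Padded = alignTo(Size, Align(16)); // 112: next 16-byte boundary
  uint64_t DWords = divideCeil(Size, 4);      // 25: dwords needed for 100 bytes
  uint64_t Pow2 = PowerOf2Ceil(Size);         // 128: next power of two >= 100
  (void)Padded; (void)DWords; (void)Pow2;
}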
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:212
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:428
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
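A sketch combining several EVT queries and factories listed above to build an integer type (or integer vector) with the same shape as VT; the helper name is hypothetical, and EVT::changeTypeToInteger() performs a similar conversion in one step.

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

static EVT sameShapeIntegerVT(LLVMContext &Ctx, EVT VT) {
  if (!VT.isVector())
    return EVT::getIntegerVT(Ctx, VT.getFixedSizeInBits());
  EVT EltVT = EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits());
  return EVT::getVectorVT(Ctx, EltVT, VT.getVectorNumElements());
}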
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
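A hedged sketch of querying the KnownBits facts listed above for a DAG value; the helper name and the fallback policy are assumptions for the example.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Report how many low bits of V can still vary, falling back to the full
// scalar width when nothing is known.
static unsigned maxActiveBits(const SelectionDAG &DAG, SDValue V) {
  KnownBits Known = DAG.computeKnownBits(V);
  if (Known.isUnknown())
    return V.getScalarValueSizeInBits();
  return Known.countMaxActiveBits();
}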
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals