1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
72 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
73}
74
77 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
78}
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, no operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
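 // For example, (zext (setcc ...)) becomes (select cond, 1, 0); the sext form
 // would simply select -1 instead of 1 at the same cost.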
206
207 // We need to custom lower vector stores from local memory
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
232 ISD::SETCC}) {
233 // FIXME: The promoted-to type shouldn't need to be explicit
234 setOperationAction(Opc, MVT::bf16, Promote);
235 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
236 }
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279
283 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
284
285 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
286
288 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
289
291 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
292 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
293
295 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
296 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
297 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
298 Expand);
300 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
301 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
302 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
303 Expand);
304
306 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
307 MVT::v3i16, MVT::v4i16, MVT::Other},
308 Custom);
309
312 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
313
315
317
319 Expand);
320
321#if 0
323#endif
324
325 // We only support LOAD/STORE and vector manipulation ops for vectors
326 // with > 4 elements.
327 for (MVT VT :
328 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
329 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
330 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
331 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
332 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
333 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
334 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
335 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
336 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
337 switch (Op) {
338 case ISD::LOAD:
339 case ISD::STORE:
341 case ISD::BITCAST:
342 case ISD::UNDEF:
346 case ISD::IS_FPCLASS:
347 break;
352 break;
353 default:
355 break;
356 }
357 }
358 }
359
361
362 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
363 // is expanded to avoid having two separate loops in case the index is a VGPR.
364
365 // Most operations are naturally 32-bit vector operations. We only support
366 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
367 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
369 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
373
375 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
376
378 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
379 }
380
381 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
383 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
387
389 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
390
392 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
393 }
394
395 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
397 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
401
403 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
404
406 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
407 }
408
409 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
411 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
415
417 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
418
420 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
421 }
422
423 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
429
431 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
432
434 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
435 }
436
438 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
439 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
440 Custom);
441
442 if (Subtarget->hasPkMovB32()) {
443 // TODO: 16-bit element vectors should be legal with even aligned elements.
444 // TODO: Can be legal with wider source types than the result with
445 // subregister extracts.
446 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
447 }
448
450 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
451 // instead lower to cndmask in SITargetLowering::LowerSELECT().
453 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
454 // alignbit.
455 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
456
457 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
458 Custom);
459
460 // Avoid stack access for these.
461 // TODO: Generalize to more vector types.
463 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
464 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
465 Custom);
466
467 // Deal with vec3 vector operations when widened to vec4.
469 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
470
471 // Deal with vec5/6/7 vector operations when widened to vec8.
473 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
474 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
475 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
476 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
477 Custom);
478
479 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
480 // and output demarshalling
481 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
482
483 // We can't return success/failure, only the old value,
484 // let LLVM add the comparison
486 Expand);
487
488 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
489
490 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
491
492 // FIXME: This should be narrowed to i32, but that only happens if i64 is
493 // illegal.
494 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
495 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
496
497 // This is s_memtime on SI and s_memrealtime on VI.
499
500 if (Subtarget->hasSMemRealTime() ||
501 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
504
505 if (Subtarget->has16BitInsts()) {
508 } else {
510 }
511
512 if (Subtarget->hasMadMacF32Insts())
514
515 if (!Subtarget->hasBFI())
516 // fcopysign can be done in a single instruction with BFI.
517 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
518
519 if (!Subtarget->hasBCNT(32))
521
522 if (!Subtarget->hasBCNT(64))
524
525 if (Subtarget->hasFFBH())
527
528 if (Subtarget->hasFFBL())
530
531 // We only really have 32-bit BFE instructions (and 16-bit on VI).
532 //
533 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
534 // effort to match them now. We want this to be false for i64 cases when the
535 // extraction isn't restricted to the upper or lower half. Ideally we would
536 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
537 // span the midpoint are probably relatively rare, so don't worry about them
538 // for now.
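 // For example, an extract of bits [28, 36) from an i64 spans the 32-bit
 // midpoint and cannot be matched to a single 32-bit BFE.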
539 if (Subtarget->hasBFE())
541
542 // Clamp modifier on add/sub
543 if (Subtarget->hasIntClamp())
545
546 if (Subtarget->hasAddNoCarry())
547 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
548 Legal);
549
552 {MVT::f32, MVT::f64}, Custom);
553
554 // These are really only legal for ieee_mode functions. We should be avoiding
555 // them for functions that don't have ieee_mode enabled, so just say they are
556 // legal.
558 {MVT::f32, MVT::f64}, Legal);
559
560 if (Subtarget->haveRoundOpsF64())
562 Legal);
563 else
565 MVT::f64, Custom);
566
568 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
569 Legal);
570 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
571
574
575 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
576 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
577
578 // Custom lower these because we can't specify a rule based on an illegal
579 // source bf16.
582
583 if (Subtarget->has16BitInsts()) {
586 MVT::i16, Legal);
587
588 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
589
591 MVT::i16, Expand);
592
596 ISD::CTPOP},
597 MVT::i16, Promote);
598
600
601 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
602
604 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
606 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
607
611
613
614 // F16 - Constant Actions.
617
618 // F16 - Load/Store Actions.
620 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
622 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
623
624 // BF16 - Load/Store Actions.
626 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
628 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
629
630 // F16 - VOP1 Actions.
633 MVT::f16, Custom);
634
635 // BF16 - VOP1 Actions.
636 if (Subtarget->hasBF16TransInsts())
638
641
642 // F16 - VOP2 Actions.
643 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
644 Expand);
648
649 // F16 - VOP3 Actions.
651 if (STI.hasMadF16())
653
654 for (MVT VT :
655 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
656 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
657 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
658 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
659 switch (Op) {
660 case ISD::LOAD:
661 case ISD::STORE:
663 case ISD::BITCAST:
664 case ISD::UNDEF:
669 case ISD::IS_FPCLASS:
670 break;
674 break;
675 default:
677 break;
678 }
679 }
680 }
681
682 // v_perm_b32 can handle either of these.
683 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
685
686 // XXX - Do these do anything? Vector constants turn into build_vector.
687 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
688
689 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
690 Legal);
691
693 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
695 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
696
698 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
700 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
701
702 setOperationAction(ISD::AND, MVT::v2i16, Promote);
703 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
704 setOperationAction(ISD::OR, MVT::v2i16, Promote);
705 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
706 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
707 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
708
710 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
712 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
713 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
721 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
722
724 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
726 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
727 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
729
731 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
733 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
734
736 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
738 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
740 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
741
742 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
744 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
746 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
748
750 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
752 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
753 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
755
756 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
757 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
758 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
759 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
760 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
761 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
762
764 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
766 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
767 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
768 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
769
771 MVT::v2i32, Expand);
773
775 MVT::v4i32, Expand);
776
778 MVT::v8i32, Expand);
779
780 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
781 Subtarget->hasVOP3PInsts() ? Legal : Custom);
782
783 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
784 // This isn't really legal, but this avoids the legalizer unrolling it (and
785 // allows matching fneg (fabs x) patterns)
786 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
787
788 // Can do this in one BFI plus a constant materialize.
790 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
793 Custom);
794
797 MVT::f16, Custom);
799
802 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
803 Custom);
804
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
807 Expand);
808
809 for (MVT Vec16 :
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
814 Vec16, Custom);
816 }
817 }
818
819 if (Subtarget->hasVOP3PInsts()) {
823 MVT::v2i16, Legal);
824
827 MVT::v2f16, Legal);
828
830 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
831
833 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
836 Custom);
837
838 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
839 // Split vector operations.
844 VT, Custom);
845
846 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
847 // Split vector operations.
849 VT, Custom);
850
853 {MVT::v2f16, MVT::v4f16}, Custom);
854
855 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
856 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
857 Custom);
858
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
861 // Split vector operations.
863 VT, Custom);
864 }
865
866 if (Subtarget->hasPackedFP32Ops()) {
868 MVT::v2f32, Legal);
870 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
871 Custom);
872 }
873 }
874
876
877 if (Subtarget->has16BitInsts()) {
879 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
881 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
882 } else {
883 // Legalization hack.
884 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
885
887 }
888
890 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
894 Custom);
895
897
898 if (Subtarget->hasVectorMulU64())
900 else if (Subtarget->hasScalarSMulU64())
902
903 if (Subtarget->hasMad64_32())
905
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
908
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
911 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
912 } else {
913 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
914 if (Subtarget->hasMinimum3Maximum3F32())
916
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
919
920 // If only the vector form is available, we need to widen to a vector.
921 if (!Subtarget->hasMinimum3Maximum3F16())
923 }
924 }
925
926 if (Subtarget->hasVOP3PInsts()) {
927 // We want to break these into v2f16 pieces, not scalarize.
929 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
930 Custom);
931 }
932
933 if (Subtarget->hasIntMinMax64())
935 Legal);
936
938 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
940 MVT::i8},
941 Custom);
942
944 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
948 Custom);
949
951 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
955 Custom);
956
962
963 // TODO: Could move this to custom lowering, could benefit from combines on
964 // extract of relevant bits.
966
968
969 if (Subtarget->hasBF16ConversionInsts()) {
970 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
972 }
973
974 if (Subtarget->hasBF16PackedInsts()) {
977 MVT::v2bf16, Legal);
978 }
979
980 if (Subtarget->hasBF16TransInsts()) {
982 }
983
984 if (Subtarget->hasCvtPkF16F32Inst()) {
986 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
987 Custom);
988 }
989
993 ISD::SUB,
995 ISD::MUL,
996 ISD::FADD,
997 ISD::FSUB,
998 ISD::FDIV,
999 ISD::FMUL,
1008 ISD::FMA,
1009 ISD::SMIN,
1010 ISD::SMAX,
1011 ISD::UMIN,
1012 ISD::UMAX,
1013 ISD::SETCC,
1015 ISD::SMIN,
1016 ISD::SMAX,
1017 ISD::UMIN,
1018 ISD::UMAX,
1019 ISD::AND,
1020 ISD::OR,
1021 ISD::XOR,
1022 ISD::SHL,
1023 ISD::SRL,
1024 ISD::SRA,
1025 ISD::FSHR,
1035
1036 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1038
1039 // All memory operations. Some folding on the pointer operand is done to help
1040 // matching the constant offsets in the addressing modes.
1042 ISD::STORE,
1067
1068 // FIXME: In other contexts we pretend this is a per-function property.
1070
1072}
1073
1074const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1075
1077 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1078 return RCRegs;
1079}
1080
1081//===----------------------------------------------------------------------===//
1082// TargetLowering queries
1083//===----------------------------------------------------------------------===//
1084
1085// v_mad_mix* support a conversion from f16 to f32.
1086//
1087// There is only one special case, when denormals are enabled, that we don't
1088// currently handle, where this would still be OK to use.
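// For example, (fma (fpext f16:$a), (fpext f16:$b), f32:$c) can be selected
// to a single v_fma_mix_f32 when this hook returns true.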
1089bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1090 EVT DestVT, EVT SrcVT) const {
1091 return DestVT.getScalarType() == MVT::f32 &&
1092 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1093 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1094 SrcVT.getScalarType() == MVT::f16) ||
1095 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1096 SrcVT.getScalarType() == MVT::bf16)) &&
1097 // TODO: This probably only requires no input flushing?
1099}
1100
1102 LLT DestTy, LLT SrcTy) const {
1103 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1104 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1105 DestTy.getScalarSizeInBits() == 32 &&
1106 SrcTy.getScalarSizeInBits() == 16 &&
1107 // TODO: This probably only requires no input flushing?
1108 denormalModeIsFlushAllF32(*MI.getMF());
1109}
1110
1112 // SI has some legal vector types, but no legal vector operations. Say no
1113 // shuffles are legal in order to prefer scalarizing some vector operations.
1114 return false;
1115}
1116
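// For non-kernel calling conventions with 16-bit instructions, e.g., a
// <4 x half> argument is passed in two v2f16 registers, while <4 x bfloat>
// is passed in two i32 registers.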
1118 CallingConv::ID CC,
1119 EVT VT) const {
1121 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1122
1123 if (VT.isVector()) {
1124 EVT ScalarVT = VT.getScalarType();
1125 unsigned Size = ScalarVT.getSizeInBits();
1126 if (Size == 16) {
1127 if (Subtarget->has16BitInsts()) {
1128 if (VT.isInteger())
1129 return MVT::v2i16;
1130 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1131 }
1132 return VT.isInteger() ? MVT::i32 : MVT::f32;
1133 }
1134
1135 if (Size < 16)
1136 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1137 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1138 }
1139
1140 if (VT.getSizeInBits() > 32)
1141 return MVT::i32;
1142
1143 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1144}
1145
1147 CallingConv::ID CC,
1148 EVT VT) const {
1150 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1151
1152 if (VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156
1157 // FIXME: Should probably promote 8-bit vectors to i16.
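 // e.g. a v3f16 argument takes (3 + 1) / 2 == 2 registers.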
1158 if (Size == 16 && Subtarget->has16BitInsts())
1159 return (NumElts + 1) / 2;
1160
1161 if (Size <= 32)
1162 return NumElts;
1163
1164 if (Size > 32)
1165 return NumElts * ((Size + 31) / 32);
1166 } else if (VT.getSizeInBits() > 32)
1167 return (VT.getSizeInBits() + 31) / 32;
1168
1169 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1170}
1171
1173 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1174 unsigned &NumIntermediates, MVT &RegisterVT) const {
1175 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1176 unsigned NumElts = VT.getVectorNumElements();
1177 EVT ScalarVT = VT.getScalarType();
1178 unsigned Size = ScalarVT.getSizeInBits();
1179 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1180 // support, but unless we can properly handle 3-vectors, it will still be
1181 // inconsistent.
1182 if (Size == 16 && Subtarget->has16BitInsts()) {
1183 if (ScalarVT == MVT::bf16) {
1184 RegisterVT = MVT::i32;
1185 IntermediateVT = MVT::v2bf16;
1186 } else {
1187 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1188 IntermediateVT = RegisterVT;
1189 }
1190 NumIntermediates = (NumElts + 1) / 2;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size == 32) {
1195 RegisterVT = ScalarVT.getSimpleVT();
1196 IntermediateVT = RegisterVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1199 }
1200
1201 if (Size < 16 && Subtarget->has16BitInsts()) {
1202 // FIXME: Should probably form v2i16 pieces
1203 RegisterVT = MVT::i16;
1204 IntermediateVT = ScalarVT;
1205 NumIntermediates = NumElts;
1206 return NumIntermediates;
1207 }
1208
1209 if (Size != 16 && Size <= 32) {
1210 RegisterVT = MVT::i32;
1211 IntermediateVT = ScalarVT;
1212 NumIntermediates = NumElts;
1213 return NumIntermediates;
1214 }
1215
1216 if (Size > 32) {
1217 RegisterVT = MVT::i32;
1218 IntermediateVT = RegisterVT;
1219 NumIntermediates = NumElts * ((Size + 31) / 32);
1220 return NumIntermediates;
1221 }
1222 }
1223
1225 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1226}
1227
1229 const DataLayout &DL, Type *Ty,
1230 unsigned MaxNumLanes) {
1231 assert(MaxNumLanes != 0);
1232
1233 LLVMContext &Ctx = Ty->getContext();
1234 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1235 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1236 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1237 NumElts);
1238 }
1239
1240 return TLI.getValueType(DL, Ty);
1241}
1242
1243// Peek through TFE struct returns to only use the data size.
1245 const DataLayout &DL, Type *Ty,
1246 unsigned MaxNumLanes) {
1247 auto *ST = dyn_cast<StructType>(Ty);
1248 if (!ST)
1249 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1250
1251 // TFE intrinsics return an aggregate type.
1252 assert(ST->getNumContainedTypes() == 2 &&
1253 ST->getContainedType(1)->isIntegerTy(32));
1254 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1255}
1256
1257/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1258/// in-memory representation. This return value is a custom type because there
1259/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1260/// could cause issues during codegen, these address space 7 pointers will be
1261/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1262/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1263/// for cost modeling, to work. (This also sets us up decently for doing the
1264/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1266 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1267 return MVT::amdgpuBufferFatPointer;
1269 DL.getPointerSizeInBits(AS) == 192)
1270 return MVT::amdgpuBufferStridedPointer;
1272}
1273/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1274/// v8i32 when padding is added.
1275/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1276/// also v8i32 with padding.
1278 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1279 DL.getPointerSizeInBits(AS) == 160) ||
1281 DL.getPointerSizeInBits(AS) == 192))
1282 return MVT::v8i32;
1284}
1285
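// Returns the width in bits of the memory access performed by the given
// intrinsic, e.g. the _b64 and _16x8B variants both return 64.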
1286static unsigned getIntrMemWidth(unsigned IntrID) {
1287 switch (IntrID) {
1288 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1289 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1290 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1291 return 8;
1292 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1293 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1294 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1295 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1296 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1297 return 32;
1298 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1300 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1301 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1302 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1303 return 64;
1304 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1305 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1306 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1307 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1308 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1309 return 128;
1310 default:
1311 llvm_unreachable("Unknown width");
1312 }
1313}
1314
1315static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1317 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1318 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1319 switch (AtomicOrderingCABI(Ord)) {
1322 break;
1325 break;
1328 break;
1329 default:
1331 break;
1332 }
1333
1334 Info.flags =
1336 Info.flags |= MOCooperative;
1337
1338 MDNode *ScopeMD = cast<MDNode>(
1339 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1340 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1341 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1342}
1343
1345 const CallBase &CI,
1346 MachineFunction &MF,
1347 unsigned IntrID) const {
1348 Info.flags = MachineMemOperand::MONone;
1349 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1350 Info.flags |= MachineMemOperand::MOInvariant;
1351 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1353 Info.flags |= getTargetMMOFlags(CI);
1354
1355 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1357 AttributeSet Attr =
1359 MemoryEffects ME = Attr.getMemoryEffects();
1360 if (ME.doesNotAccessMemory())
1361 return false;
1362
1363 // TODO: Should images get their own address space?
1364 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1365
1366 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1367 if (RsrcIntr->IsImage) {
1368 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1370 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1371 Info.align.reset();
1372 }
1373
1374 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1375 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1376 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1377 // We conservatively set the memory operand of a buffer intrinsic to the
1378 // base resource pointer, so that we can access alias information about
1379 // those pointers. Cases like "this points at the same value
1380 // but with a different offset" are handled in
1381 // areMemAccessesTriviallyDisjoint.
1382 Info.ptrVal = RsrcArg;
1383 }
1384
1385 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1386 if (!IsSPrefetch) {
1387 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1388 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1389 Info.flags |= MachineMemOperand::MOVolatile;
1390 }
1391
1393 if (ME.onlyReadsMemory()) {
1394 if (RsrcIntr->IsImage) {
1395 unsigned MaxNumLanes = 4;
1396
1397 if (!BaseOpcode->Gather4) {
1398 // If this isn't a gather, we may have excess loaded elements in the
1399 // IR type. Check the dmask for the real number of elements loaded.
1400 unsigned DMask =
1401 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1402 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1403 }
1404
1405 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1406 CI.getType(), MaxNumLanes);
1407 } else {
1408 Info.memVT =
1410 std::numeric_limits<unsigned>::max());
1411 }
1412
1413 // FIXME: What does alignment mean for an image?
1414 Info.opc = ISD::INTRINSIC_W_CHAIN;
1415 Info.flags |= MachineMemOperand::MOLoad;
1416 } else if (ME.onlyWritesMemory()) {
1417 Info.opc = ISD::INTRINSIC_VOID;
1418
1419 Type *DataTy = CI.getArgOperand(0)->getType();
1420 if (RsrcIntr->IsImage) {
1421 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1422 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1423 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1424 DMaskLanes);
1425 } else
1426 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1427
1428 Info.flags |= MachineMemOperand::MOStore;
1429 } else {
1430 // Atomic, NoReturn Sampler or prefetch
1431 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1433 Info.flags |=
1435
1436 if (!IsSPrefetch)
1437 Info.flags |= MachineMemOperand::MOStore;
1438
1439 switch (IntrID) {
1440 default:
1441 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1442 // Fake memory access type for no return sampler intrinsics
1443 Info.memVT = MVT::i32;
1444 } else {
1445 // XXX - Should this be volatile without known ordering?
1446 Info.flags |= MachineMemOperand::MOVolatile;
1447 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1448 }
1449 break;
1450 case Intrinsic::amdgcn_raw_buffer_load_lds:
1451 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1452 case Intrinsic::amdgcn_struct_buffer_load_lds:
1453 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1454 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1455 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1456 Info.ptrVal = CI.getArgOperand(1);
1457 return true;
1458 }
1459 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1460 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1461 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1462 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1463 Info.memVT =
1465 std::numeric_limits<unsigned>::max());
1466 Info.flags &= ~MachineMemOperand::MOStore;
1467 return true;
1468 }
1469 }
1470 }
1471 return true;
1472 }
1473
1474 switch (IntrID) {
1475 case Intrinsic::amdgcn_ds_ordered_add:
1476 case Intrinsic::amdgcn_ds_ordered_swap: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getType());
1479 Info.ptrVal = CI.getOperand(0);
1480 Info.align.reset();
1482
1483 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1484 if (!Vol->isZero())
1485 Info.flags |= MachineMemOperand::MOVolatile;
1486
1487 return true;
1488 }
1489 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1490 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1491 Info.opc = ISD::INTRINSIC_W_CHAIN;
1492 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1493 Info.ptrVal = nullptr;
1494 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_append:
1499 case Intrinsic::amdgcn_ds_consume: {
1500 Info.opc = ISD::INTRINSIC_W_CHAIN;
1501 Info.memVT = MVT::getVT(CI.getType());
1502 Info.ptrVal = CI.getOperand(0);
1503 Info.align.reset();
1505
1506 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1507 if (!Vol->isZero())
1508 Info.flags |= MachineMemOperand::MOVolatile;
1509
1510 return true;
1511 }
1512 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1513 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1514 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1517 Info.memVT = MVT::getVT(CI.getType());
1518 Info.ptrVal = CI.getOperand(0);
1519 Info.memVT = MVT::i64;
1520 Info.size = 8;
1521 Info.align.reset();
1523 return true;
1524 }
1525 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1526 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1527 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1528 Info.opc = ISD::INTRINSIC_W_CHAIN;
1529 Info.memVT =
1530 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1531 ? CI.getType()
1533 ->getElementType(0)); // XXX: what is correct VT?
1534
1535 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1536 Info.align.reset();
1537 Info.flags |=
1539 return true;
1540 }
1541 case Intrinsic::amdgcn_global_atomic_fmin_num:
1542 case Intrinsic::amdgcn_global_atomic_fmax_num:
1543 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1544 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1545 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1546 Info.opc = ISD::INTRINSIC_W_CHAIN;
1547 Info.memVT = MVT::getVT(CI.getType());
1548 Info.ptrVal = CI.getOperand(0);
1549 Info.align.reset();
1553 return true;
1554 }
1555 case Intrinsic::amdgcn_flat_load_monitor_b32:
1556 case Intrinsic::amdgcn_flat_load_monitor_b64:
1557 case Intrinsic::amdgcn_flat_load_monitor_b128:
1558 case Intrinsic::amdgcn_global_load_monitor_b32:
1559 case Intrinsic::amdgcn_global_load_monitor_b64:
1560 case Intrinsic::amdgcn_global_load_monitor_b128:
1561 case Intrinsic::amdgcn_cluster_load_b32:
1562 case Intrinsic::amdgcn_cluster_load_b64:
1563 case Intrinsic::amdgcn_cluster_load_b128:
1564 case Intrinsic::amdgcn_ds_load_tr6_b96:
1565 case Intrinsic::amdgcn_ds_load_tr4_b64:
1566 case Intrinsic::amdgcn_ds_load_tr8_b64:
1567 case Intrinsic::amdgcn_ds_load_tr16_b128:
1568 case Intrinsic::amdgcn_global_load_tr6_b96:
1569 case Intrinsic::amdgcn_global_load_tr4_b64:
1570 case Intrinsic::amdgcn_global_load_tr_b64:
1571 case Intrinsic::amdgcn_global_load_tr_b128:
1572 case Intrinsic::amdgcn_ds_read_tr4_b64:
1573 case Intrinsic::amdgcn_ds_read_tr6_b96:
1574 case Intrinsic::amdgcn_ds_read_tr8_b64:
1575 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1576 Info.opc = ISD::INTRINSIC_W_CHAIN;
1577 Info.memVT = MVT::getVT(CI.getType());
1578 Info.ptrVal = CI.getOperand(0);
1579 Info.align.reset();
1580 Info.flags |= MachineMemOperand::MOLoad;
1581 return true;
1582 }
1583 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1584 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1585 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1586 Info.opc = ISD::INTRINSIC_W_CHAIN;
1587 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1588 Info.ptrVal = CI.getOperand(0);
1589 Info.align.reset();
1590 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1591 return true;
1592 }
1593 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1594 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1595 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1596 Info.opc = ISD::INTRINSIC_VOID;
1597 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1598 Info.ptrVal = CI.getArgOperand(0);
1599 Info.align.reset();
1600 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1601 return true;
1602 }
1603 case Intrinsic::amdgcn_ds_gws_init:
1604 case Intrinsic::amdgcn_ds_gws_barrier:
1605 case Intrinsic::amdgcn_ds_gws_sema_v:
1606 case Intrinsic::amdgcn_ds_gws_sema_br:
1607 case Intrinsic::amdgcn_ds_gws_sema_p:
1608 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1609 Info.opc = ISD::INTRINSIC_VOID;
1610
1611 const GCNTargetMachine &TM =
1612 static_cast<const GCNTargetMachine &>(getTargetMachine());
1613
1615 Info.ptrVal = MFI->getGWSPSV(TM);
1616
1617 // This is an abstract access, but we need to specify a type and size.
1618 Info.memVT = MVT::i32;
1619 Info.size = 4;
1620 Info.align = Align(4);
1621
1622 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1623 Info.flags |= MachineMemOperand::MOLoad;
1624 else
1625 Info.flags |= MachineMemOperand::MOStore;
1626 return true;
1627 }
1628 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1632 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1633 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1634 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1635 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1636 Info.opc = ISD::INTRINSIC_VOID;
1637 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1638 Info.ptrVal = CI.getArgOperand(1);
1640 return true;
1641 }
1642 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1643 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1644 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1645 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1646 Info.opc = ISD::INTRINSIC_VOID;
1647 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1648 Info.ptrVal = CI.getArgOperand(0);
1650 return true;
1651 }
1652 case Intrinsic::amdgcn_load_to_lds:
1653 case Intrinsic::amdgcn_global_load_lds: {
1654 Info.opc = ISD::INTRINSIC_VOID;
1655 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1656 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1657 Info.ptrVal = CI.getArgOperand(1);
1659 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1660 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1661 Info.flags |= MachineMemOperand::MOVolatile;
1662 return true;
1663 }
1664 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1665 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1666 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1667 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1668 Info.opc = ISD::INTRINSIC_W_CHAIN;
1669
1670 const GCNTargetMachine &TM =
1671 static_cast<const GCNTargetMachine &>(getTargetMachine());
1672
1674 Info.ptrVal = MFI->getGWSPSV(TM);
1675
1676 // This is an abstract access, but we need to specify a type and size.
1677 Info.memVT = MVT::i32;
1678 Info.size = 4;
1679 Info.align = Align(4);
1680
1682 return true;
1683 }
1684 case Intrinsic::amdgcn_s_prefetch_data:
1685 case Intrinsic::amdgcn_flat_prefetch:
1686 case Intrinsic::amdgcn_global_prefetch: {
1687 Info.opc = ISD::INTRINSIC_VOID;
1688 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1689 Info.ptrVal = CI.getArgOperand(0);
1690 Info.flags |= MachineMemOperand::MOLoad;
1691 return true;
1692 }
1693 default:
1694 return false;
1695 }
1696}
1697
1699 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1701 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1702 // The DAG's ValueType loses the addrspaces.
1703 // Add them as 2 extra Constant operands "from" and "to".
1704 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1705 unsigned DstAS = I.getType()->getPointerAddressSpace();
1706 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1707 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1708 break;
1709 }
1710 default:
1711 break;
1712 }
1713}
1714
1717 Type *&AccessTy) const {
1718 Value *Ptr = nullptr;
1719 switch (II->getIntrinsicID()) {
1720 case Intrinsic::amdgcn_cluster_load_b128:
1721 case Intrinsic::amdgcn_cluster_load_b64:
1722 case Intrinsic::amdgcn_cluster_load_b32:
1723 case Intrinsic::amdgcn_ds_append:
1724 case Intrinsic::amdgcn_ds_consume:
1725 case Intrinsic::amdgcn_ds_load_tr8_b64:
1726 case Intrinsic::amdgcn_ds_load_tr16_b128:
1727 case Intrinsic::amdgcn_ds_load_tr4_b64:
1728 case Intrinsic::amdgcn_ds_load_tr6_b96:
1729 case Intrinsic::amdgcn_ds_read_tr4_b64:
1730 case Intrinsic::amdgcn_ds_read_tr6_b96:
1731 case Intrinsic::amdgcn_ds_read_tr8_b64:
1732 case Intrinsic::amdgcn_ds_read_tr16_b64:
1733 case Intrinsic::amdgcn_ds_ordered_add:
1734 case Intrinsic::amdgcn_ds_ordered_swap:
1735 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1736 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1737 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1738 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1739 case Intrinsic::amdgcn_flat_load_monitor_b128:
1740 case Intrinsic::amdgcn_flat_load_monitor_b32:
1741 case Intrinsic::amdgcn_flat_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_atomic_fmax_num:
1743 case Intrinsic::amdgcn_global_atomic_fmin_num:
1744 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1745 case Intrinsic::amdgcn_global_load_monitor_b128:
1746 case Intrinsic::amdgcn_global_load_monitor_b32:
1747 case Intrinsic::amdgcn_global_load_monitor_b64:
1748 case Intrinsic::amdgcn_global_load_tr_b64:
1749 case Intrinsic::amdgcn_global_load_tr_b128:
1750 case Intrinsic::amdgcn_global_load_tr4_b64:
1751 case Intrinsic::amdgcn_global_load_tr6_b96:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1753 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1754 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1755 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1756 Ptr = II->getArgOperand(0);
1757 break;
1758 case Intrinsic::amdgcn_load_to_lds:
1759 case Intrinsic::amdgcn_global_load_lds:
1760 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1761 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1762 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1763 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1764 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1765 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1766 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1767 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1768 Ptr = II->getArgOperand(1);
1769 break;
1770 default:
1771 return false;
1772 }
1773 AccessTy = II->getType();
1774 Ops.push_back(Ptr);
1775 return true;
1776}
1777
1779 unsigned AddrSpace) const {
1780 if (!Subtarget->hasFlatInstOffsets()) {
1781 // Flat instructions do not have offsets, and only have the register
1782 // address.
1783 return AM.BaseOffs == 0 && AM.Scale == 0;
1784 }
1785
1786 decltype(SIInstrFlags::FLAT) FlatVariant =
1790
1791 return AM.Scale == 0 &&
1792 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1793 AM.BaseOffs, AddrSpace, FlatVariant));
1794}
1795
1797 if (Subtarget->hasFlatGlobalInsts())
1799
1800 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1801 // Assume that we will use FLAT for all global memory accesses
1802 // on VI.
1803 // FIXME: This assumption is currently wrong. On VI we still use
1804 // MUBUF instructions for the r + i addressing mode. As currently
1805 // implemented, the MUBUF instructions only work on buffers < 4GB.
1806 // It may be possible to support > 4GB buffers with MUBUF instructions,
1807 // by setting the stride value in the resource descriptor which would
1808 // increase the size limit to (stride * 4GB). However, this is risky,
1809 // because it has never been validated.
1811 }
1812
1813 return isLegalMUBUFAddressingMode(AM);
1814}
1815
1816bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1817 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1818 // additionally can do r + r + i with addr64. 32-bit has more addressing
1819 // mode options. Depending on the resource constant, it can also do
1820 // (i64 r0) + (i32 r1) * (i14 i).
1821 //
1822 // Private arrays end up using a scratch buffer most of the time, so also
1823 // assume those use MUBUF instructions. Scratch loads / stores are currently
1824 // implemented as mubuf instructions with offen bit set, so slightly
1825 // different than the normal addr64.
1826 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1827 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1828 return false;
1829
1830 // FIXME: Since we can split immediate into soffset and immediate offset,
1831 // would it make sense to allow any immediate?
1832
1833 switch (AM.Scale) {
1834 case 0: // r + i or just i, depending on HasBaseReg.
1835 return true;
1836 case 1:
1837 return true; // We have r + r or r + i.
1838 case 2:
1839 if (AM.HasBaseReg) {
1840 // Reject 2 * r + r.
1841 return false;
1842 }
1843
1844 // Allow 2 * r as r + r
1845 // Or 2 * r + i is allowed as r + r + i.
1846 return true;
1847 default: // Don't allow n * r
1848 return false;
1849 }
1850}
1851
1853 const AddrMode &AM, Type *Ty,
1854 unsigned AS,
1855 Instruction *I) const {
1856 // No global is ever allowed as a base.
1857 if (AM.BaseGV)
1858 return false;
1859
1860 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1861 return isLegalGlobalAddressingMode(AM);
1862
1863 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1867 // If the offset isn't a multiple of 4, it probably isn't going to be
1868 // correctly aligned.
1869 // FIXME: Can we get the real alignment here?
1870 if (AM.BaseOffs % 4 != 0)
1871 return isLegalMUBUFAddressingMode(AM);
1872
1873 if (!Subtarget->hasScalarSubwordLoads()) {
1874 // There are no SMRD extloads, so if we have to do a small type access we
1875 // will use a MUBUF load.
1876 // FIXME?: We also need to do this if unaligned, but we don't know the
1877 // alignment here.
1878 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1879 return isLegalGlobalAddressingMode(AM);
1880 }
1881
1882 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1883 // SMRD instructions have an 8-bit, dword offset on SI.
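 // e.g. a byte offset of 1020 (dword offset 255) fits, while 1024 does not.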
1884 if (!isUInt<8>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1887 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1888 // in 8-bits, it can use a smaller encoding.
1889 if (!isUInt<32>(AM.BaseOffs / 4))
1890 return false;
1891 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1892 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1893 if (!isUInt<20>(AM.BaseOffs))
1894 return false;
1895 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1896 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1897 // for S_BUFFER_* instructions).
1898 if (!isInt<21>(AM.BaseOffs))
1899 return false;
1900 } else {
1901 // On GFX12, all offsets are signed 24-bit in bytes.
1902 if (!isInt<24>(AM.BaseOffs))
1903 return false;
1904 }
1905
1906 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1908 AM.BaseOffs < 0) {
1909 // Scalar (non-buffer) loads can only use a negative offset if
1910 // soffset+offset is non-negative. Since the compiler can only prove that
1911 // in a few special cases, it is safer to claim that negative offsets are
1912 // not supported.
1913 return false;
1914 }
1915
1916 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1917 return true;
1918
1919 if (AM.Scale == 1 && AM.HasBaseReg)
1920 return true;
1921
1922 return false;
1923 }
1924
1925 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1926    return Subtarget->enableFlatScratch()
1927               ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1928               : isLegalMUBUFAddressingMode(AM);
1929
1930 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1931 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1932 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1933 // field.
1934 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1935 // an 8-bit dword offset but we don't know the alignment here.
1936 if (!isUInt<16>(AM.BaseOffs))
1937 return false;
1938
1939 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1940 return true;
1941
1942 if (AM.Scale == 1 && AM.HasBaseReg)
1943 return true;
1944
1945 return false;
1946 }
1947
1948  if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1949    // For an unknown address space, this usually means that this is for some
1950    // reason being used for pure arithmetic, and not based on some addressing
1951    // computation. We don't have instructions that compute pointers with any
1952    // addressing modes, so treat them as having no offset like flat
1953    // instructions.
1954    return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1955  }
1956
1957 // Assume a user alias of global for unknown address spaces.
1958 return isLegalGlobalAddressingMode(AM);
1959}
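// --- Illustrative sketch (not part of the original source) -----------------
// The per-generation SMRD/SMEM immediate-offset ranges checked above,
// collapsed into one helper. ExampleGen is a hypothetical stand-in for
// AMDGPUSubtarget::Generation; the bit widths follow the comments above.
enum class ExampleGen { SI, CI, VI, GFX9, GFX10, GFX11, GFX12 };

static bool exampleFitsSMRDImmOffset(ExampleGen Gen, long long ByteOffs) {
  auto FitsUnsigned = [](long long V, unsigned Bits) {
    return V >= 0 && V < (1LL << Bits);
  };
  auto FitsSigned = [](long long V, unsigned Bits) {
    return V >= -(1LL << (Bits - 1)) && V < (1LL << (Bits - 1));
  };
  if (Gen == ExampleGen::SI)       // SI: 8-bit dword offset
    return FitsUnsigned(ByteOffs / 4, 8);
  if (Gen == ExampleGen::CI)       // CI: 32-bit literal dword offset
    return FitsUnsigned(ByteOffs / 4, 32);
  if (Gen < ExampleGen::GFX9)      // VI: 20-bit byte offset
    return FitsUnsigned(ByteOffs, 20);
  if (Gen < ExampleGen::GFX12)     // GFX9..GFX11: signed 21-bit byte offset
    return FitsSigned(ByteOffs, 21);
  return FitsSigned(ByteOffs, 24); // GFX12: signed 24-bit byte offset
}
// ----------------------------------------------------------------------------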
1960
1961bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1962                                        const MachineFunction &MF) const {
1963  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1964    return (MemVT.getSizeInBits() <= 4 * 32);
1965  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1966    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1967    return (MemVT.getSizeInBits() <= MaxPrivateBits);
1968  }
1969  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1970    return (MemVT.getSizeInBits() <= 2 * 32);
1971 return true;
1972}
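// --- Illustrative sketch (not part of the original source) -----------------
// The address-space dependent merge limits from canMergeStoresTo() above,
// expressed as a plain table lookup. MaxPrivateElemBytes stands in for
// getMaxPrivateElementSize() (typically 4, 8, or 16 bytes).
static unsigned exampleMaxMergedStoreBits(bool IsGlobalOrFlat, bool IsPrivate,
                                          bool IsLocalOrRegion,
                                          unsigned MaxPrivateElemBytes) {
  if (IsGlobalOrFlat)
    return 4 * 32;                  // up to a dwordx4 access
  if (IsPrivate)
    return 8 * MaxPrivateElemBytes; // limited by the scratch element size
  if (IsLocalOrRegion)
    return 2 * 32;                  // up to b64 / read2-write2 on LDS
  return ~0u;                       // other address spaces: no extra cap
}
// ----------------------------------------------------------------------------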
1973
1974bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1975    unsigned Size, unsigned AddrSpace, Align Alignment,
1976 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1977 if (IsFast)
1978 *IsFast = 0;
1979
1980 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1981 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1982 // Check if alignment requirements for ds_read/write instructions are
1983 // disabled.
1984 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1985 return false;
1986
1987 Align RequiredAlignment(
1988 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1989 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1990 Alignment < RequiredAlignment)
1991 return false;
1992
1993 // Either, the alignment requirements are "enabled", or there is an
1994 // unaligned LDS access related hardware bug though alignment requirements
1995 // are "disabled". In either case, we need to check for proper alignment
1996 // requirements.
1997 //
1998 switch (Size) {
1999 case 64:
2000 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2001 // address is negative, then the instruction is incorrectly treated as
2002 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2003 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2004 // load later in the SILoadStoreOptimizer.
2005 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2006 return false;
2007
2008    // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
2009 // can do a 4 byte aligned, 8 byte access in a single operation using
2010 // ds_read2/write2_b32 with adjacent offsets.
2011 RequiredAlignment = Align(4);
2012
2013 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2014 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2015 // ds_write2_b32 depending on the alignment. In either case with either
2016 // alignment there is no faster way of doing this.
2017
2018 // The numbers returned here and below are not additive, it is a 'speed
2019 // rank'. They are just meant to be compared to decide if a certain way
2020 // of lowering an operation is faster than another. For that purpose
2021 // naturally aligned operation gets it bitsize to indicate that "it
2022 // operates with a speed comparable to N-bit wide load". With the full
2023 // alignment ds128 is slower than ds96 for example. If underaligned it
2024 // is comparable to a speed of a single dword access, which would then
2025 // mean 32 < 128 and it is faster to issue a wide load regardless.
2026      // 1 is simply "slow, don't do it". I.e. when comparing an aligned load
2027      // to a wider load which will no longer be aligned, the latter is slower.
2028 if (IsFast)
2029 *IsFast = (Alignment >= RequiredAlignment) ? 64
2030 : (Alignment < Align(4)) ? 32
2031 : 1;
2032 return true;
2033 }
2034
2035 break;
2036 case 96:
2037 if (!Subtarget->hasDS96AndDS128())
2038 return false;
2039
2040    // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
2041 // gfx8 and older.
2042
2043 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2044 // Naturally aligned access is fastest. However, also report it is Fast
2045 // if memory is aligned less than DWORD. A narrow load or store will be
2046      // just as slow as a single ds_read_b96/ds_write_b96, but there will
2047 // be more of them, so overall we will pay less penalty issuing a single
2048 // instruction.
2049
2050 // See comment on the values above.
2051 if (IsFast)
2052 *IsFast = (Alignment >= RequiredAlignment) ? 96
2053 : (Alignment < Align(4)) ? 32
2054 : 1;
2055 return true;
2056 }
2057
2058 break;
2059 case 128:
2060 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2061 return false;
2062
2063    // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
2064    // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
2065 // single operation using ds_read2/write2_b64.
2066 RequiredAlignment = Align(8);
2067
2068 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2069 // Naturally aligned access is fastest. However, also report it is Fast
2070 // if memory is aligned less than DWORD. A narrow load or store will be
2071      // just as slow as a single ds_read_b128/ds_write_b128, but there
2072 // will be more of them, so overall we will pay less penalty issuing a
2073 // single instruction.
2074
2075 // See comment on the values above.
2076 if (IsFast)
2077 *IsFast = (Alignment >= RequiredAlignment) ? 128
2078 : (Alignment < Align(4)) ? 32
2079 : 1;
2080 return true;
2081 }
2082
2083 break;
2084 default:
2085 if (Size > 32)
2086 return false;
2087
2088 break;
2089 }
2090
2091 // See comment on the values above.
2092 // Note that we have a single-dword or sub-dword here, so if underaligned
2093  // it is the slowest possible access, hence the returned value is 0.
2094 if (IsFast)
2095 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2096
2097 return Alignment >= RequiredAlignment ||
2098 Subtarget->hasUnalignedDSAccessEnabled();
2099 }
2100
2101 // FIXME: We have to be conservative here and assume that flat operations
2102 // will access scratch. If we had access to the IR function, then we
2103 // could determine if any private memory was used in the function.
2104 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2105 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2106 bool AlignedBy4 = Alignment >= Align(4);
2107 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2108 if (IsFast)
2109 *IsFast = AlignedBy4 ? Size : 1;
2110 return true;
2111 }
2112
2113 if (IsFast)
2114 *IsFast = AlignedBy4;
2115
2116 return AlignedBy4;
2117 }
2118
2119 // So long as they are correct, wide global memory operations perform better
2120 // than multiple smaller memory ops -- even when misaligned
2121 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2122 if (IsFast)
2123 *IsFast = Size;
2124
2125 return Alignment >= Align(4) ||
2126 Subtarget->hasUnalignedBufferAccessEnabled();
2127 }
2128
2129 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2130 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2131 // out-of-bounds behavior, but in the edge case where an access starts
2132  // out-of-bounds and then enters in-bounds, the entire access would be treated
2133 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2134 // natural alignment of buffer accesses.
2135 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2136 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2137 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2138 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2139 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2140 return false;
2141 }
2142
2143 // Smaller than dword value must be aligned.
2144 if (Size < 32)
2145 return false;
2146
2147 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2148 // byte-address are ignored, thus forcing Dword alignment.
2149 // This applies to private, global, and constant memory.
2150 if (IsFast)
2151 *IsFast = 1;
2152
2153 return Size >= 32 && Alignment >= Align(4);
2154}
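// --- Illustrative sketch (not part of the original source) -----------------
// How the *IsFast values produced above are meant to be consumed: they are a
// relative "speed rank", not a bandwidth number. 0 means unsupported/slowest,
// 1 means "slow, don't do it", and larger values are only meaningful when
// compared against each other, e.g.:
static bool exampleWideAccessIsFaster(unsigned WideRank,
                                      unsigned PerElementRank) {
  // Prefer the wide access whenever it is usable and ranks at least as high
  // as a single narrower access, since the narrow path issues more
  // instructions for the same data.
  return WideRank != 0 && WideRank >= PerElementRank;
}
// ----------------------------------------------------------------------------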
2155
2156bool SITargetLowering::allowsMisalignedMemoryAccesses(
2157    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2158    unsigned *IsFast) const {
2159  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2160                                            Alignment, Flags, IsFast);
2161}
2162
2163EVT SITargetLowering::getOptimalMemOpType(
2164    LLVMContext &Context, const MemOp &Op,
2165 const AttributeList &FuncAttributes) const {
2166 // FIXME: Should account for address space here.
2167
2168 // The default fallback uses the private pointer size as a guess for a type to
2169 // use. Make sure we switch these to 64-bit accesses.
2170
2171 if (Op.size() >= 16 &&
2172 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2173 return MVT::v4i32;
2174
2175 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2176 return MVT::v2i32;
2177
2178 // Use the default.
2179 return MVT::Other;
2180}
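// --- Illustrative sketch (not part of the original source) -----------------
// The size/alignment thresholds used by getOptimalMemOpType() above, with
// plain integers standing in for MemOp and MVT.
enum class ExampleMemOpVT { V4I32, V2I32, Other };

static ExampleMemOpVT examplePickMemOpType(unsigned long long Size,
                                           unsigned DstAlign) {
  if (Size >= 16 && DstAlign >= 4)
    return ExampleMemOpVT::V4I32; // expand with 16-byte (dwordx4) accesses
  if (Size >= 8 && DstAlign >= 4)
    return ExampleMemOpVT::V2I32; // expand with 8-byte (dwordx2) accesses
  return ExampleMemOpVT::Other;   // defer to the generic choice
}
// ----------------------------------------------------------------------------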
2181
2182bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2183  const MemSDNode *MemNode = cast<MemSDNode>(N);
2184 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2185}
2186
2191
2192bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2193                                           unsigned DestAS) const {
2194 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2195 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2196 Subtarget->hasGloballyAddressableScratch()) {
2197 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2198 return false;
2199 }
2200
2201 // Flat -> private/local is a simple truncate.
2202 // Flat -> global is no-op
2203 return true;
2204 }
2205
2206 const GCNTargetMachine &TM =
2207 static_cast<const GCNTargetMachine &>(getTargetMachine());
2208 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2209}
2210
2218
2219bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2220                                                         Type *Ty) const {
2221 // FIXME: Could be smarter if called for vector constants.
2222 return true;
2223}
2224
2225bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2226                                               unsigned Index) const {
2227  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2228    return false;
2229
2230 // TODO: Add more cases that are cheap.
2231 return Index == 0;
2232}
2233
2234bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2235 // TODO: This should be more aggressive, particular for 16-bit element
2236 // vectors. However there are some mixed improvements and regressions.
2237 EVT EltTy = VT.getVectorElementType();
2238 return EltTy.getSizeInBits() % 32 == 0;
2239}
2240
2241bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2242  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2243 switch (Op) {
2244 case ISD::LOAD:
2245 case ISD::STORE:
2246 return true;
2247 default:
2248 return false;
2249 }
2250 }
2251
2252 // SimplifySetCC uses this function to determine whether or not it should
2253 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2254 if (VT == MVT::i1 && Op == ISD::SETCC)
2255 return false;
2256
2257  return TargetLowering::isTypeDesirableForOp(Op, VT);
2258}
2259
2262 // This isn't really a constant pool but close enough.
2265 return PtrInfo;
2266}
2267
2268SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2269 const SDLoc &SL,
2270 SDValue Chain,
2271 uint64_t Offset) const {
2272 const DataLayout &DL = DAG.getDataLayout();
2273  MachineFunction &MF = DAG.getMachineFunction();
2274  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2275  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2276
2277 auto [InputPtrReg, RC, ArgTy] =
2278 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2279
2280 // We may not have the kernarg segment argument if we have no kernel
2281 // arguments.
2282 if (!InputPtrReg)
2283 return DAG.getConstant(Offset, SL, PtrVT);
2284
2285  MachineRegisterInfo &MRI = MF.getRegInfo();
2286  SDValue BasePtr = DAG.getCopyFromReg(
2287 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2288
2289 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2290}
2291
2292SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2293 const SDLoc &SL) const {
2294  uint64_t Offset =
2295      getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT);
2296  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2297}
2298
2299SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2300 const SDLoc &SL) const {
2301
2302  Function &F = DAG.getMachineFunction().getFunction();
2303  std::optional<uint32_t> KnownSize =
2304      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2305  if (KnownSize.has_value())
2306 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2307 return SDValue();
2308}
2309
2310SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2311 const SDLoc &SL, SDValue Val,
2312 bool Signed,
2313 const ISD::InputArg *Arg) const {
2314 // First, if it is a widened vector, narrow it.
2315  if (VT.isVector() &&
2316      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2317    EVT NarrowedVT =
2318        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2319                         VT.getVectorNumElements());
2320    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2321 DAG.getConstant(0, SL, MVT::i32));
2322 }
2323
2324 // Then convert the vector elements or scalar value.
2325 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2326 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2327 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2328 }
2329
2330 if (MemVT.isFloatingPoint())
2331 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2332 else if (Signed)
2333 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2334 else
2335 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2336
2337 return Val;
2338}
2339
2340SDValue SITargetLowering::lowerKernargMemParameter(
2341 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2342 uint64_t Offset, Align Alignment, bool Signed,
2343 const ISD::InputArg *Arg) const {
2344
2345 MachinePointerInfo PtrInfo =
2347
2348 // Try to avoid using an extload by loading earlier than the argument address,
2349 // and extracting the relevant bits. The load should hopefully be merged with
2350 // the previous argument.
2351 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2352 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2353 int64_t AlignDownOffset = alignDown(Offset, 4);
2354 int64_t OffsetDiff = Offset - AlignDownOffset;
2355
2356 EVT IntVT = MemVT.changeTypeToInteger();
2357
2358 // TODO: If we passed in the base kernel offset we could have a better
2359 // alignment than 4, but we don't really need it.
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2361 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2362                               PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2363                               MachineMemOperand::MODereferenceable |
2364                                   MachineMemOperand::MOInvariant);
2365
2366 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2367 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2368
2369 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2370 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2371 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2372
2373 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2374 }
2375
2376 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2377 SDValue Load = DAG.getLoad(
2378      MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2379      MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2380
2381 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2382 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2383}
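// --- Illustrative sketch (not part of the original source) -----------------
// The align-down-and-shift arithmetic used by lowerKernargMemParameter()
// above for a sub-dword argument, shown on plain integers. For example a
// 16-bit argument at byte offset 6 is read via the dword at offset 4 and
// extracted with a shift of (6 - 4) * 8 = 16 bits.
static unsigned exampleExtractSubDwordArg(unsigned LoadedDword,
                                          unsigned long long ArgOffset,
                                          unsigned ArgBytes) {
  unsigned long long AlignDown = ArgOffset & ~3ULL; // alignDown(Offset, 4)
  unsigned ShiftAmt = (unsigned)(ArgOffset - AlignDown) * 8;
  unsigned Mask = ArgBytes >= 4 ? ~0u : ((1u << (ArgBytes * 8)) - 1);
  return (LoadedDword >> ShiftAmt) & Mask;           // SRL + TRUNCATE
}
// ----------------------------------------------------------------------------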
2384
2385/// Coerce an argument which was passed in a different ABI type to the original
2386/// expected value type.
2387SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2388 SDValue Val,
2389 CCValAssign &VA,
2390 const SDLoc &SL) const {
2391 EVT ValVT = VA.getValVT();
2392
2393 // If this is an 8 or 16-bit value, it is really passed promoted
2394 // to 32 bits. Insert an assert[sz]ext to capture this, then
2395 // truncate to the right size.
2396 switch (VA.getLocInfo()) {
2397 case CCValAssign::Full:
2398 return Val;
2399 case CCValAssign::BCvt:
2400 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2401 case CCValAssign::SExt:
2402 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2403 DAG.getValueType(ValVT));
2404 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2405 case CCValAssign::ZExt:
2406 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2407 DAG.getValueType(ValVT));
2408 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2409 case CCValAssign::AExt:
2410 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2411 default:
2412 llvm_unreachable("Unknown loc info!");
2413 }
2414}
2415
2416SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2417 CCValAssign &VA, const SDLoc &SL,
2418 SDValue Chain,
2419 const ISD::InputArg &Arg) const {
2420 MachineFunction &MF = DAG.getMachineFunction();
2421 MachineFrameInfo &MFI = MF.getFrameInfo();
2422
2423 if (Arg.Flags.isByVal()) {
2424 unsigned Size = Arg.Flags.getByValSize();
2425 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2426 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2427 }
2428
2429 unsigned ArgOffset = VA.getLocMemOffset();
2430 unsigned ArgSize = VA.getValVT().getStoreSize();
2431
2432 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2433
2434 // Create load nodes to retrieve arguments from the stack.
2435 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2436
2437  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2438  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2439  MVT MemVT = VA.getValVT();
2440
2441 switch (VA.getLocInfo()) {
2442 default:
2443 break;
2444 case CCValAssign::BCvt:
2445 MemVT = VA.getLocVT();
2446 break;
2447 case CCValAssign::SExt:
2448 ExtType = ISD::SEXTLOAD;
2449 break;
2450 case CCValAssign::ZExt:
2451 ExtType = ISD::ZEXTLOAD;
2452 break;
2453 case CCValAssign::AExt:
2454 ExtType = ISD::EXTLOAD;
2455 break;
2456 }
2457
2458 SDValue ArgValue = DAG.getExtLoad(
2459      ExtType, SL, VA.getLocVT(), Chain, FIN,
2460      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2461
2462 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2463 if (ConvertedVal == ArgValue)
2464 return ConvertedVal;
2465
2466 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2467}
2468
2469SDValue SITargetLowering::lowerWorkGroupId(
2470 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2471    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2472    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2473    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2474 if (!Subtarget->hasClusters())
2475 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2476
2477 // Clusters are supported. Return the global position in the grid. If clusters
2478 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2479
2480 // WorkGroupIdXYZ = ClusterId == 0 ?
2481 // ClusterIdXYZ :
2482 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2483 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2484 SDLoc SL(ClusterIdXYZ);
2485 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2486 SDValue One = DAG.getConstant(1, SL, VT);
2487 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2488 SDValue ClusterWorkGroupIdXYZ =
2489 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2490 SDValue GlobalIdXYZ =
2491 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2492 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2493
2494 switch (MFI.getClusterDims().getKind()) {
2497 return GlobalIdXYZ;
2499 return ClusterIdXYZ;
2501 using namespace AMDGPU::Hwreg;
2502 SDValue ClusterIdField =
2503 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2504 SDNode *GetReg =
2505 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2506 SDValue ClusterId(GetReg, 0);
2507 SDValue Zero = DAG.getConstant(0, SL, VT);
2508 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2509 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2510 }
2511 }
2512
2513 llvm_unreachable("nothing should reach here");
2514}
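// --- Illustrative sketch (not part of the original source) -----------------
// The per-dimension arithmetic used by lowerWorkGroupId() above to rebuild a
// global workgroup ID from cluster-relative values:
//   WorkGroupId = ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId
// When the dispatch does not use clusters, the register that normally holds
// the cluster ID already contains the plain workgroup ID (the ClusterId == 0
// select above), so it is returned directly.
static unsigned exampleGlobalWorkGroupId(unsigned ClusterId,
                                         unsigned ClusterMaxId,
                                         unsigned ClusterWorkGroupId,
                                         bool DispatchUsesClusters) {
  if (!DispatchUsesClusters)
    return ClusterId; // already the workgroup ID in this case
  unsigned WorkGroupsPerCluster = ClusterMaxId + 1;
  return ClusterId * WorkGroupsPerCluster + ClusterWorkGroupId;
}
// ----------------------------------------------------------------------------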
2515
2516SDValue SITargetLowering::getPreloadedValue(
2517 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2519 const ArgDescriptor *Reg = nullptr;
2520 const TargetRegisterClass *RC;
2521 LLT Ty;
2522
2524 const ArgDescriptor WorkGroupIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2526 // If GridZ is not programmed in an entry function then the hardware will set
2527 // it to all zeros, so there is no need to mask the GridY value in the low
2528 // order bits.
2529 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2530 AMDGPU::TTMP7,
2531 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2532 const ArgDescriptor WorkGroupIDZ =
2533 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2534 const ArgDescriptor ClusterWorkGroupIDX =
2535 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2536 const ArgDescriptor ClusterWorkGroupIDY =
2537 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2538 const ArgDescriptor ClusterWorkGroupIDZ =
2539 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2540 const ArgDescriptor ClusterWorkGroupMaxIDX =
2541 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2542 const ArgDescriptor ClusterWorkGroupMaxIDY =
2543 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2544 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2545 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2546 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2547 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2548
2549 auto LoadConstant = [&](unsigned N) {
2550 return DAG.getConstant(N, SDLoc(), VT);
2551 };
2552
2553 if (Subtarget->hasArchitectedSGPRs() &&
2555 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2556 bool HasFixedDims = ClusterDims.isFixedDims();
2557
2558 switch (PVID) {
2560 Reg = &WorkGroupIDX;
2561 RC = &AMDGPU::SReg_32RegClass;
2562 Ty = LLT::scalar(32);
2563 break;
2565 Reg = &WorkGroupIDY;
2566 RC = &AMDGPU::SReg_32RegClass;
2567 Ty = LLT::scalar(32);
2568 break;
2570 Reg = &WorkGroupIDZ;
2571 RC = &AMDGPU::SReg_32RegClass;
2572 Ty = LLT::scalar(32);
2573 break;
2575 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2576 return LoadConstant(0);
2577 Reg = &ClusterWorkGroupIDX;
2578 RC = &AMDGPU::SReg_32RegClass;
2579 Ty = LLT::scalar(32);
2580 break;
2582 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2583 return LoadConstant(0);
2584 Reg = &ClusterWorkGroupIDY;
2585 RC = &AMDGPU::SReg_32RegClass;
2586 Ty = LLT::scalar(32);
2587 break;
2589 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2590 return LoadConstant(0);
2591 Reg = &ClusterWorkGroupIDZ;
2592 RC = &AMDGPU::SReg_32RegClass;
2593 Ty = LLT::scalar(32);
2594 break;
2596 if (HasFixedDims)
2597 return LoadConstant(ClusterDims.getDims()[0] - 1);
2598 Reg = &ClusterWorkGroupMaxIDX;
2599 RC = &AMDGPU::SReg_32RegClass;
2600 Ty = LLT::scalar(32);
2601 break;
2603 if (HasFixedDims)
2604 return LoadConstant(ClusterDims.getDims()[1] - 1);
2605 Reg = &ClusterWorkGroupMaxIDY;
2606 RC = &AMDGPU::SReg_32RegClass;
2607 Ty = LLT::scalar(32);
2608 break;
2610 if (HasFixedDims)
2611 return LoadConstant(ClusterDims.getDims()[2] - 1);
2612 Reg = &ClusterWorkGroupMaxIDZ;
2613 RC = &AMDGPU::SReg_32RegClass;
2614 Ty = LLT::scalar(32);
2615 break;
2617 Reg = &ClusterWorkGroupMaxFlatID;
2618 RC = &AMDGPU::SReg_32RegClass;
2619 Ty = LLT::scalar(32);
2620 break;
2621 default:
2622 break;
2623 }
2624 }
2625
2626 if (!Reg)
2627 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2628  if (!Reg) {
2629    if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2630      // It's possible for a kernarg intrinsic call to appear in a kernel with
2631 // no allocated segment, in which case we do not add the user sgpr
2632 // argument, so just return null.
2633 return DAG.getConstant(0, SDLoc(), VT);
2634 }
2635
2636 // It's undefined behavior if a function marked with the amdgpu-no-*
2637 // attributes uses the corresponding intrinsic.
2638 return DAG.getPOISON(VT);
2639 }
2640
2641 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2642}
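// --- Illustrative sketch (not part of the original source) -----------------
// How the 4-bit TTMP6 fields described by the ArgDescriptor masks above would
// be unpacked from a raw register value. Field index 0..6 selects, lowest
// bits first: cluster workgroup ID X/Y/Z, the per-dimension max IDs X/Y/Z,
// and the max flat ID.
static unsigned exampleUnpackTTMP6Field(unsigned TTMP6, unsigned FieldIndex) {
  return (TTMP6 >> (FieldIndex * 4)) & 0xF;
}
// ----------------------------------------------------------------------------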
2643
2644static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2645                               CallingConv::ID CallConv,
2646                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2647                               FunctionType *FType,
2648                               SIMachineFunctionInfo *Info) {
2649 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2650 const ISD::InputArg *Arg = &Ins[I];
2651
2652 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2653 "vector type argument should have been split");
2654
2655 // First check if it's a PS input addr.
2656 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2657 PSInputNum <= 15) {
2658 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2659
2660 // Inconveniently only the first part of the split is marked as isSplit,
2661 // so skip to the end. We only want to increment PSInputNum once for the
2662 // entire split argument.
2663 if (Arg->Flags.isSplit()) {
2664 while (!Arg->Flags.isSplitEnd()) {
2665 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2666 "unexpected vector split in ps argument type");
2667 if (!SkipArg)
2668 Splits.push_back(*Arg);
2669 Arg = &Ins[++I];
2670 }
2671 }
2672
2673 if (SkipArg) {
2674 // We can safely skip PS inputs.
2675 Skipped.set(Arg->getOrigArgIndex());
2676 ++PSInputNum;
2677 continue;
2678 }
2679
2680 Info->markPSInputAllocated(PSInputNum);
2681 if (Arg->Used)
2682 Info->markPSInputEnabled(PSInputNum);
2683
2684 ++PSInputNum;
2685 }
2686
2687 Splits.push_back(*Arg);
2688 }
2689}
2690
2691// Allocate special inputs passed in VGPRs.
2692void SITargetLowering::allocateSpecialEntryInputVGPRs(
2693    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2694 SIMachineFunctionInfo &Info) const {
2695 const LLT S32 = LLT::scalar(32);
2696  MachineRegisterInfo &MRI = MF.getRegInfo();
2697
2698 if (Info.hasWorkItemIDX()) {
2699 Register Reg = AMDGPU::VGPR0;
2700 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2701
2702 CCInfo.AllocateReg(Reg);
2703 unsigned Mask =
2704 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2705 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2706 }
2707
2708 if (Info.hasWorkItemIDY()) {
2709 assert(Info.hasWorkItemIDX());
2710 if (Subtarget->hasPackedTID()) {
2711 Info.setWorkItemIDY(
2712 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2713 } else {
2714 unsigned Reg = AMDGPU::VGPR1;
2715 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2716
2717 CCInfo.AllocateReg(Reg);
2718 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2719 }
2720 }
2721
2722 if (Info.hasWorkItemIDZ()) {
2723 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2724 if (Subtarget->hasPackedTID()) {
2725 Info.setWorkItemIDZ(
2726 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2727 } else {
2728 unsigned Reg = AMDGPU::VGPR2;
2729 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2730
2731 CCInfo.AllocateReg(Reg);
2732 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2733 }
2734 }
2735}
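// --- Illustrative sketch (not part of the original source) -----------------
// With packed TIDs, all three work-item IDs live in a single VGPR in the
// 10-bit fields selected by the masks above: X in bits [9:0], Y in [19:10],
// Z in [29:20].
static unsigned exampleUnpackWorkItemId(unsigned PackedTID, unsigned Dim) {
  return (PackedTID >> (Dim * 10)) & 0x3FF; // Dim: 0 = X, 1 = Y, 2 = Z
}
// ----------------------------------------------------------------------------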
2736
2737// Try to allocate a VGPR at the end of the argument list, or if no argument
2738// VGPRs are left allocating a stack slot.
2739// If \p Mask is given it indicates bitfield position in the register.
2740// If \p Arg is given, use it with the new \p Mask instead of allocating a new register.
2741static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2742 ArgDescriptor Arg = ArgDescriptor()) {
2743 if (Arg.isSet())
2744 return ArgDescriptor::createArg(Arg, Mask);
2745
2746 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2747 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2748 if (RegIdx == ArgVGPRs.size()) {
2749 // Spill to stack required.
2750 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2751
2752 return ArgDescriptor::createStack(Offset, Mask);
2753 }
2754
2755 unsigned Reg = ArgVGPRs[RegIdx];
2756 Reg = CCInfo.AllocateReg(Reg);
2757 assert(Reg != AMDGPU::NoRegister);
2758
2759 MachineFunction &MF = CCInfo.getMachineFunction();
2760 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2761 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2762 return ArgDescriptor::createRegister(Reg, Mask);
2763}
2764
2765static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2766                                             const TargetRegisterClass *RC,
2767 unsigned NumArgRegs) {
2768 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2769 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2770 if (RegIdx == ArgSGPRs.size())
2771 report_fatal_error("ran out of SGPRs for arguments");
2772
2773 unsigned Reg = ArgSGPRs[RegIdx];
2774 Reg = CCInfo.AllocateReg(Reg);
2775 assert(Reg != AMDGPU::NoRegister);
2776
2777 MachineFunction &MF = CCInfo.getMachineFunction();
2778  MF.addLiveIn(Reg, RC);
2779  return ArgDescriptor::createRegister(Reg);
2780}
2781
2782// If this has a fixed position, we still should allocate the register in the
2783// CCInfo state. Technically we could get away with this for values passed
2784// outside of the normal argument range.
2785static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2786                                       const TargetRegisterClass *RC,
2787 MCRegister Reg) {
2788 Reg = CCInfo.AllocateReg(Reg);
2789 assert(Reg != AMDGPU::NoRegister);
2790 MachineFunction &MF = CCInfo.getMachineFunction();
2791 MF.addLiveIn(Reg, RC);
2792}
2793
2794static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2795 if (Arg) {
2796 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2797 Arg.getRegister());
2798 } else
2799 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2800}
2801
2802static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2803 if (Arg) {
2804 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2805 Arg.getRegister());
2806 } else
2807 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2808}
2809
2810/// Allocate implicit function VGPR arguments at the end of allocated user
2811/// arguments.
2812void SITargetLowering::allocateSpecialInputVGPRs(
2813    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2814 SIMachineFunctionInfo &Info) const {
2815 const unsigned Mask = 0x3ff;
2816 ArgDescriptor Arg;
2817
2818 if (Info.hasWorkItemIDX()) {
2819 Arg = allocateVGPR32Input(CCInfo, Mask);
2820 Info.setWorkItemIDX(Arg);
2821 }
2822
2823 if (Info.hasWorkItemIDY()) {
2824 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2825 Info.setWorkItemIDY(Arg);
2826 }
2827
2828 if (Info.hasWorkItemIDZ())
2829 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2830}
2831
2832/// Allocate implicit function VGPR arguments in fixed registers.
2833void SITargetLowering::allocateSpecialInputVGPRsFixed(
2834    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2835 SIMachineFunctionInfo &Info) const {
2836 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2837 if (!Reg)
2838 report_fatal_error("failed to allocate VGPR for implicit arguments");
2839
2840 const unsigned Mask = 0x3ff;
2841 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2842 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2843 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2844}
2845
2846void SITargetLowering::allocateSpecialInputSGPRs(
2847    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2848 SIMachineFunctionInfo &Info) const {
2849 auto &ArgInfo = Info.getArgInfo();
2850 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2851
2852 // TODO: Unify handling with private memory pointers.
2853 if (UserSGPRInfo.hasDispatchPtr())
2854 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2855
2856 if (UserSGPRInfo.hasQueuePtr())
2857 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2858
2859 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2860 // constant offset from the kernarg segment.
2861 if (Info.hasImplicitArgPtr())
2862 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2863
2864 if (UserSGPRInfo.hasDispatchID())
2865 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2866
2867 // flat_scratch_init is not applicable for non-kernel functions.
2868
2869 if (Info.hasWorkGroupIDX())
2870 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2871
2872 if (Info.hasWorkGroupIDY())
2873 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2874
2875 if (Info.hasWorkGroupIDZ())
2876 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2877
2878 if (Info.hasLDSKernelId())
2879 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2880}
2881
2882// Allocate special inputs passed in user SGPRs.
2883void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2884                                            MachineFunction &MF,
2885 const SIRegisterInfo &TRI,
2886 SIMachineFunctionInfo &Info) const {
2887 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2888 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2889 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2890 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2891 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2892 }
2893
2894 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2895 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2896 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2897 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2898 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2899 }
2900
2901 if (UserSGPRInfo.hasDispatchPtr()) {
2902 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2903 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2904 CCInfo.AllocateReg(DispatchPtrReg);
2905 }
2906
2907 if (UserSGPRInfo.hasQueuePtr()) {
2908 Register QueuePtrReg = Info.addQueuePtr(TRI);
2909 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2910 CCInfo.AllocateReg(QueuePtrReg);
2911 }
2912
2913 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2914    MachineRegisterInfo &MRI = MF.getRegInfo();
2915    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2916 CCInfo.AllocateReg(InputPtrReg);
2917
2918 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2919 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2920 }
2921
2922 if (UserSGPRInfo.hasDispatchID()) {
2923 Register DispatchIDReg = Info.addDispatchID(TRI);
2924 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2925 CCInfo.AllocateReg(DispatchIDReg);
2926 }
2927
2928 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2929 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2930 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2931 CCInfo.AllocateReg(FlatScratchInitReg);
2932 }
2933
2934 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2935 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2936 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2937 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2938 }
2939
2940 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2941 // these from the dispatch pointer.
2942}
2943
2944// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2945// sequential starting from the first argument.
2946void SITargetLowering::allocatePreloadKernArgSGPRs(
2947    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2948    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2949    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2950 Function &F = MF.getFunction();
2951 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2952 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2953 bool InPreloadSequence = true;
2954 unsigned InIdx = 0;
2955 bool AlignedForImplictArgs = false;
2956 unsigned ImplicitArgOffset = 0;
2957 for (auto &Arg : F.args()) {
2958 if (!InPreloadSequence || !Arg.hasInRegAttr())
2959 break;
2960
2961 unsigned ArgIdx = Arg.getArgNo();
2962 // Don't preload non-original args or parts not in the current preload
2963 // sequence.
2964 if (InIdx < Ins.size() &&
2965 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2966 break;
2967
2968 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2969 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2970 InIdx++) {
2971 assert(ArgLocs[ArgIdx].isMemLoc());
2972 auto &ArgLoc = ArgLocs[InIdx];
2973 const Align KernelArgBaseAlign = Align(16);
2974 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2975 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2976 unsigned NumAllocSGPRs =
2977 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2978
2979 // Fix alignment for hidden arguments.
2980 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2981 if (!AlignedForImplictArgs) {
2982 ImplicitArgOffset =
2983 alignTo(LastExplicitArgOffset,
2984 Subtarget->getAlignmentForImplicitArgPtr()) -
2985 LastExplicitArgOffset;
2986 AlignedForImplictArgs = true;
2987 }
2988 ArgOffset += ImplicitArgOffset;
2989 }
2990
2991 // Arg is preloaded into the previous SGPR.
2992 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2993 assert(InIdx >= 1 && "No previous SGPR");
2994 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2995 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2996 continue;
2997 }
2998
2999 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3000 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3001 // Check for free user SGPRs for preloading.
3002 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3003 InPreloadSequence = false;
3004 break;
3005 }
3006
3007 // Preload this argument.
3008 const TargetRegisterClass *RC =
3009 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3010 SmallVectorImpl<MCRegister> *PreloadRegs =
3011 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3012
3013 if (PreloadRegs->size() > 1)
3014 RC = &AMDGPU::SGPR_32RegClass;
3015 for (auto &Reg : *PreloadRegs) {
3016 assert(Reg);
3017 MF.addLiveIn(Reg, RC);
3018 CCInfo.AllocateReg(Reg);
3019 }
3020
3021 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3022 }
3023 }
3024}
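// --- Illustrative sketch (not part of the original source) -----------------
// The SGPR accounting used above when deciding whether the next kernarg can
// still be preloaded: padding between arguments also consumes user SGPRs.
static bool exampleCanPreloadNextArg(unsigned ArgOffset,
                                     unsigned LastExplicitArgOffset,
                                     unsigned ArgSizeInBits,
                                     unsigned FreeUserSGPRs) {
  unsigned PaddingBytes = ArgOffset - LastExplicitArgOffset;
  unsigned PaddingSGPRs = (PaddingBytes + 3) / 4;     // alignTo(Padding, 4) / 4
  unsigned NumAllocSGPRs = (ArgSizeInBits + 31) / 32; // alignTo(Bits, 32) / 32
  return PaddingSGPRs + NumAllocSGPRs <= FreeUserSGPRs;
}
// ----------------------------------------------------------------------------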
3025
3026void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3027                                           const SIRegisterInfo &TRI,
3028 SIMachineFunctionInfo &Info) const {
3029 // Always allocate this last since it is a synthetic preload.
3030 if (Info.hasLDSKernelId()) {
3031 Register Reg = Info.addLDSKernelId();
3032 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3033 CCInfo.AllocateReg(Reg);
3034 }
3035}
3036
3037// Allocate special input registers that are initialized per-wave.
3038void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3039                                           SIMachineFunctionInfo &Info,
3040                                           CallingConv::ID CallConv,
3041 bool IsShader) const {
3042 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3043 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3044 // Note: user SGPRs are handled by the front-end for graphics shaders
3045 // Pad up the used user SGPRs with dead inputs.
3046
3047 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3048 // before enabling architected SGPRs for workgroup IDs.
3049 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3050
3051 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3052 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3053 // rely on it to reach 16 since if we end up having no stack usage, it will
3054 // not really be added.
3055 unsigned NumRequiredSystemSGPRs =
3056 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3057 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3058 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3059 Register Reg = Info.addReservedUserSGPR();
3060 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3061 CCInfo.AllocateReg(Reg);
3062 }
3063 }
3064
3065 if (!HasArchitectedSGPRs) {
3066 if (Info.hasWorkGroupIDX()) {
3067 Register Reg = Info.addWorkGroupIDX();
3068 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3069 CCInfo.AllocateReg(Reg);
3070 }
3071
3072 if (Info.hasWorkGroupIDY()) {
3073 Register Reg = Info.addWorkGroupIDY();
3074 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3075 CCInfo.AllocateReg(Reg);
3076 }
3077
3078 if (Info.hasWorkGroupIDZ()) {
3079 Register Reg = Info.addWorkGroupIDZ();
3080 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3081 CCInfo.AllocateReg(Reg);
3082 }
3083 }
3084
3085 if (Info.hasWorkGroupInfo()) {
3086 Register Reg = Info.addWorkGroupInfo();
3087 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3088 CCInfo.AllocateReg(Reg);
3089 }
3090
3091 if (Info.hasPrivateSegmentWaveByteOffset()) {
3092 // Scratch wave offset passed in system SGPR.
3093 unsigned PrivateSegmentWaveByteOffsetReg;
3094
3095 if (IsShader) {
3096 PrivateSegmentWaveByteOffsetReg =
3097 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3098
3099 // This is true if the scratch wave byte offset doesn't have a fixed
3100 // location.
3101 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3102 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3103 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3104 }
3105 } else
3106 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3107
3108 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3109 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3110 }
3111
3112 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3113 Info.getNumPreloadedSGPRs() >= 16);
3114}
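// --- Illustrative sketch (not part of the original source) -----------------
// The padding count implied by the user-SGPR-init-16 workaround above: enough
// dead inputs are added so that user + system SGPRs reach 16.
static unsigned examplePaddingSGPRs(unsigned CurrentUserSGPRs,
                                    unsigned NumRequiredSystemSGPRs) {
  unsigned Used = CurrentUserSGPRs + NumRequiredSystemSGPRs;
  return Used >= 16 ? 0 : 16 - Used;
}
// ----------------------------------------------------------------------------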
3115
3116void SITargetLowering::reservePrivateMemoryRegs(const TargetMachine &TM,
3117                                                MachineFunction &MF,
3118                                                const SIRegisterInfo &TRI,
3119                                                SIMachineFunctionInfo &Info) const {
3120  // Now that we've figured out where the scratch register inputs are, see if
3121  // we should reserve the arguments and use them directly.
3122 MachineFrameInfo &MFI = MF.getFrameInfo();
3123 bool HasStackObjects = MFI.hasStackObjects();
3124 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3125
3126 // Record that we know we have non-spill stack objects so we don't need to
3127 // check all stack objects later.
3128 if (HasStackObjects)
3129 Info.setHasNonSpillStackObjects(true);
3130
3131 // Everything live out of a block is spilled with fast regalloc, so it's
3132 // almost certain that spilling will be required.
3133  if (TM.getOptLevel() == CodeGenOptLevel::None)
3134    HasStackObjects = true;
3135
3136 // For now assume stack access is needed in any callee functions, so we need
3137 // the scratch registers to pass in.
3138 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3139
3140 if (!ST.enableFlatScratch()) {
3141 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3142 // If we have stack objects, we unquestionably need the private buffer
3143 // resource. For the Code Object V2 ABI, this will be the first 4 user
3144 // SGPR inputs. We can reserve those and use them directly.
3145
3146      Register PrivateSegmentBufferReg =
3147          Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3148      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3149 } else {
3150 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3151 // We tentatively reserve the last registers (skipping the last registers
3152 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3153 // we'll replace these with the ones immediately after those which were
3154 // really allocated. In the prologue copies will be inserted from the
3155 // argument to these reserved registers.
3156
3157 // Without HSA, relocations are used for the scratch pointer and the
3158 // buffer resource setup is always inserted in the prologue. Scratch wave
3159 // offset is still in an input SGPR.
3160 Info.setScratchRSrcReg(ReservedBufferReg);
3161 }
3162 }
3163
3164  MachineRegisterInfo &MRI = MF.getRegInfo();
3165
3166 // For entry functions we have to set up the stack pointer if we use it,
3167 // whereas non-entry functions get this "for free". This means there is no
3168 // intrinsic advantage to using S32 over S34 in cases where we do not have
3169 // calls but do need a frame pointer (i.e. if we are requested to have one
3170 // because frame pointer elimination is disabled). To keep things simple we
3171 // only ever use S32 as the call ABI stack pointer, and so using it does not
3172 // imply we need a separate frame pointer.
3173 //
3174 // Try to use s32 as the SP, but move it if it would interfere with input
3175 // arguments. This won't work with calls though.
3176 //
3177 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3178 // registers.
3179 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3180 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3181 } else {
3182    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
3183
3184 if (MFI.hasCalls())
3185 report_fatal_error("call in graphics shader with too many input SGPRs");
3186
3187 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3188 if (!MRI.isLiveIn(Reg)) {
3189 Info.setStackPtrOffsetReg(Reg);
3190 break;
3191 }
3192 }
3193
3194 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3195 report_fatal_error("failed to find register for SP");
3196 }
3197
3198 // hasFP should be accurate for entry functions even before the frame is
3199 // finalized, because it does not rely on the known stack size, only
3200 // properties like whether variable sized objects are present.
3201 if (ST.getFrameLowering()->hasFP(MF)) {
3202 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3203 }
3204}
3205
3208 return !Info->isEntryFunction();
3209}
3210
3211void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
3212
3213void SITargetLowering::insertCopiesSplitCSR(
3214    MachineBasicBlock *Entry,
3215    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3216  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3217
3218 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3219 if (!IStart)
3220 return;
3221
3222 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3223 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3224 MachineBasicBlock::iterator MBBI = Entry->begin();
3225 for (const MCPhysReg *I = IStart; *I; ++I) {
3226 const TargetRegisterClass *RC = nullptr;
3227 if (AMDGPU::SReg_64RegClass.contains(*I))
3228 RC = &AMDGPU::SGPR_64RegClass;
3229 else if (AMDGPU::SReg_32RegClass.contains(*I))
3230 RC = &AMDGPU::SGPR_32RegClass;
3231 else
3232 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3233
3234 Register NewVR = MRI->createVirtualRegister(RC);
3235 // Create copy from CSR to a virtual register.
3236 Entry->addLiveIn(*I);
3237 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3238 .addReg(*I);
3239
3240 // Insert the copy-back instructions right before the terminator.
3241 for (auto *Exit : Exits)
3242 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3243 TII->get(TargetOpcode::COPY), *I)
3244 .addReg(NewVR);
3245 }
3246}
3247
3248SDValue SITargetLowering::LowerFormalArguments(
3249    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3250 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3251 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3252  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3253
3254  MachineFunction &MF = DAG.getMachineFunction();
3255  const Function &Fn = MF.getFunction();
3256  FunctionType *FType = MF.getFunction().getFunctionType();
3257  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3258  bool IsError = false;
3259
3260 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3261    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3262        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3263 IsError = true;
3264 }
3265
3266  SmallVector<ISD::InputArg, 16> Splits;
3267  SmallVector<CCValAssign, 16> ArgLocs;
3268  BitVector Skipped(Ins.size());
3269 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3270 *DAG.getContext());
3271
3272 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3273 bool IsKernel = AMDGPU::isKernel(CallConv);
3274 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3275
3276 if (IsGraphics) {
3277 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3278 assert(!UserSGPRInfo.hasDispatchPtr() &&
3279 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3280 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3281 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3282 (void)UserSGPRInfo;
3283 if (!Subtarget->enableFlatScratch())
3284 assert(!UserSGPRInfo.hasFlatScratchInit());
3285 if ((CallConv != CallingConv::AMDGPU_CS &&
3286 CallConv != CallingConv::AMDGPU_Gfx &&
3287 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3288 !Subtarget->hasArchitectedSGPRs())
3289 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3290 !Info->hasWorkGroupIDZ());
3291 }
3292
3293 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3294
3295 if (CallConv == CallingConv::AMDGPU_PS) {
3296 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3297
3298 // At least one interpolation mode must be enabled or else the GPU will
3299 // hang.
3300 //
3301 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3302 // set PSInputAddr, the user wants to enable some bits after the compilation
3303 // based on run-time states. Since we can't know what the final PSInputEna
3304    // will look like, we shouldn't do anything here and the user should take
3305 // responsibility for the correct programming.
3306 //
3307 // Otherwise, the following restrictions apply:
3308 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3309 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3310 // enabled too.
3311 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3312 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3313 CCInfo.AllocateReg(AMDGPU::VGPR0);
3314 CCInfo.AllocateReg(AMDGPU::VGPR1);
3315 Info->markPSInputAllocated(0);
3316 Info->markPSInputEnabled(0);
3317 }
3318 if (Subtarget->isAmdPalOS()) {
3319 // For isAmdPalOS, the user does not enable some bits after compilation
3320 // based on run-time states; the register values being generated here are
3321 // the final ones set in hardware. Therefore we need to apply the
3322 // workaround to PSInputAddr and PSInputEnable together. (The case where
3323 // a bit is set in PSInputAddr but not PSInputEnable is where the
3324 // frontend set up an input arg for a particular interpolation mode, but
3325 // nothing uses that input arg. Really we should have an earlier pass
3326 // that removes such an arg.)
3327 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3328 if ((PsInputBits & 0x7F) == 0 ||
3329 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3330 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3331 }
3332 } else if (IsKernel) {
3333 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3334 } else {
3335 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3336 Ins.end());
3337 }
3338
3339 if (IsKernel)
3340 analyzeFormalArgumentsCompute(CCInfo, Ins);
3341
3342 if (IsEntryFunc) {
3343 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3344 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3345 if (IsKernel && Subtarget->hasKernargPreload())
3346 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3347
3348 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3349 } else if (!IsGraphics) {
3350 // For the fixed ABI, pass workitem IDs in the last argument register.
3351 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3352
3353 // FIXME: Sink this into allocateSpecialInputSGPRs
3354 if (!Subtarget->enableFlatScratch())
3355 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3356
3357 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3358 }
3359
3360 if (!IsKernel) {
3361 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3362 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3363
3364 // This assumes the registers are allocated by CCInfo in ascending order
3365 // with no gaps.
3366 Info->setNumWaveDispatchSGPRs(
3367 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3368 Info->setNumWaveDispatchVGPRs(
3369 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3370 } else if (Info->getNumKernargPreloadedSGPRs()) {
3371 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3372 }
3373
3374  SmallVector<SDValue, 16> Chains;
3375
3376 if (IsWholeWaveFunc) {
3377 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3378 {MVT::i1, MVT::Other}, Chain);
3379 InVals.push_back(Setup.getValue(0));
3380 Chains.push_back(Setup.getValue(1));
3381 }
3382
3383 // FIXME: This is the minimum kernel argument alignment. We should improve
3384 // this to the maximum alignment of the arguments.
3385 //
3386 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3387 // kern arg offset.
3388 const Align KernelArgBaseAlign = Align(16);
3389
3390 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3391 ++i) {
3392 const ISD::InputArg &Arg = Ins[i];
3393 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3394 InVals.push_back(DAG.getPOISON(Arg.VT));
3395 continue;
3396 }
3397
3398 CCValAssign &VA = ArgLocs[ArgIdx++];
3399 MVT VT = VA.getLocVT();
3400
3401 if (IsEntryFunc && VA.isMemLoc()) {
3402 VT = Ins[i].VT;
3403 EVT MemVT = VA.getLocVT();
3404
3405 const uint64_t Offset = VA.getLocMemOffset();
3406 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3407
3408 if (Arg.Flags.isByRef()) {
3409 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3410
3411 const GCNTargetMachine &TM =
3412 static_cast<const GCNTargetMachine &>(getTargetMachine());
3413 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3414 Arg.Flags.getPointerAddrSpace())) {
3415          Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3416                                     Arg.Flags.getPointerAddrSpace());
3417        }
3418
3419 InVals.push_back(Ptr);
3420 continue;
3421 }
3422
3423 SDValue NewArg;
3424 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3425 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3426 // In this case the argument is packed into the previous preload SGPR.
3427 int64_t AlignDownOffset = alignDown(Offset, 4);
3428 int64_t OffsetDiff = Offset - AlignDownOffset;
3429 EVT IntVT = MemVT.changeTypeToInteger();
3430
3431          const SIMachineFunctionInfo *Info =
3432              MF.getInfo<SIMachineFunctionInfo>();
3433          MachineRegisterInfo &MRI = MF.getRegInfo();
3434          Register Reg =
3435 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3436
3437 assert(Reg);
3438 Register VReg = MRI.getLiveInVirtReg(Reg);
3439 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3440
3441 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3442 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3443
3444 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3445 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3446 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3447 Ins[i].Flags.isSExt(), &Ins[i]);
3448
3449 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3450 } else {
3451          const SIMachineFunctionInfo *Info =
3452              MF.getInfo<SIMachineFunctionInfo>();
3453          MachineRegisterInfo &MRI = MF.getRegInfo();
3454          const SmallVectorImpl<MCRegister> &PreloadRegs =
3455 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3456
3457 SDValue Copy;
3458 if (PreloadRegs.size() == 1) {
3459 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3460 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3461 NewArg = DAG.getCopyFromReg(
3462 Chain, DL, VReg,
3463                EVT::getIntegerVT(*DAG.getContext(),
3464                                  TRI->getRegSizeInBits(*RC)));
3465
3466 } else {
3467 // If the kernarg alignment does not match the alignment of the SGPR
3468 // tuple RC that can accommodate this argument, it will be built up
3469            // via copies from the individual SGPRs that the argument was
3470 // preloaded to.
3471            SmallVector<SDValue, 4> Elts;
3472            for (auto Reg : PreloadRegs) {
3473 Register VReg = MRI.getLiveInVirtReg(Reg);
3474 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3475 Elts.push_back(Copy);
3476 }
3477 NewArg =
3478 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3479 PreloadRegs.size()),
3480 DL, Elts);
3481 }
3482
3483 // If the argument was preloaded to multiple consecutive 32-bit
3484 // registers because of misalignment between addressable SGPR tuples
3485 // and the argument size, we can still assume that because of kernarg
3486 // segment alignment restrictions that NewArg's size is the same as
3487 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3488 // truncate since we cannot preload to less than a single SGPR and the
3489 // MemVT may be smaller.
3490          EVT MemVTInt =
3491              EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3492 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3493 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3494
3495 NewArg = DAG.getBitcast(MemVT, NewArg);
3496 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3497 Ins[i].Flags.isSExt(), &Ins[i]);
3498 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3499 }
3500 } else {
3501 // Hidden arguments that are in the kernel signature must be preloaded
3502 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3503 // the argument list and is not preloaded.
3504 if (Arg.isOrigArg()) {
3505 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3506 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3507          DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3508              *OrigArg->getParent(),
3509 "hidden argument in kernel signature was not preloaded",
3510 DL.getDebugLoc()));
3511 }
3512 }
3513
3514 NewArg =
3515 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3516 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3517 }
3518 Chains.push_back(NewArg.getValue(1));
3519
3520 auto *ParamTy =
3521 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3522 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3523 ParamTy &&
3524 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3525 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3526 // On SI local pointers are just offsets into LDS, so they are always
3527 // less than 16-bits. On CI and newer they could potentially be
3528 // real pointers, so we can't guarantee their size.
3529 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3530 DAG.getValueType(MVT::i16));
3531 }
3532
3533 InVals.push_back(NewArg);
3534 continue;
3535 }
3536 if (!IsEntryFunc && VA.isMemLoc()) {
3537 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3538 InVals.push_back(Val);
3539 if (!Arg.Flags.isByVal())
3540 Chains.push_back(Val.getValue(1));
3541 continue;
3542 }
3543
3544 assert(VA.isRegLoc() && "Parameter must be in a register!");
3545
3546 Register Reg = VA.getLocReg();
3547 const TargetRegisterClass *RC = nullptr;
3548 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3549 RC = &AMDGPU::VGPR_32RegClass;
3550 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3551 RC = &AMDGPU::SGPR_32RegClass;
3552 else
3553 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3554
3555 Reg = MF.addLiveIn(Reg, RC);
3556 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3557
3558 if (Arg.Flags.isSRet()) {
3559 // The return object should be reasonably addressable.
3560
3561 // FIXME: This helps when the return is a real sret. If it is an
3562 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3563 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3564 unsigned NumBits =
3566 Val = DAG.getNode(
3567 ISD::AssertZext, DL, VT, Val,
3568 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3569 }
3570
3571 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3572 InVals.push_back(Val);
3573 }
3574
3575 // Start adding system SGPRs.
3576 if (IsEntryFunc)
3577 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3578
3579 if (DAG.getPass()) {
3580 auto &ArgUsageInfo =
3582 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3583 } else if (auto *MFAM = DAG.getMFAM()) {
3584 Module &M = *MF.getFunction().getParent();
3585 auto *ArgUsageInfo =
3587 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3588 if (ArgUsageInfo)
3589 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3590 }
3591
3592 unsigned StackArgSize = CCInfo.getStackSize();
3593 Info->setBytesInStackArgArea(StackArgSize);
3594
3595 return Chains.empty() ? Chain
3596 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3597}
3598
3599// TODO: If return values can't fit in registers, we should return as many as
3600 // possible in registers before passing the rest on the stack.
3602 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3603 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3604 const Type *RetTy) const {
3605 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3606 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3607 // for shaders. Vector types should be explicitly handled by CC.
3608 if (AMDGPU::isEntryFunctionCC(CallConv))
3609 return true;
3610
3612 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3613 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3614 return false;
3615
3616 // We must use the stack if return would require unavailable registers.
3617 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3618 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3619 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3620 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3621 return false;
3622
3623 return true;
3624}
3625
3626SDValue
3628 bool isVarArg,
3630 const SmallVectorImpl<SDValue> &OutVals,
3631 const SDLoc &DL, SelectionDAG &DAG) const {
3635
3636 if (AMDGPU::isKernel(CallConv)) {
3637 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3638 OutVals, DL, DAG);
3639 }
3640
3641 bool IsShader = AMDGPU::isShader(CallConv);
3642
3643 Info->setIfReturnsVoid(Outs.empty());
3644 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3645
3646 // CCValAssign - represents the assignment of the return value to a location.
3648
3649 // CCState - Info about the registers and stack slots.
3650 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3651 *DAG.getContext());
3652
3653 // Analyze outgoing return values.
3654 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3655
3656 SDValue Glue;
3658 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3659
3660 SDValue ReadFirstLane =
3661 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3662 // Copy the result values into the output registers.
3663 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3664 ++I, ++RealRVLocIdx) {
3665 CCValAssign &VA = RVLocs[I];
3666 assert(VA.isRegLoc() && "Can only return in registers!");
3667 // TODO: Partially return in registers if return values don't fit.
3668 SDValue Arg = OutVals[RealRVLocIdx];
3669
3670 // Copied from other backends.
3671 switch (VA.getLocInfo()) {
3672 case CCValAssign::Full:
3673 break;
3674 case CCValAssign::BCvt:
3675 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3676 break;
3677 case CCValAssign::SExt:
3678 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3679 break;
3680 case CCValAssign::ZExt:
3681 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3682 break;
3683 case CCValAssign::AExt:
3684 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3685 break;
3686 default:
3687 llvm_unreachable("Unknown loc info!");
3688 }
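// Values returned in SGPRs must be wave-uniform, so run them through
// readfirstlane in case the selected value actually lives in a VGPR.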
3689 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3691 ReadFirstLane, Arg);
3692 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3693 Glue = Chain.getValue(1);
3694 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3695 }
3696
3697 // FIXME: Does sret work properly?
3698 if (!Info->isEntryFunction()) {
3699 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3700 const MCPhysReg *I =
3701 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3702 if (I) {
3703 for (; *I; ++I) {
3704 if (AMDGPU::SReg_64RegClass.contains(*I))
3705 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3706 else if (AMDGPU::SReg_32RegClass.contains(*I))
3707 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3708 else
3709 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3710 }
3711 }
3712 }
3713
3714 // Update chain and glue.
3715 RetOps[0] = Chain;
3716 if (Glue.getNode())
3717 RetOps.push_back(Glue);
3718
3719 unsigned Opc = AMDGPUISD::ENDPGM;
3720 if (!IsWaveEnd)
3721 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3722 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3723 : AMDGPUISD::RET_GLUE;
3724 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3725}
3726
3728 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3729 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3730 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3731 SDValue ThisVal) const {
3732 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3733
3734 // Assign locations to each value returned by this call.
3736 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3737 *DAG.getContext());
3738 CCInfo.AnalyzeCallResult(Ins, RetCC);
3739
3740 // Copy all of the result registers out of their specified physreg.
3741 for (CCValAssign VA : RVLocs) {
3742 SDValue Val;
3743
3744 if (VA.isRegLoc()) {
3745 Val =
3746 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3747 Chain = Val.getValue(1);
3748 InGlue = Val.getValue(2);
3749 } else if (VA.isMemLoc()) {
3750 report_fatal_error("TODO: return values in memory");
3751 } else
3752 llvm_unreachable("unknown argument location type");
3753
3754 switch (VA.getLocInfo()) {
3755 case CCValAssign::Full:
3756 break;
3757 case CCValAssign::BCvt:
3758 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3759 break;
3760 case CCValAssign::ZExt:
3761 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3762 DAG.getValueType(VA.getValVT()));
3763 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3764 break;
3765 case CCValAssign::SExt:
3766 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3767 DAG.getValueType(VA.getValVT()));
3768 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3769 break;
3770 case CCValAssign::AExt:
3771 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3772 break;
3773 default:
3774 llvm_unreachable("Unknown loc info!");
3775 }
3776
3777 InVals.push_back(Val);
3778 }
3779
3780 return Chain;
3781}
3782
3783 // Add code to pass the special inputs required by the used features, separate
3784 // from the explicit user arguments present in the IR.
3786 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3787 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3788 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3789 // If we don't have a call site, this was a call inserted by
3790 // legalization. These can never use special inputs.
3791 if (!CLI.CB)
3792 return;
3793
3794 SelectionDAG &DAG = CLI.DAG;
3795 const SDLoc &DL = CLI.DL;
3796 const Function &F = DAG.getMachineFunction().getFunction();
3797
3798 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3799 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3800
3801 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3803 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3804 if (DAG.getPass()) {
3805 auto &ArgUsageInfo =
3807 CalleeArgInfo =
3808 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3809 } else if (auto *MFAM = DAG.getMFAM()) {
3811 auto *ArgUsageInfo =
3813 DAG.getMachineFunction())
3814 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3815 if (ArgUsageInfo)
3816 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3817 }
3818 }
3819
3820 // TODO: Unify with private memory register handling. This is complicated by
3821 // the fact that at least in kernels, the input argument is not necessarily
3822 // in the same location as the input.
3823 // clang-format off
3824 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3825 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3826 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3827 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3828 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3829 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3830 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3831 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3832 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3833 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3834 };
3835 // clang-format on
3836
3837 for (auto [InputID, Attrs] : ImplicitAttrs) {
3838 // If the callee does not use the attribute value, skip copying the value.
3839 if (all_of(Attrs, [&](StringRef Attr) {
3840 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3841 }))
3842 continue;
3843
3844 const auto [OutgoingArg, ArgRC, ArgTy] =
3845 CalleeArgInfo->getPreloadedValue(InputID);
3846 if (!OutgoingArg)
3847 continue;
3848
3849 const auto [IncomingArg, IncomingArgRC, Ty] =
3850 CallerArgInfo.getPreloadedValue(InputID);
3851 assert(IncomingArgRC == ArgRC);
3852
3853 // All special arguments are ints for now.
3854 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3855 SDValue InputReg;
3856
3857 if (IncomingArg) {
3858 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3859 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3860 // The implicit arg ptr is special because it doesn't have a corresponding
3861 // input for kernels, and is computed from the kernarg segment pointer.
3862 InputReg = getImplicitArgPtr(DAG, DL);
3863 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3864 std::optional<uint32_t> Id =
3866 if (Id.has_value()) {
3867 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3868 } else {
3869 InputReg = DAG.getPOISON(ArgVT);
3870 }
3871 } else {
3872 // We may have proven the input wasn't needed, although the ABI still
3873 // requires it. We just need to allocate the register appropriately.
3874 InputReg = DAG.getPOISON(ArgVT);
3875 }
3876
3877 if (OutgoingArg->isRegister()) {
3878 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3879 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3880 report_fatal_error("failed to allocate implicit input argument");
3881 } else {
3882 unsigned SpecialArgOffset =
3883 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3884 SDValue ArgStore =
3885 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3886 MemOpChains.push_back(ArgStore);
3887 }
3888 }
3889
3890 // Pack workitem IDs into a single register, or pass them as-is if already
3891 // packed.
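// The packed layout used below is X in bits [9:0], Y in bits [19:10] and Z in
// bits [29:20], which is why Y and Z are shifted by 10 and 20 before being
// OR'd in.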
3892
3893 auto [OutgoingArg, ArgRC, Ty] =
3895 if (!OutgoingArg)
3896 std::tie(OutgoingArg, ArgRC, Ty) =
3898 if (!OutgoingArg)
3899 std::tie(OutgoingArg, ArgRC, Ty) =
3901 if (!OutgoingArg)
3902 return;
3903
3904 const ArgDescriptor *IncomingArgX = std::get<0>(
3906 const ArgDescriptor *IncomingArgY = std::get<0>(
3908 const ArgDescriptor *IncomingArgZ = std::get<0>(
3910
3911 SDValue InputReg;
3912 SDLoc SL;
3913
3914 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3915 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3916 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3917
3918 // If the incoming IDs are not packed, we need to pack them.
3919 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3920 NeedWorkItemIDX) {
3921 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3922 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3923 } else {
3924 InputReg = DAG.getConstant(0, DL, MVT::i32);
3925 }
3926 }
3927
3928 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3929 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3930 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3931 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3932 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3933 InputReg = InputReg.getNode()
3934 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3935 : Y;
3936 }
3937
3938 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3939 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3940 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3941 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3942 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3943 InputReg = InputReg.getNode()
3944 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3945 : Z;
3946 }
3947
3948 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3949 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3950 // We're in a situation where the outgoing function requires the workitem
3951 // ID, but the calling function does not have it (e.g. a graphics function
3952 // calling a C calling convention function). This is illegal, but we need
3953 // to produce something.
3954 InputReg = DAG.getPOISON(MVT::i32);
3955 } else {
3956 // Workitem IDs are already packed; any of the present incoming arguments
3957 // will carry all required fields.
3958 ArgDescriptor IncomingArg =
3959 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3960 : IncomingArgY ? *IncomingArgY
3961 : *IncomingArgZ,
3962 ~0u);
3963 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3964 }
3965 }
3966
3967 if (OutgoingArg->isRegister()) {
3968 if (InputReg)
3969 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3970
3971 CCInfo.AllocateReg(OutgoingArg->getRegister());
3972 } else {
3973 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3974 if (InputReg) {
3975 SDValue ArgStore =
3976 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3977 MemOpChains.push_back(ArgStore);
3978 }
3979 }
3980}
3981
3983 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3985 const SmallVectorImpl<SDValue> &OutVals,
3986 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3987 if (AMDGPU::isChainCC(CalleeCC))
3988 return true;
3989
3990 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3991 return false;
3992
3993 // For a divergent call target, we need to do a waterfall loop over the
3994 // possible callees, which precludes us from using a simple jump.
3995 if (Callee->isDivergent())
3996 return false;
3997
3999 const Function &CallerF = MF.getFunction();
4000 CallingConv::ID CallerCC = CallerF.getCallingConv();
4002 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4003
4004 // Kernels aren't callable, and don't have a live-in return address, so it
4005 // doesn't make sense to do a tail call with entry functions.
4006 if (!CallerPreserved)
4007 return false;
4008
4009 bool CCMatch = CallerCC == CalleeCC;
4010
4012 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4013 return true;
4014 return false;
4015 }
4016
4017 // TODO: Can we handle var args?
4018 if (IsVarArg)
4019 return false;
4020
4021 for (const Argument &Arg : CallerF.args()) {
4022 if (Arg.hasByValAttr())
4023 return false;
4024 }
4025
4026 LLVMContext &Ctx = *DAG.getContext();
4027
4028 // Check that the call results are passed in the same way.
4029 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4030 CCAssignFnForCall(CalleeCC, IsVarArg),
4031 CCAssignFnForCall(CallerCC, IsVarArg)))
4032 return false;
4033
4034 // The callee has to preserve all registers the caller needs to preserve.
4035 if (!CCMatch) {
4036 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4037 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4038 return false;
4039 }
4040
4041 // Nothing more to check if the callee is taking no arguments.
4042 if (Outs.empty())
4043 return true;
4044
4046 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4047
4048 // FIXME: We are not allocating special input registers, so we will be
4049 // deciding based on incorrect register assignments.
4050 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4051
4052 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4053 // If the stack arguments for this call do not fit into our own save area then
4054 // the call cannot be made tail.
4055 // TODO: Is this really necessary?
4056 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4057 return false;
4058
4059 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4060 // FIXME: What about inreg arguments that end up passed in memory?
4061 if (!CCVA.isRegLoc())
4062 continue;
4063
4064 // If we are passing an argument in an SGPR, and the value is divergent,
4065 // this call requires a waterfall loop.
4066 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4067 LLVM_DEBUG(
4068 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4069 << printReg(CCVA.getLocReg(), TRI) << '\n');
4070 return false;
4071 }
4072 }
4073
4074 const MachineRegisterInfo &MRI = MF.getRegInfo();
4075 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4076}
4077
4079 if (!CI->isTailCall())
4080 return false;
4081
4082 const Function *ParentFn = CI->getFunction();
4084 return false;
4085 return true;
4086}
4087
4088namespace {
4089// Chain calls have special arguments that we need to handle. These are
4090// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4091// arguments (index 0 and 1 respectively).
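// In CLI.Args the SGPR and VGPR argument aggregates occupy indices 0 and 1,
// so index 2 is the EXEC mask, index 3 the flags word, and (when flag bit 0
// requests dynamic VGPRs) indices 4-6 carry NumVGPRs, FallbackExec and the
// FallbackCallee.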
4092enum ChainCallArgIdx {
4093 Exec = 2,
4094 Flags,
4095 NumVGPRs,
4096 FallbackExec,
4097 FallbackCallee
4098};
4099} // anonymous namespace
4100
4101// The wave scratch offset register is used as the global base pointer.
4103 SmallVectorImpl<SDValue> &InVals) const {
4104 CallingConv::ID CallConv = CLI.CallConv;
4105 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4106
4107 SelectionDAG &DAG = CLI.DAG;
4108
4109 const SDLoc &DL = CLI.DL;
4110 SDValue Chain = CLI.Chain;
4111 SDValue Callee = CLI.Callee;
4112
4113 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4114 bool UsesDynamicVGPRs = false;
4115 if (IsChainCallConv) {
4116 // The last arguments should be the value that we need to put in EXEC,
4117 // followed by the flags and any other arguments with special meanings.
4118 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4119 // we don't treat them like the "real" arguments.
4120 auto RequestedExecIt =
4121 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4122 return Arg.OrigArgIndex == 2;
4123 });
4124 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4125
4126 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4127 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4128 CLI.OutVals.end());
4129 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4130
4131 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4132 "Haven't popped all the special args");
4133
4134 TargetLowering::ArgListEntry RequestedExecArg =
4135 CLI.Args[ChainCallArgIdx::Exec];
4136 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4137 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4138
4139 // Convert constants into TargetConstants, so they become immediate operands
4140 // instead of being selected into S_MOV.
4141 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4142 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4143 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4144 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4145 } else
4146 ChainCallSpecialArgs.push_back(Arg.Node);
4147 };
4148
4149 PushNodeOrTargetConstant(RequestedExecArg);
4150
4151 // Process any other special arguments depending on the value of the flags.
4152 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4153
4154 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4155 if (FlagsValue.isZero()) {
4156 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4157 return lowerUnhandledCall(CLI, InVals,
4158 "no additional args allowed if flags == 0");
4159 } else if (FlagsValue.isOneBitSet(0)) {
4160 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4161 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4162 }
4163
4164 if (!Subtarget->isWave32()) {
4165 return lowerUnhandledCall(
4166 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4167 }
4168
4169 UsesDynamicVGPRs = true;
4170 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4171 CLI.Args.end(), PushNodeOrTargetConstant);
4172 }
4173 }
4174
4176 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4178 bool &IsTailCall = CLI.IsTailCall;
4179 bool IsVarArg = CLI.IsVarArg;
4180 bool IsSibCall = false;
4182
4183 if (Callee.isUndef() || isNullConstant(Callee)) {
4184 if (!CLI.IsTailCall) {
4185 for (ISD::InputArg &Arg : CLI.Ins)
4186 InVals.push_back(DAG.getPOISON(Arg.VT));
4187 }
4188
4189 return Chain;
4190 }
4191
4192 if (IsVarArg) {
4193 return lowerUnhandledCall(CLI, InVals,
4194 "unsupported call to variadic function ");
4195 }
4196
4197 if (!CLI.CB)
4198 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4199
4200 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4201 return lowerUnhandledCall(CLI, InVals,
4202 "unsupported required tail call to function ");
4203 }
4204
4205 if (IsTailCall) {
4206 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4207 Outs, OutVals, Ins, DAG);
4208 if (!IsTailCall &&
4209 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4210 report_fatal_error("failed to perform tail call elimination on a call "
4211 "site marked musttail or on llvm.amdgcn.cs.chain");
4212 }
4213
4214 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4215
4216 // A sibling call is one where we're under the usual C ABI and not planning
4217 // to change that but can still do a tail call:
4218 if (!TailCallOpt && IsTailCall)
4219 IsSibCall = true;
4220
4221 if (IsTailCall)
4222 ++NumTailCalls;
4223 }
4224
4227 SmallVector<SDValue, 8> MemOpChains;
4228
4229 // Analyze operands of the call, assigning locations to each operand.
4231 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4232 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4233
4234 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4236 // With a fixed ABI, allocate fixed registers before user arguments.
4237 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4238 }
4239
4240 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4241
4242 // Get a count of how many bytes are to be pushed on the stack.
4243 unsigned NumBytes = CCInfo.getStackSize();
4244
4245 if (IsSibCall) {
4246 // Since we're not changing the ABI to make this a tail call, the memory
4247 // operands are already available in the caller's incoming argument space.
4248 NumBytes = 0;
4249 }
4250
4251 // FPDiff is the byte offset of the call's argument area from the callee's.
4252 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4253 // by this amount for a tail call. In a sibling call it must be 0 because the
4254 // caller will deallocate the entire stack and the callee still expects its
4255 // arguments to begin at SP+0. Completely unused for non-tail calls.
4256 int32_t FPDiff = 0;
4257 MachineFrameInfo &MFI = MF.getFrameInfo();
4258 auto *TRI = Subtarget->getRegisterInfo();
4259
4260 // Adjust the stack pointer for the new arguments...
4261 // These operations are automatically eliminated by the prolog/epilog pass
4262 if (!IsSibCall)
4263 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4264
4265 if (!IsSibCall || IsChainCallConv) {
4266 if (!Subtarget->enableFlatScratch()) {
4267 SmallVector<SDValue, 4> CopyFromChains;
4268
4269 // In the HSA case, this should be an identity copy.
4270 SDValue ScratchRSrcReg =
4271 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4272 RegsToPass.emplace_back(IsChainCallConv
4273 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4274 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4275 ScratchRSrcReg);
4276 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4277 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4278 }
4279 }
4280
4281 const unsigned NumSpecialInputs = RegsToPass.size();
4282
4283 MVT PtrVT = MVT::i32;
4284
4285 // Walk the register/memloc assignments, inserting copies/loads.
4286 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4287 CCValAssign &VA = ArgLocs[i];
4288 SDValue Arg = OutVals[i];
4289
4290 // Promote the value if needed.
4291 switch (VA.getLocInfo()) {
4292 case CCValAssign::Full:
4293 break;
4294 case CCValAssign::BCvt:
4295 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4296 break;
4297 case CCValAssign::ZExt:
4298 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4299 break;
4300 case CCValAssign::SExt:
4301 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4302 break;
4303 case CCValAssign::AExt:
4304 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4305 break;
4306 case CCValAssign::FPExt:
4307 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4308 break;
4309 default:
4310 llvm_unreachable("Unknown loc info!");
4311 }
4312
4313 if (VA.isRegLoc()) {
4314 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4315 } else {
4316 assert(VA.isMemLoc());
4317
4318 SDValue DstAddr;
4319 MachinePointerInfo DstInfo;
4320
4321 unsigned LocMemOffset = VA.getLocMemOffset();
4322 int32_t Offset = LocMemOffset;
4323
4324 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4325 MaybeAlign Alignment;
4326
4327 if (IsTailCall) {
4328 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4329 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4330 : VA.getValVT().getStoreSize();
4331
4332 // FIXME: We can have better than the minimum byval required alignment.
4333 Alignment =
4334 Flags.isByVal()
4335 ? Flags.getNonZeroByValAlign()
4336 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4337
4338 Offset = Offset + FPDiff;
4339 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4340
4341 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4342 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4343
4344 // Make sure any stack arguments overlapping with where we're storing
4345 // are loaded before this eventual operation. Otherwise they'll be
4346 // clobbered.
4347
4348 // FIXME: Why is this really necessary? This seems to just result in a
4349 // lot of code to copy the stack arguments and write them back to the
4350 // same locations, which are supposed to be immutable?
4351 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4352 } else {
4353 // Stores to the argument stack area are relative to the stack pointer.
4354 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4355 MVT::i32);
4356 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4357 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4358 Alignment =
4359 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4360 }
4361
4362 if (Outs[i].Flags.isByVal()) {
4363 SDValue SizeNode =
4364 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4365 SDValue Cpy =
4366 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4367 Outs[i].Flags.getNonZeroByValAlign(),
4368 /*isVol = */ false, /*AlwaysInline = */ true,
4369 /*CI=*/nullptr, std::nullopt, DstInfo,
4371
4372 MemOpChains.push_back(Cpy);
4373 } else {
4374 SDValue Store =
4375 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4376 MemOpChains.push_back(Store);
4377 }
4378 }
4379 }
4380
4381 if (!MemOpChains.empty())
4382 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4383
4384 SDValue ReadFirstLaneID =
4385 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4386
4387 SDValue TokenGlue;
4388 if (CLI.ConvergenceControlToken) {
4389 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4391 }
4392
4393 // Build a sequence of copy-to-reg nodes chained together with token chain
4394 // and flag operands which copy the outgoing args into the appropriate regs.
4395 SDValue InGlue;
4396
4397 unsigned ArgIdx = 0;
4398 for (auto [Reg, Val] : RegsToPass) {
4399 if (ArgIdx++ >= NumSpecialInputs &&
4400 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4401 // For chain calls, the inreg arguments are required to be
4402 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4403 // they are uniform.
4404 //
4405 // For other calls, if an inreg argument is known to be uniform,
4406 // speculatively insert a readfirstlane in case it is in a VGPR.
4407 //
4408 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4409 // value, so let that continue to produce invalid code.
4410
4411 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4412 if (TokenGlue)
4413 ReadfirstlaneArgs.push_back(TokenGlue);
4415 ReadfirstlaneArgs);
4416 }
4417
4418 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4419 InGlue = Chain.getValue(1);
4420 }
4421
4422 // We don't usually want to end the call-sequence here because we would tidy
4423 // the frame up *after* the call. However, in the ABI-changing tail-call case
4424 // we've carefully laid out the parameters so that when sp is reset they'll be
4425 // in the correct location.
4426 if (IsTailCall && !IsSibCall) {
4427 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4428 InGlue = Chain.getValue(1);
4429 }
4430
4431 std::vector<SDValue> Ops({Chain});
4432
4433 // Add a redundant copy of the callee global which will not be legalized, as
4434 // we need direct access to the callee later.
4436 const GlobalValue *GV = GSD->getGlobal();
4437 Ops.push_back(Callee);
4438 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4439 } else {
4440 if (IsTailCall) {
4441 // isEligibleForTailCallOptimization considered whether the call target is
4442 // divergent, but we may still end up with a uniform value in a VGPR.
4443 // Insert a readfirstlane just in case.
4444 SDValue ReadFirstLaneID =
4445 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4446
4447 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4448 if (TokenGlue)
4449 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4450 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4451 ReadfirstlaneArgs);
4452 }
4453
4454 Ops.push_back(Callee);
4455 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4456 }
4457
4458 if (IsTailCall) {
4459 // Each tail call may have to adjust the stack by a different amount, so
4460 // this information must travel along with the operation for eventual
4461 // consumption by emitEpilogue.
4462 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4463 }
4464
4465 if (IsChainCallConv)
4466 llvm::append_range(Ops, ChainCallSpecialArgs);
4467
4468 // Add argument registers to the end of the list so that they are known live
4469 // into the call.
4470 for (auto &[Reg, Val] : RegsToPass)
4471 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4472
4473 // Add a register mask operand representing the call-preserved registers.
4474 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4475 assert(Mask && "Missing call preserved mask for calling convention");
4476 Ops.push_back(DAG.getRegisterMask(Mask));
4477
4478 if (SDValue Token = CLI.ConvergenceControlToken) {
4480 GlueOps.push_back(Token);
4481 if (InGlue)
4482 GlueOps.push_back(InGlue);
4483
4484 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4485 MVT::Glue, GlueOps),
4486 0);
4487 }
4488
4489 if (InGlue)
4490 Ops.push_back(InGlue);
4491
4492 // If we're doing a tail call, use a TC_RETURN here rather than an
4493 // actual call instruction.
4494 if (IsTailCall) {
4495 MFI.setHasTailCall();
4496 unsigned OPC = AMDGPUISD::TC_RETURN;
4497 switch (CallConv) {
4499 OPC = AMDGPUISD::TC_RETURN_GFX;
4500 break;
4503 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4504 : AMDGPUISD::TC_RETURN_CHAIN;
4505 break;
4506 }
4507
4508 // If the caller is a whole wave function, we need to use a special opcode
4509 // so we can patch up EXEC.
4510 if (Info->isWholeWaveFunction())
4511 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4512
4513 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4514 }
4515
4516 // Returns a chain and a flag for retval copy to use.
4517 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4518 Chain = Call.getValue(0);
4519 InGlue = Call.getValue(1);
4520
4521 uint64_t CalleePopBytes = NumBytes;
4522 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4523 if (!Ins.empty())
4524 InGlue = Chain.getValue(1);
4525
4526 // Handle result values, copying them out of physregs into vregs that we
4527 // return.
4528 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4529 InVals, /*IsThisReturn=*/false, SDValue());
4530}
4531
4532// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4533// except for:
4534 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4535 // 2. Scaled size, where scaled size = wave-reduction(alloca-size) * wave-size
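// For example, a 16-byte alloca in a wave64 function advances the wave-wide
// stack pointer by 16 << 6 == 1024 bytes, i.e. one private slice per lane.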
4537 SelectionDAG &DAG) const {
4538 const MachineFunction &MF = DAG.getMachineFunction();
4540
4541 SDLoc dl(Op);
4542 EVT VT = Op.getValueType();
4543 SDValue Chain = Op.getOperand(0);
4544 Register SPReg = Info->getStackPtrOffsetReg();
4545
4546 // Chain the dynamic stack allocation so that it doesn't modify the stack
4547 // pointer when other instructions are using the stack.
4548 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4549
4550 SDValue Size = Op.getOperand(1);
4551 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4552 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4553
4554 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4556 "Stack grows upwards for AMDGPU");
4557
4558 Chain = BaseAddr.getValue(1);
4559 Align StackAlign = TFL->getStackAlign();
4560 if (Alignment > StackAlign) {
4561 uint64_t ScaledAlignment = Alignment.value()
4562 << Subtarget->getWavefrontSizeLog2();
4563 uint64_t StackAlignMask = ScaledAlignment - 1;
4564 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4565 DAG.getConstant(StackAlignMask, dl, VT));
4566 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4567 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4568 }
4569
4570 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4571 SDValue NewSP;
4574 // For a constant-sized alloca, scale the alloca size by the wave size.
4574 SDValue ScaledSize = DAG.getNode(
4575 ISD::SHL, dl, VT, Size,
4576 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4577 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4578 } else {
4579 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4580 // max of the (divergent) alloca size and then scale it by the wave size.
4581 SDValue WaveReduction =
4582 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4583 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4584 Size, DAG.getConstant(0, dl, MVT::i32));
4585 SDValue ScaledSize = DAG.getNode(
4586 ISD::SHL, dl, VT, Size,
4587 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4588 NewSP =
4589 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4590 SDValue ReadFirstLaneID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4592 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4593 NewSP);
4594 }
4595
4596 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4597 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4598
4599 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4600}
4601
4603 if (Op.getValueType() != MVT::i32)
4604 return Op; // Defer to cannot select error.
4605
4607 SDLoc SL(Op);
4608
4609 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4610
4611 // Convert from wave uniform to swizzled vector address. This should protect
4612 // from any edge cases where the stacksave result isn't directly used with
4613 // stackrestore.
4614 SDValue VectorAddress =
4615 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4616 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4617}
4618
4620 SelectionDAG &DAG) const {
4621 SDLoc SL(Op);
4622 assert(Op.getValueType() == MVT::i32);
4623
4624 uint32_t BothRoundHwReg =
4626 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4627
4628 SDValue IntrinID =
4629 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4630 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4631 Op.getOperand(0), IntrinID, GetRoundBothImm);
4632
4633 // There are two rounding modes, one for f32 and one for f64/f16. We only
4634 // report in the standard value range if both are the same.
4635 //
4636 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4637 // ties away from zero is not supported, and the other values are rotated by
4638 // 1.
4639 //
4640 // If the two rounding modes are not the same, report a target defined value.
4641
4642 // Mode register rounding mode fields:
4643 //
4644 // [1:0] Single-precision round mode.
4645 // [3:2] Double/Half-precision round mode.
4646 //
4647 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4648 //
4649 //                  Hardware   Spec
4650 //  Toward-0           3         0
4651 //  Nearest Even       0         1
4652 //  +Inf               1         2
4653 //  -Inf               2         3
4654 //  NearestAway0      N/A        4
4655 //
4656 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4657 // table we can index by the raw hardware mode.
4658 //
4659 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
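// For example, when MODE.fp_round is 0 (both fields round-to-nearest-even),
// the shift amount below is 0 and the extracted low nibble should be 1, the
// FLT_ROUNDS value for round-to-nearest.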
4660
4661 SDValue BitTable =
4663
4664 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4665 SDValue RoundModeTimesNumBits =
4666 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4667
4668 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4669 // knew only one mode was demanded.
4670 SDValue TableValue =
4671 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4672 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4673
4674 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4675 SDValue TableEntry =
4676 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4677
4678 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4679 // if it's an extended value.
4680 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4681 SDValue IsStandardValue =
4682 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4683 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4684 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4685 TableEntry, EnumOffset);
4686
4687 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4688}
4689
4691 SelectionDAG &DAG) const {
4692 SDLoc SL(Op);
4693
4694 SDValue NewMode = Op.getOperand(1);
4695 assert(NewMode.getValueType() == MVT::i32);
4696
4697 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4698 // hardware MODE.fp_round values.
4699 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4700 uint32_t ClampedVal = std::min(
4701 static_cast<uint32_t>(ConstMode->getZExtValue()),
4703 NewMode = DAG.getConstant(
4704 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4705 } else {
4706 // If we know the input can only be one of the supported standard modes in
4707 // the range 0-3, we can use a simplified mapping to hardware values.
4708 KnownBits KB = DAG.computeKnownBits(NewMode);
4709 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4710 // The supported standard values are 0-3. The extended values start at 8. We
4711 // need to offset by 4 if the value is in the extended range.
4712
4713 if (UseReducedTable) {
4714 // Truncate to the low 32-bits.
4715 SDValue BitTable = DAG.getConstant(
4716 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4717
4718 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4719 SDValue RoundModeTimesNumBits =
4720 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4721
4722 NewMode =
4723 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4724
4725 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4726 // the table extracted bits into inline immediates.
4727 } else {
4728 // table_index = umin(value, value - 4)
4729 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
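// For example, FLT_ROUNDS value 0 (toward zero) gives
// table_index = umin(0, 0 - 4) = 0 in unsigned arithmetic, and the extracted
// low nibble should be 0xf: hardware mode 3 (toward zero) in both 2-bit fields.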
4730 SDValue BitTable =
4732
4733 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4734 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4735 SDValue IndexVal =
4736 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4737
4738 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4739 SDValue RoundModeTimesNumBits =
4740 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4741
4742 SDValue TableValue =
4743 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4744 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4745
4746 // No need to mask out the high bits since the setreg will ignore them
4747 // anyway.
4748 NewMode = TruncTable;
4749 }
4750
4751 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4752 // earlier and keep more operations scalar, but that interferes with
4753 // combining the source.
4754 SDValue ReadFirstLaneID =
4755 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4756 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4757 ReadFirstLaneID, NewMode);
4758 }
4759
4760 // N.B. The setreg will be later folded into s_round_mode on supported
4761 // targets.
4762 SDValue IntrinID =
4763 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4764 uint32_t BothRoundHwReg =
4766 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4767
4768 SDValue SetReg =
4769 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4770 IntrinID, RoundBothImm, NewMode);
4771
4772 return SetReg;
4773}
4774
4776 if (Op->isDivergent() &&
4777 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4778 // Cannot do I$ prefetch with divergent pointer.
4779 return SDValue();
4780
4781 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4785 break;
4787 if (Subtarget->hasSafeSmemPrefetch())
4788 break;
4789 [[fallthrough]];
4790 default:
4791 return SDValue();
4792 }
4793
4794 // I$ prefetch
4795 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4796 return SDValue();
4797
4798 return Op;
4799}
4800
4801 // Work around DAG legality rules that are based only on the result type.
4803 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4804 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4805 EVT SrcVT = Src.getValueType();
4806
4807 if (SrcVT.getScalarType() != MVT::bf16)
4808 return Op;
4809
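// bf16 sources are handled by bitcasting to the equivalent integer type and
// extending through ISD::BF16_TO_FP, since there is no strict variant yet.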
4810 SDLoc SL(Op);
4811 SDValue BitCast =
4812 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4813
4814 EVT DstVT = Op.getValueType();
4815 if (IsStrict)
4816 llvm_unreachable("Need STRICT_BF16_TO_FP");
4817
4818 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4819}
4820
4822 SDLoc SL(Op);
4823 if (Op.getValueType() != MVT::i64)
4824 return Op;
4825
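// The FP environment is modelled as an i64 with the MODE register in the low
// 32 bits and TRAPSTS in the high 32 bits; the two hwreg reads below are
// packed into a v2i32 and bitcast to produce that value.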
4826 uint32_t ModeHwReg =
4828 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4829 uint32_t TrapHwReg =
4831 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4832
4833 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4834 SDValue IntrinID =
4835 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4836 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4837 Op.getOperand(0), IntrinID, ModeHwRegImm);
4838 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4839 Op.getOperand(0), IntrinID, TrapHwRegImm);
4840 SDValue TokenReg =
4841 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4842 GetTrapReg.getValue(1));
4843
4844 SDValue CvtPtr =
4845 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4846 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4847
4848 return DAG.getMergeValues({Result, TokenReg}, SL);
4849}
4850
4852 SDLoc SL(Op);
4853 if (Op.getOperand(1).getValueType() != MVT::i64)
4854 return Op;
4855
4856 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4857 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4858 DAG.getConstant(0, SL, MVT::i32));
4859 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4860 DAG.getConstant(1, SL, MVT::i32));
4861
4862 SDValue ReadFirstLaneID =
4863 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4864 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4865 ReadFirstLaneID, NewModeReg);
4866 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4867 ReadFirstLaneID, NewTrapReg);
4868
4869 unsigned ModeHwReg =
4871 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4872 unsigned TrapHwReg =
4874 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4875
4876 SDValue IntrinID =
4877 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4878 SDValue SetModeReg =
4879 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4880 IntrinID, ModeHwRegImm, NewModeReg);
4881 SDValue SetTrapReg =
4882 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4883 IntrinID, TrapHwRegImm, NewTrapReg);
4884 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4885}
4886
4888 const MachineFunction &MF) const {
4889 const Function &Fn = MF.getFunction();
4890
4892 .Case("m0", AMDGPU::M0)
4893 .Case("exec", AMDGPU::EXEC)
4894 .Case("exec_lo", AMDGPU::EXEC_LO)
4895 .Case("exec_hi", AMDGPU::EXEC_HI)
4896 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4897 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4898 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4899 .Default(Register());
4900 if (!Reg)
4901 return Reg;
4902
4903 if (!Subtarget->hasFlatScrRegister() &&
4904 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4905 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4906 "\" for subtarget."));
4907 }
4908
4909 switch (Reg) {
4910 case AMDGPU::M0:
4911 case AMDGPU::EXEC_LO:
4912 case AMDGPU::EXEC_HI:
4913 case AMDGPU::FLAT_SCR_LO:
4914 case AMDGPU::FLAT_SCR_HI:
4915 if (VT.getSizeInBits() == 32)
4916 return Reg;
4917 break;
4918 case AMDGPU::EXEC:
4919 case AMDGPU::FLAT_SCR:
4920 if (VT.getSizeInBits() == 64)
4921 return Reg;
4922 break;
4923 default:
4924 llvm_unreachable("missing register type checking");
4925 }
4926
4928 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4929}
4930
4931// If kill is not the last instruction, split the block so kill is always a
4932// proper terminator.
4935 MachineBasicBlock *BB) const {
4936 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4938 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4939 return SplitBB;
4940}
4941
4942 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4943// \p MI will be the only instruction in the loop body block. Otherwise, it will
4944// be the first instruction in the remainder block.
4945//
4946/// \returns { LoopBody, Remainder }
4947static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4949 MachineFunction *MF = MBB.getParent();
4951
4952 // To insert the loop we need to split the block. Move everything after this
4953 // point to a new block, and insert a new empty block between the two.
4955 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4957 ++MBBI;
4958
4959 MF->insert(MBBI, LoopBB);
4960 MF->insert(MBBI, RemainderBB);
4961
4962 LoopBB->addSuccessor(LoopBB);
4963 LoopBB->addSuccessor(RemainderBB);
4964
4965 // Move the rest of the block into a new block.
4966 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4967
4968 if (InstInLoop) {
4969 auto Next = std::next(I);
4970
4971 // Move instruction to loop body.
4972 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4973
4974 // Move the rest of the block.
4975 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4976 } else {
4977 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4978 }
4979
4980 MBB.addSuccessor(LoopBB);
4981
4982 return std::pair(LoopBB, RemainderBB);
4983}
4984
4985/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4987 MachineBasicBlock *MBB = MI.getParent();
4989 auto I = MI.getIterator();
4990 auto E = std::next(I);
4991
4992 // clang-format off
4993 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4994 .addImm(0);
4995 // clang-format on
4996
4997 MIBundleBuilder Bundler(*MBB, I, E);
4998 finalizeBundle(*MBB, Bundler.begin());
4999}
5000
5003 MachineBasicBlock *BB) const {
5004 const DebugLoc &DL = MI.getDebugLoc();
5005
5007
5009
5010 // Apparently kill flags are only valid if the def is in the same block?
5011 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5012 Src->setIsKill(false);
5013
5014 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5015
5016 MachineBasicBlock::iterator I = LoopBB->end();
5017
5018 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5020
5021 // Clear TRAP_STS.MEM_VIOL
5022 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5023 .addImm(0)
5024 .addImm(EncodedReg);
5025
5027
5028 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5029
5030 // Load and check TRAP_STS.MEM_VIOL
5031 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5032 .addImm(EncodedReg);
5033
5034 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5035 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5036 .addReg(Reg, RegState::Kill)
5037 .addImm(0);
5038 // clang-format off
5039 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5040 .addMBB(LoopBB);
5041 // clang-format on
5042
5043 return RemainderBB;
5044}
5045
5046// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5047// wavefront. If the value is uniform and just happens to be in a VGPR, this
5048// will only do one iteration. In the worst case, this will loop 64 times.
5049//
5050// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
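// Roughly, each iteration: v_readfirstlane picks one lane's index value,
// v_cmp_eq builds the mask of lanes that hold the same index, the
// and_saveexec restricts EXEC to those lanes, and the terminating xor
// removes them from the pending set before looping while EXEC is non-zero.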
5053 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5054 const DebugLoc &DL, const MachineOperand &Idx,
5055 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5056 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5057 Register &SGPRIdxReg) {
5058
5059 MachineFunction *MF = OrigBB.getParent();
5060 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5061 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5064
5065 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5066 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5067 Register NewExec = MRI.createVirtualRegister(BoolRC);
5068 Register CurrentIdxReg =
5069 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5070 Register CondReg = MRI.createVirtualRegister(BoolRC);
5071
5072 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5073 .addReg(InitReg)
5074 .addMBB(&OrigBB)
5075 .addReg(ResultReg)
5076 .addMBB(&LoopBB);
5077
5078 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5079 .addReg(InitSaveExecReg)
5080 .addMBB(&OrigBB)
5081 .addReg(NewExec)
5082 .addMBB(&LoopBB);
5083
5084 // Read the next variant <- also loop target.
5085 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5086 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5087
5088 // Compare the just read M0 value to all possible Idx values.
5089 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5090 .addReg(CurrentIdxReg)
5091 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5092
5093 // Update EXEC, save the original EXEC value to VCC.
5094 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5095 .addReg(CondReg, RegState::Kill);
5096
5097 MRI.setSimpleHint(NewExec, CondReg);
5098
5099 if (UseGPRIdxMode) {
5100 if (Offset == 0) {
5101 SGPRIdxReg = CurrentIdxReg;
5102 } else {
5103 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5104 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5105 .addReg(CurrentIdxReg, RegState::Kill)
5106 .addImm(Offset);
5107 }
5108 } else {
5109 // Move index from VCC into M0
5110 if (Offset == 0) {
5111 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5112 .addReg(CurrentIdxReg, RegState::Kill);
5113 } else {
5114 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5115 .addReg(CurrentIdxReg, RegState::Kill)
5116 .addImm(Offset);
5117 }
5118 }
5119
5120 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5121 MachineInstr *InsertPt =
5122 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5123 .addReg(LMC.ExecReg)
5124 .addReg(NewExec);
5125
5126 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5127 // s_cbranch_scc0?
5128
5129 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5130 // clang-format off
5131 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5132 .addMBB(&LoopBB);
5133 // clang-format on
5134
5135 return InsertPt->getIterator();
5136}
5137
5138// This has slightly sub-optimal regalloc when the source vector is killed by
5139// the read. The register allocator does not understand that the kill is
5140 // per-workitem, so the source is kept alive for the whole loop and we end up
5141 // not re-using a subregister from it, using 1 more VGPR than necessary. This
5142 // VGPR was saved when this was expanded after register allocation.
5145 unsigned InitResultReg, unsigned PhiReg, int Offset,
5146 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5147 MachineFunction *MF = MBB.getParent();
5148 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5149 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5151 const DebugLoc &DL = MI.getDebugLoc();
5153
5154 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5155 Register DstReg = MI.getOperand(0).getReg();
5156 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5157 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5159
5160 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5161
5162 // Save the EXEC mask
5163 // clang-format off
5164 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5165 .addReg(LMC.ExecReg);
5166 // clang-format on
5167
5168 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5169
5170 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5171
5172 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5173 InitResultReg, DstReg, PhiReg, TmpExec,
5174 Offset, UseGPRIdxMode, SGPRIdxReg);
5175
5176 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5178 ++MBBI;
5179 MF->insert(MBBI, LandingPad);
5180 LoopBB->removeSuccessor(RemainderBB);
5181 LandingPad->addSuccessor(RemainderBB);
5182 LoopBB->addSuccessor(LandingPad);
5183 MachineBasicBlock::iterator First = LandingPad->begin();
5184 // clang-format off
5185 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5186 .addReg(SaveExec);
5187 // clang-format on
5188
5189 return InsPt;
5190}
5191
5192// Returns subreg index, offset
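// E.g. for a 128-bit super-register (four 32-bit channels), an in-bounds
// Offset of 2 yields (sub2, 0), while an out-of-bounds Offset of 7 is
// returned unchanged as (sub0, 7) so it can be folded into the index
// computation instead.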
5193static std::pair<unsigned, int>
5195 const TargetRegisterClass *SuperRC, unsigned VecReg,
5196 int Offset) {
5197 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5198
5199 // Skip out of bounds offsets, or else we would end up using an undefined
5200 // register.
5201 if (Offset >= NumElts || Offset < 0)
5202 return std::pair(AMDGPU::sub0, Offset);
5203
5204 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5205}
5206
5209 int Offset) {
5210 MachineBasicBlock *MBB = MI.getParent();
5211 const DebugLoc &DL = MI.getDebugLoc();
5213
5214 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5215
5216 assert(Idx->getReg() != AMDGPU::NoRegister);
5217
5218 if (Offset == 0) {
5219 // clang-format off
5220 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5221 .add(*Idx);
5222 // clang-format on
5223 } else {
5224 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5225 .add(*Idx)
5226 .addImm(Offset);
5227 }
5228}
5229
5232 int Offset) {
5233 MachineBasicBlock *MBB = MI.getParent();
5234 const DebugLoc &DL = MI.getDebugLoc();
5236
5237 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5238
5239 if (Offset == 0)
5240 return Idx->getReg();
5241
5242 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5243 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5244 .add(*Idx)
5245 .addImm(Offset);
5246 return Tmp;
5247}
5248
5251 const GCNSubtarget &ST) {
5252 const SIInstrInfo *TII = ST.getInstrInfo();
5253 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5254 MachineFunction *MF = MBB.getParent();
5256
5257 Register Dst = MI.getOperand(0).getReg();
5258 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5259 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5260 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5261
5262 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5263 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5264
5265 unsigned SubReg;
5266 std::tie(SubReg, Offset) =
5267 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5268
5269 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5270
5271 // Check for a SGPR index.
5272 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5274 const DebugLoc &DL = MI.getDebugLoc();
5275
5276 if (UseGPRIdxMode) {
5277 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5278 // to avoid interfering with other uses, so probably requires a new
5279 // optimization pass.
5281
5282 const MCInstrDesc &GPRIDXDesc =
5283 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5284 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5285 .addReg(SrcReg)
5286 .addReg(Idx)
5287 .addImm(SubReg);
5288 } else {
5290
5291 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5292 .addReg(SrcReg, 0, SubReg)
5293 .addReg(SrcReg, RegState::Implicit);
5294 }
5295
5296 MI.eraseFromParent();
5297
5298 return &MBB;
5299 }
5300
5301 // Control flow needs to be inserted if indexing with a VGPR.
5302 const DebugLoc &DL = MI.getDebugLoc();
5304
5305 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5306 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5307
5308 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5309
5310 Register SGPRIdxReg;
5311 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5312 UseGPRIdxMode, SGPRIdxReg);
5313
5314 MachineBasicBlock *LoopBB = InsPt->getParent();
5315
5316 if (UseGPRIdxMode) {
5317 const MCInstrDesc &GPRIDXDesc =
5318 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5319
5320 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5321 .addReg(SrcReg)
5322 .addReg(SGPRIdxReg)
5323 .addImm(SubReg);
5324 } else {
5325 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5326 .addReg(SrcReg, 0, SubReg)
5327 .addReg(SrcReg, RegState::Implicit);
5328 }
5329
5330 MI.eraseFromParent();
5331
5332 return LoopBB;
5333}
5334
5337 const GCNSubtarget &ST) {
5338 const SIInstrInfo *TII = ST.getInstrInfo();
5339 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5340 MachineFunction *MF = MBB.getParent();
5342
5343 Register Dst = MI.getOperand(0).getReg();
5344 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5345 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5346 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5347 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5348 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5349 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5350
5351 // This can be an immediate, but will be folded later.
5352 assert(Val->getReg());
5353
5354 unsigned SubReg;
5355 std::tie(SubReg, Offset) =
5356 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5357 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5358
5359 if (Idx->getReg() == AMDGPU::NoRegister) {
5361 const DebugLoc &DL = MI.getDebugLoc();
5362
5363 assert(Offset == 0);
5364
5365 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5366 .add(*SrcVec)
5367 .add(*Val)
5368 .addImm(SubReg);
5369
5370 MI.eraseFromParent();
5371 return &MBB;
5372 }
5373
5374 // Check for a SGPR index.
5375 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5377 const DebugLoc &DL = MI.getDebugLoc();
5378
5379 if (UseGPRIdxMode) {
5381
5382 const MCInstrDesc &GPRIDXDesc =
5383 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5384 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5385 .addReg(SrcVec->getReg())
5386 .add(*Val)
5387 .addReg(Idx)
5388 .addImm(SubReg);
5389 } else {
5391
5392 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5393 TRI.getRegSizeInBits(*VecRC), 32, false);
5394 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5395 .addReg(SrcVec->getReg())
5396 .add(*Val)
5397 .addImm(SubReg);
5398 }
5399 MI.eraseFromParent();
5400 return &MBB;
5401 }
5402
5403 // Control flow needs to be inserted if indexing with a VGPR.
5404 if (Val->isReg())
5405 MRI.clearKillFlags(Val->getReg());
5406
5407 const DebugLoc &DL = MI.getDebugLoc();
5408
5409 Register PhiReg = MRI.createVirtualRegister(VecRC);
5410
5411 Register SGPRIdxReg;
5412 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5413 UseGPRIdxMode, SGPRIdxReg);
5414 MachineBasicBlock *LoopBB = InsPt->getParent();
5415
5416 if (UseGPRIdxMode) {
5417 const MCInstrDesc &GPRIDXDesc =
5418 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5419
5420 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5421 .addReg(PhiReg)
5422 .add(*Val)
5423 .addReg(SGPRIdxReg)
5424 .addImm(SubReg);
5425 } else {
5426 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5427 TRI.getRegSizeInBits(*VecRC), 32, false);
5428 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5429 .addReg(PhiReg)
5430 .add(*Val)
5431 .addImm(SubReg);
5432 }
5433
5434 MI.eraseFromParent();
5435 return LoopBB;
5436}
5437
5439 MachineBasicBlock *BB) {
5440 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5441 // For GFX12, we emit s_add_u64 and s_sub_u64.
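 // Rough shape of the pre-GFX12 expansion, illustrated for the add case:
 //   DestSub0 = s_add_u32  Src0.sub0, Src1.sub0   // SCC = carry out
 //   DestSub1 = s_addc_u32 Src0.sub1, Src1.sub1   // consumes SCC as carry in
 //   Dest     = REG_SEQUENCE DestSub0, sub0, DestSub1, sub1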
5442 MachineFunction *MF = BB->getParent();
5443 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5444 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5446 const DebugLoc &DL = MI.getDebugLoc();
5447 MachineOperand &Dest = MI.getOperand(0);
5448 MachineOperand &Src0 = MI.getOperand(1);
5449 MachineOperand &Src1 = MI.getOperand(2);
5450 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5451 if (ST.hasScalarAddSub64()) {
5452 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5453 // clang-format off
5454 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5455 .add(Src0)
5456 .add(Src1);
5457 // clang-format on
5458 } else {
5459 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5460 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5461
5462 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5463 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5464
5465 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5466 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5467 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5468 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5469
5470 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5471 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5472 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5473 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5474
5475 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5476 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5477 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5478 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5479 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5480 .addReg(DestSub0)
5481 .addImm(AMDGPU::sub0)
5482 .addReg(DestSub1)
5483 .addImm(AMDGPU::sub1);
5484 }
5485 MI.eraseFromParent();
5486 return BB;
5487}
5488
5490 switch (Opc) {
5491 case AMDGPU::S_MIN_U32:
5492 return std::numeric_limits<uint32_t>::max();
5493 case AMDGPU::S_MIN_I32:
5494 return std::numeric_limits<int32_t>::max();
5495 case AMDGPU::S_MAX_U32:
5496 return std::numeric_limits<uint32_t>::min();
5497 case AMDGPU::S_MAX_I32:
5498 return std::numeric_limits<int32_t>::min();
5499 case AMDGPU::V_ADD_F32_e64: // -0.0
5500 return 0x80000000;
5501 case AMDGPU::V_SUB_F32_e64: // +0.0
5502 return 0x0;
5503 case AMDGPU::S_ADD_I32:
5504 case AMDGPU::S_SUB_I32:
5505 case AMDGPU::S_OR_B32:
5506 case AMDGPU::S_XOR_B32:
5507 return std::numeric_limits<uint32_t>::min();
5508 case AMDGPU::S_AND_B32:
5509 return std::numeric_limits<uint32_t>::max();
5510 case AMDGPU::V_MIN_F32_e64:
5511 case AMDGPU::V_MAX_F32_e64:
5512 return 0x7fc00000; // qNAN
5513 default:
5515 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5516 }
5517}
5518
5520 switch (Opc) {
5521 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5522 return std::numeric_limits<uint64_t>::max();
5523 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5524 return std::numeric_limits<int64_t>::max();
5525 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5526 return std::numeric_limits<uint64_t>::min();
5527 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5528 return std::numeric_limits<int64_t>::min();
5529 case AMDGPU::S_ADD_U64_PSEUDO:
5530 case AMDGPU::S_SUB_U64_PSEUDO:
5531 case AMDGPU::S_OR_B64:
5532 case AMDGPU::S_XOR_B64:
5533 return std::numeric_limits<uint64_t>::min();
5534 case AMDGPU::S_AND_B64:
5535 return std::numeric_limits<uint64_t>::max();
5536 default:
5538 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5539 }
5540}
5541
5542static bool is32bitWaveReduceOperation(unsigned Opc) {
5543 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5544 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5545 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5546 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5547 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5548 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5549 Opc == AMDGPU::V_SUB_F32_e64;
5550}
5551
5553 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5554 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5555}
5556
5559 const GCNSubtarget &ST,
5560 unsigned Opc) {
5562 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5563 const DebugLoc &DL = MI.getDebugLoc();
5564 const SIInstrInfo *TII = ST.getInstrInfo();
5565
5566 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5567 Register SrcReg = MI.getOperand(1).getReg();
5568 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5569 Register DstReg = MI.getOperand(0).getReg();
5570 MachineBasicBlock *RetBB = nullptr;
5571 if (isSGPR) {
5572 switch (Opc) {
5573 case AMDGPU::S_MIN_U32:
5574 case AMDGPU::S_MIN_I32:
5575 case AMDGPU::V_MIN_F32_e64:
5576 case AMDGPU::S_MAX_U32:
5577 case AMDGPU::S_MAX_I32:
5578 case AMDGPU::V_MAX_F32_e64:
5579 case AMDGPU::S_AND_B32:
5580 case AMDGPU::S_OR_B32: {
5581 // Idempotent operations.
5582 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5583 RetBB = &BB;
5584 break;
5585 }
5586 case AMDGPU::V_CMP_LT_U64_e64: // umin
5587 case AMDGPU::V_CMP_LT_I64_e64: // min
5588 case AMDGPU::V_CMP_GT_U64_e64: // umax
5589 case AMDGPU::V_CMP_GT_I64_e64: // max
5590 case AMDGPU::S_AND_B64:
5591 case AMDGPU::S_OR_B64: {
5592 // Idempotent operations.
5593 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5594 RetBB = &BB;
5595 break;
5596 }
5597 case AMDGPU::S_XOR_B32:
5598 case AMDGPU::S_XOR_B64:
5599 case AMDGPU::S_ADD_I32:
5600 case AMDGPU::S_ADD_U64_PSEUDO:
5601 case AMDGPU::V_ADD_F32_e64:
5602 case AMDGPU::S_SUB_I32:
5603 case AMDGPU::S_SUB_U64_PSEUDO:
5604 case AMDGPU::V_SUB_F32_e64: {
5605 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5606 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5607 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5608 Register NumActiveLanes =
5609 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5610
5611 bool IsWave32 = ST.isWave32();
5612 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5613 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5614 unsigned BitCountOpc =
5615 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5616
5617 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5618
5619 auto NewAccumulator =
5620 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5621 .addReg(ExecMask);
5622
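 // With a uniform (SGPR) input the reduction collapses to scalar arithmetic
 // on the active-lane count, e.g. an add reduction of a value V over N active
 // lanes is simply V * N.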
5623 switch (Opc) {
5624 case AMDGPU::S_XOR_B32:
5625 case AMDGPU::S_XOR_B64: {
5626 // Performing an XOR operation on a uniform value
5627 // depends on the parity of the number of active lanes.
5628 // For even parity the result will be 0; for odd
5629 // parity it will be the same as the input value.
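 // E.g. reducing a uniform value V over 3 active lanes gives V ^ V ^ V = V
 // (odd parity), while 4 active lanes give V ^ V ^ V ^ V = 0 (even parity),
 // hence the multiply by (NumActiveLanes & 1) below.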
5630 Register ParityRegister =
5631 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5632
5633 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5634 .addReg(NewAccumulator->getOperand(0).getReg())
5635 .addImm(1)
5636 .setOperandDead(3); // Dead scc
5637 if (Opc == AMDGPU::S_XOR_B32) {
5638 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5639 .addReg(SrcReg)
5640 .addReg(ParityRegister);
5641 } else {
5642 Register DestSub0 =
5643 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5644 Register DestSub1 =
5645 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646
5647 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5648 const TargetRegisterClass *SrcSubRC =
5649 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5650
5651 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5652 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5653 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5654 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5655
5656 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5657 .add(Op1L)
5658 .addReg(ParityRegister);
5659
5660 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5661 .add(Op1H)
5662 .addReg(ParityRegister);
5663
5664 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5665 .addReg(DestSub0)
5666 .addImm(AMDGPU::sub0)
5667 .addReg(DestSub1)
5668 .addImm(AMDGPU::sub1);
5669 }
5670 break;
5671 }
5672 case AMDGPU::S_SUB_I32: {
5673 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5674
5675 // Take the negation of the source operand.
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5677 .addImm(0)
5678 .addReg(SrcReg);
5679 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5680 .addReg(NegatedVal)
5681 .addReg(NewAccumulator->getOperand(0).getReg());
5682 break;
5683 }
5684 case AMDGPU::S_ADD_I32: {
5685 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5686 .addReg(SrcReg)
5687 .addReg(NewAccumulator->getOperand(0).getReg());
5688 break;
5689 }
5690 case AMDGPU::S_ADD_U64_PSEUDO:
5691 case AMDGPU::S_SUB_U64_PSEUDO: {
5692 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5693 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5694 Register Op1H_Op0L_Reg =
5695 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5696 Register Op1L_Op0H_Reg =
5697 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5698 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5699 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5700 Register NegatedValLo =
5701 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5702 Register NegatedValHi =
5703 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5704
5705 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5706 const TargetRegisterClass *Src1SubRC =
5707 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5708
5709 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5710 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5711 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5712 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5713
5714 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5715 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5716 .addImm(0)
5717 .addReg(NewAccumulator->getOperand(0).getReg())
5718 .setOperandDead(3); // Dead scc
5719 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5720 .addReg(NegatedValLo)
5721 .addImm(31)
5722 .setOperandDead(3); // Dead scc
5723 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5724 .add(Op1L)
5725 .addReg(NegatedValHi);
5726 }
5727 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5728 ? NegatedValLo
5729 : NewAccumulator->getOperand(0).getReg();
5730 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5731 .add(Op1L)
5732 .addReg(LowOpcode);
5733 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5734 .add(Op1L)
5735 .addReg(LowOpcode);
5736 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5737 .add(Op1H)
5738 .addReg(LowOpcode);
5739
5740 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5741 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5742 .addReg(CarryReg)
5743 .addReg(Op1H_Op0L_Reg)
5744 .setOperandDead(3); // Dead scc
5745
5746 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5747 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5748 .addReg(HiVal)
5749 .addReg(Op1L_Op0H_Reg)
5750 .setOperandDead(3); // Dead scc
5751 }
5752 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5753 .addReg(DestSub0)
5754 .addImm(AMDGPU::sub0)
5755 .addReg(DestSub1)
5756 .addImm(AMDGPU::sub1);
5757 break;
5758 }
5759 case AMDGPU::V_ADD_F32_e64:
5760 case AMDGPU::V_SUB_F32_e64: {
5761 Register ActiveLanesVreg =
5762 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5763 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5764 // Get number of active lanes as a float val.
5765 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5766 ActiveLanesVreg)
5767 .addReg(NewAccumulator->getOperand(0).getReg())
5768 .addImm(0) // clamp
5769 .addImm(0); // output-modifier
5770
5771 // Take negation of input for SUB reduction
5772 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5773 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5774 .addImm(srcMod) // src0 modifier
5775 .addReg(SrcReg)
5776 .addImm(0) // src1 modifier
5777 .addReg(ActiveLanesVreg)
5778 .addImm(0) // clamp
5779 .addImm(0); // output-mod
5780 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5781 .addReg(DstVreg);
5782 }
5783 }
5784 RetBB = &BB;
5785 }
5786 }
5787 } else {
5788 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5789 // operand. For now, for all the cases (default, Iterative and DPP) we use the
5790 // iterative approach by default.
5791
5792 // To reduce the VGPR using the iterative approach, we need to iterate
5793 // over all the active lanes. Lowering consists of ComputeLoop,
5794 // which iterates over only the active lanes. We use a copy of the EXEC
5795 // register as the induction variable; each iteration clears the bit of the
5796 // lane just processed using bitset0, yielding the next active lane.
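 // Rough shape of the emitted loop for a simple 32-bit case:
 //   ActiveBits = EXEC; Accum = Identity
 // ComputeLoop:
 //   Lane       = s_ff1_i32  ActiveBits      // lowest remaining active lane
 //   LaneValue  = v_readlane_b32 Src, Lane
 //   Accum      = <Opc> Accum, LaneValue
 //   ActiveBits = s_bitset0 Lane, ActiveBits
 //   s_cmp_lg ActiveBits, 0; s_cbranch_scc1 ComputeLoop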
5798 Register SrcReg = MI.getOperand(1).getReg();
5799 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5801
5802 // Create control flow for the loop:
5803 // split MI's machine basic block into the loop block and the exit block.
5804 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5805
5806 // Create virtual registers required for lowering.
5807 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5808 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5809 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5810 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5811 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5812 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5813 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5814 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5815 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5816
5817 bool IsWave32 = ST.isWave32();
5818 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5819 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5820
5821 // Create initial values of induction variable from Exec, Accumulator and
5822 // insert branch instr to newly created ComputeBlock
5823 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5824 if (is32BitOpc) {
5826 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5827 .addImm(IdentityValue);
5828 } else {
5830 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5831 .addImm(IdentityValue);
5832 }
5833 // clang-format off
5834 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5835 .addMBB(ComputeLoop);
5836 // clang-format on
5837
5838 // Start constructing ComputeLoop
5839 I = ComputeLoop->begin();
5840 auto Accumulator =
5841 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5842 .addReg(IdentityValReg)
5843 .addMBB(&BB);
5844 auto ActiveBits =
5845 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5846 .addReg(LoopIterator)
5847 .addMBB(&BB);
5848
5849 I = ComputeLoop->end();
5850 MachineInstr *NewAccumulator;
5851 // Perform the computations
5852 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5853 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5854 .addReg(ActiveBitsReg);
5855 if (is32BitOpc) {
5856 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5857 LaneValueReg)
5858 .addReg(SrcReg)
5859 .addReg(FF1Reg);
5860 if (isFPOp) {
5861 Register LaneValVreg =
5862 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5863 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5864 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5865 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5866 LaneValVreg)
5867 .addReg(LaneValueReg);
5868 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5869 .addImm(0) // src0 modifier
5870 .addReg(Accumulator->getOperand(0).getReg())
5871 .addImm(0) // src1 modifier
5872 .addReg(LaneValVreg)
5873 .addImm(0) // clamp
5874 .addImm(0); // omod
5875 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5876 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5877 .addReg(DstVreg);
5878 } else {
5879 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5880 .addReg(Accumulator->getOperand(0).getReg())
5881 .addReg(LaneValueReg);
5882 }
5883 } else {
5884 Register LaneValueLoReg =
5885 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5886 Register LaneValueHiReg =
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5888 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5889 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5890 const TargetRegisterClass *SrcSubRC =
5891 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5892 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5893 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5894 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5895 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5896 // The lane value input should be in an SGPR.
5897 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5898 LaneValueLoReg)
5899 .add(Op1L)
5900 .addReg(FF1Reg);
5901 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5902 LaneValueHiReg)
5903 .add(Op1H)
5904 .addReg(FF1Reg);
5905 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5906 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5907 .addReg(LaneValueLoReg)
5908 .addImm(AMDGPU::sub0)
5909 .addReg(LaneValueHiReg)
5910 .addImm(AMDGPU::sub1);
5911 switch (Opc) {
5912 case AMDGPU::S_OR_B64:
5913 case AMDGPU::S_AND_B64:
5914 case AMDGPU::S_XOR_B64: {
5915 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5916 .addReg(Accumulator->getOperand(0).getReg())
5917 .addReg(LaneValue->getOperand(0).getReg())
5918 .setOperandDead(3); // Dead scc
5919 break;
5920 }
5921 case AMDGPU::V_CMP_GT_I64_e64:
5922 case AMDGPU::V_CMP_GT_U64_e64:
5923 case AMDGPU::V_CMP_LT_I64_e64:
5924 case AMDGPU::V_CMP_LT_U64_e64: {
5925 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5926 Register ComparisonResultReg =
5927 MRI.createVirtualRegister(WaveMaskRegClass);
5928 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5929 const TargetRegisterClass *VSubRegClass =
5930 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5931 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5932 MachineOperand SrcReg0Sub0 =
5933 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5934 VregClass, AMDGPU::sub0, VSubRegClass);
5935 MachineOperand SrcReg0Sub1 =
5936 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5937 VregClass, AMDGPU::sub1, VSubRegClass);
5938 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5939 AccumulatorVReg)
5940 .add(SrcReg0Sub0)
5941 .addImm(AMDGPU::sub0)
5942 .add(SrcReg0Sub1)
5943 .addImm(AMDGPU::sub1);
5944 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5945 .addReg(LaneValue->getOperand(0).getReg())
5946 .addReg(AccumulatorVReg);
5947
5948 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5949 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5950 .addReg(LaneMaskReg)
5951 .addReg(ActiveBitsReg);
5952
5953 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5954 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5955 .addReg(LaneValue->getOperand(0).getReg())
5956 .addReg(Accumulator->getOperand(0).getReg());
5957 break;
5958 }
5959 case AMDGPU::S_ADD_U64_PSEUDO:
5960 case AMDGPU::S_SUB_U64_PSEUDO: {
5961 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5962 .addReg(Accumulator->getOperand(0).getReg())
5963 .addReg(LaneValue->getOperand(0).getReg());
5964 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5965 break;
5966 }
5967 }
5968 }
5969 // Manipulate the iterator to get the next active lane
5970 unsigned BITSETOpc =
5971 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5972 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5973 .addReg(FF1Reg)
5974 .addReg(ActiveBitsReg);
5975
5976 // Add phi nodes
5977 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5978 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5979
5980 // Create the branch back to the loop header.
5981 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5982 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5983 .addReg(NewActiveBitsReg)
5984 .addImm(0);
5985 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5986 .addMBB(ComputeLoop);
5987
5988 RetBB = ComputeEnd;
5989 }
5990 MI.eraseFromParent();
5991 return RetBB;
5992}
5993
5996 MachineBasicBlock *BB) const {
5997 MachineFunction *MF = BB->getParent();
5999 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6001 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6003 const DebugLoc &DL = MI.getDebugLoc();
6004
6005 switch (MI.getOpcode()) {
6006 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6007 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6008 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6009 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6010 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6011 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6012 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6013 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6014 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6015 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6016 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6017 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6018 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6019 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6020 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6021 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6022 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6023 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6024 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6025 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6026 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6027 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6028 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6029 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6030 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6031 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6032 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6033 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6034 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6035 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6036 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6037 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6038 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6039 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6040 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6041 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6042 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6043 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6044 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6045 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6046 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6047 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6048 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6049 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6050 case AMDGPU::S_UADDO_PSEUDO:
6051 case AMDGPU::S_USUBO_PSEUDO: {
6052 MachineOperand &Dest0 = MI.getOperand(0);
6053 MachineOperand &Dest1 = MI.getOperand(1);
6054 MachineOperand &Src0 = MI.getOperand(2);
6055 MachineOperand &Src1 = MI.getOperand(3);
6056
6057 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6058 ? AMDGPU::S_ADD_U32
6059 : AMDGPU::S_SUB_U32;
6060 // clang-format off
6061 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6062 .add(Src0)
6063 .add(Src1);
6064 // clang-format on
6065
6066 unsigned SelOpc =
6067 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6068 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6069
6070 MI.eraseFromParent();
6071 return BB;
6072 }
6073 case AMDGPU::S_ADD_U64_PSEUDO:
6074 case AMDGPU::S_SUB_U64_PSEUDO: {
6075 return Expand64BitScalarArithmetic(MI, BB);
6076 }
6077 case AMDGPU::V_ADD_U64_PSEUDO:
6078 case AMDGPU::V_SUB_U64_PSEUDO: {
6079 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6080
6081 MachineOperand &Dest = MI.getOperand(0);
6082 MachineOperand &Src0 = MI.getOperand(1);
6083 MachineOperand &Src1 = MI.getOperand(2);
6084
6085 if (ST.hasAddSubU64Insts()) {
6086 auto I = BuildMI(*BB, MI, DL,
6087 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6088 : AMDGPU::V_SUB_U64_e64),
6089 Dest.getReg())
6090 .add(Src0)
6091 .add(Src1)
6092 .addImm(0); // clamp
6093 TII->legalizeOperands(*I);
6094 MI.eraseFromParent();
6095 return BB;
6096 }
6097
6098 if (IsAdd && ST.hasLshlAddU64Inst()) {
6099 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6100 Dest.getReg())
6101 .add(Src0)
6102 .addImm(0)
6103 .add(Src1);
6104 TII->legalizeOperands(*Add);
6105 MI.eraseFromParent();
6106 return BB;
6107 }
6108
6109 const auto *CarryRC = TRI->getWaveMaskRegClass();
6110
6111 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6112 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6113
6114 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6115 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6116
6117 const TargetRegisterClass *Src0RC = Src0.isReg()
6118 ? MRI.getRegClass(Src0.getReg())
6119 : &AMDGPU::VReg_64RegClass;
6120 const TargetRegisterClass *Src1RC = Src1.isReg()
6121 ? MRI.getRegClass(Src1.getReg())
6122 : &AMDGPU::VReg_64RegClass;
6123
6124 const TargetRegisterClass *Src0SubRC =
6125 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6126 const TargetRegisterClass *Src1SubRC =
6127 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6128
6129 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6130 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6131 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6132 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6133
6134 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6135 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6136 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6137 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6138
6139 unsigned LoOpc =
6140 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6141 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6142 .addReg(CarryReg, RegState::Define)
6143 .add(SrcReg0Sub0)
6144 .add(SrcReg1Sub0)
6145 .addImm(0); // clamp bit
6146
6147 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6148 MachineInstr *HiHalf =
6149 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6150 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6151 .add(SrcReg0Sub1)
6152 .add(SrcReg1Sub1)
6153 .addReg(CarryReg, RegState::Kill)
6154 .addImm(0); // clamp bit
6155
6156 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6157 .addReg(DestSub0)
6158 .addImm(AMDGPU::sub0)
6159 .addReg(DestSub1)
6160 .addImm(AMDGPU::sub1);
6161 TII->legalizeOperands(*LoHalf);
6162 TII->legalizeOperands(*HiHalf);
6163 MI.eraseFromParent();
6164 return BB;
6165 }
6166 case AMDGPU::S_ADD_CO_PSEUDO:
6167 case AMDGPU::S_SUB_CO_PSEUDO: {
6168 // This pseudo can only be selected
6169 // from a uniform add/subcarry node, so all of its VGPR operands
6170 // are assumed to be splat vectors.
6172 MachineOperand &Dest = MI.getOperand(0);
6173 MachineOperand &CarryDest = MI.getOperand(1);
6174 MachineOperand &Src0 = MI.getOperand(2);
6175 MachineOperand &Src1 = MI.getOperand(3);
6176 MachineOperand &Src2 = MI.getOperand(4);
6177 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6178 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6180 .addReg(Src0.getReg());
6181 Src0.setReg(RegOp0);
6182 }
6183 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6184 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6185 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6186 .addReg(Src1.getReg());
6187 Src1.setReg(RegOp1);
6188 }
6189 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6190 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6191 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6192 .addReg(Src2.getReg());
6193 Src2.setReg(RegOp2);
6194 }
6195
6196 if (ST.isWave64()) {
6197 if (ST.hasScalarCompareEq64()) {
6198 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6199 .addReg(Src2.getReg())
6200 .addImm(0);
6201 } else {
6202 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6203 const TargetRegisterClass *SubRC =
6204 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6205 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6206 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6207 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6208 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6209 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6210
6211 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6212 .add(Src2Sub0)
6213 .add(Src2Sub1);
6214
6215 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6216 .addReg(Src2_32, RegState::Kill)
6217 .addImm(0);
6218 }
6219 } else {
6220 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6221 .addReg(Src2.getReg())
6222 .addImm(0);
6223 }
6224
6225 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6226 ? AMDGPU::S_ADDC_U32
6227 : AMDGPU::S_SUBB_U32;
6228
6229 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6230
6231 unsigned SelOpc =
6232 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6233
6234 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6235 .addImm(-1)
6236 .addImm(0);
6237
6238 MI.eraseFromParent();
6239 return BB;
6240 }
6241 case AMDGPU::SI_INIT_M0: {
6242 MachineOperand &M0Init = MI.getOperand(0);
6243 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6244 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6245 AMDGPU::M0)
6246 .add(M0Init);
6247 MI.eraseFromParent();
6248 return BB;
6249 }
6250 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6251 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6252 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6253 TII->get(AMDGPU::S_CMP_EQ_U32))
6254 .addImm(0)
6255 .addImm(0);
6256 return BB;
6257 }
6258 case AMDGPU::GET_GROUPSTATICSIZE: {
6259 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6260 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6261 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6262 .add(MI.getOperand(0))
6263 .addImm(MFI->getLDSSize());
6264 MI.eraseFromParent();
6265 return BB;
6266 }
6267 case AMDGPU::GET_SHADERCYCLESHILO: {
6269 // The algorithm is:
6270 //
6271 // hi1 = getreg(SHADER_CYCLES_HI)
6272 // lo1 = getreg(SHADER_CYCLES_LO)
6273 // hi2 = getreg(SHADER_CYCLES_HI)
6274 //
6275 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6276 // Otherwise there was overflow and the result is hi2:0. In both cases the
6277 // result should represent the actual time at some point during the sequence
6278 // of three getregs.
6279 using namespace AMDGPU::Hwreg;
6280 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6281 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6282 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6283 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6284 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6285 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6286 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6287 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6288 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6289 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6290 .addReg(RegHi1)
6291 .addReg(RegHi2);
6292 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6293 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6294 .addReg(RegLo1)
6295 .addImm(0);
6296 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6297 .add(MI.getOperand(0))
6298 .addReg(RegLo)
6299 .addImm(AMDGPU::sub0)
6300 .addReg(RegHi2)
6301 .addImm(AMDGPU::sub1);
6302 MI.eraseFromParent();
6303 return BB;
6304 }
6305 case AMDGPU::SI_INDIRECT_SRC_V1:
6306 case AMDGPU::SI_INDIRECT_SRC_V2:
6307 case AMDGPU::SI_INDIRECT_SRC_V3:
6308 case AMDGPU::SI_INDIRECT_SRC_V4:
6309 case AMDGPU::SI_INDIRECT_SRC_V5:
6310 case AMDGPU::SI_INDIRECT_SRC_V6:
6311 case AMDGPU::SI_INDIRECT_SRC_V7:
6312 case AMDGPU::SI_INDIRECT_SRC_V8:
6313 case AMDGPU::SI_INDIRECT_SRC_V9:
6314 case AMDGPU::SI_INDIRECT_SRC_V10:
6315 case AMDGPU::SI_INDIRECT_SRC_V11:
6316 case AMDGPU::SI_INDIRECT_SRC_V12:
6317 case AMDGPU::SI_INDIRECT_SRC_V16:
6318 case AMDGPU::SI_INDIRECT_SRC_V32:
6319 return emitIndirectSrc(MI, *BB, *getSubtarget());
6320 case AMDGPU::SI_INDIRECT_DST_V1:
6321 case AMDGPU::SI_INDIRECT_DST_V2:
6322 case AMDGPU::SI_INDIRECT_DST_V3:
6323 case AMDGPU::SI_INDIRECT_DST_V4:
6324 case AMDGPU::SI_INDIRECT_DST_V5:
6325 case AMDGPU::SI_INDIRECT_DST_V6:
6326 case AMDGPU::SI_INDIRECT_DST_V7:
6327 case AMDGPU::SI_INDIRECT_DST_V8:
6328 case AMDGPU::SI_INDIRECT_DST_V9:
6329 case AMDGPU::SI_INDIRECT_DST_V10:
6330 case AMDGPU::SI_INDIRECT_DST_V11:
6331 case AMDGPU::SI_INDIRECT_DST_V12:
6332 case AMDGPU::SI_INDIRECT_DST_V16:
6333 case AMDGPU::SI_INDIRECT_DST_V32:
6334 return emitIndirectDst(MI, *BB, *getSubtarget());
6335 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6336 case AMDGPU::SI_KILL_I1_PSEUDO:
6337 return splitKillBlock(MI, BB);
6338 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6339 Register Dst = MI.getOperand(0).getReg();
6340 const MachineOperand &Src0 = MI.getOperand(1);
6341 const MachineOperand &Src1 = MI.getOperand(2);
6342 Register SrcCond = MI.getOperand(3).getReg();
6343
6344 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6345 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6346 const auto *CondRC = TRI->getWaveMaskRegClass();
6347 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6348
6349 const TargetRegisterClass *Src0RC = Src0.isReg()
6350 ? MRI.getRegClass(Src0.getReg())
6351 : &AMDGPU::VReg_64RegClass;
6352 const TargetRegisterClass *Src1RC = Src1.isReg()
6353 ? MRI.getRegClass(Src1.getReg())
6354 : &AMDGPU::VReg_64RegClass;
6355
6356 const TargetRegisterClass *Src0SubRC =
6357 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6358 const TargetRegisterClass *Src1SubRC =
6359 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6360
6361 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6362 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6363 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6364 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6365
6366 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6367 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6368 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6369 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6370
6371 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6372 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6373 .addImm(0)
6374 .add(Src0Sub0)
6375 .addImm(0)
6376 .add(Src1Sub0)
6377 .addReg(SrcCondCopy);
6378 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6379 .addImm(0)
6380 .add(Src0Sub1)
6381 .addImm(0)
6382 .add(Src1Sub1)
6383 .addReg(SrcCondCopy);
6384
6385 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6386 .addReg(DstLo)
6387 .addImm(AMDGPU::sub0)
6388 .addReg(DstHi)
6389 .addImm(AMDGPU::sub1);
6390 MI.eraseFromParent();
6391 return BB;
6392 }
6393 case AMDGPU::SI_BR_UNDEF: {
6394 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6395 .add(MI.getOperand(0));
6396 Br->getOperand(1).setIsUndef(); // read undef SCC
6397 MI.eraseFromParent();
6398 return BB;
6399 }
6400 case AMDGPU::ADJCALLSTACKUP:
6401 case AMDGPU::ADJCALLSTACKDOWN: {
6403 MachineInstrBuilder MIB(*MF, &MI);
6404 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6405 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6406 return BB;
6407 }
6408 case AMDGPU::SI_CALL_ISEL: {
6409 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6410
6412 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6413
6414 for (const MachineOperand &MO : MI.operands())
6415 MIB.add(MO);
6416
6417 MIB.cloneMemRefs(MI);
6418 MI.eraseFromParent();
6419 return BB;
6420 }
6421 case AMDGPU::V_ADD_CO_U32_e32:
6422 case AMDGPU::V_SUB_CO_U32_e32:
6423 case AMDGPU::V_SUBREV_CO_U32_e32: {
6424 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6425 unsigned Opc = MI.getOpcode();
6426
6427 bool NeedClampOperand = false;
6428 if (TII->pseudoToMCOpcode(Opc) == -1) {
6430 NeedClampOperand = true;
6431 }
6432
6433 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6434 if (TII->isVOP3(*I)) {
6435 I.addReg(TRI->getVCC(), RegState::Define);
6436 }
6437 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6438 if (NeedClampOperand)
6439 I.addImm(0); // clamp bit for e64 encoding
6440
6441 TII->legalizeOperands(*I);
6442
6443 MI.eraseFromParent();
6444 return BB;
6445 }
6446 case AMDGPU::V_ADDC_U32_e32:
6447 case AMDGPU::V_SUBB_U32_e32:
6448 case AMDGPU::V_SUBBREV_U32_e32:
6449 // These instructions have an implicit use of vcc which counts towards the
6450 // constant bus limit.
6451 TII->legalizeOperands(MI);
6452 return BB;
6453 case AMDGPU::DS_GWS_INIT:
6454 case AMDGPU::DS_GWS_SEMA_BR:
6455 case AMDGPU::DS_GWS_BARRIER:
6456 case AMDGPU::DS_GWS_SEMA_V:
6457 case AMDGPU::DS_GWS_SEMA_P:
6458 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6459 // An s_waitcnt 0 is required to be the instruction immediately following.
6460 if (getSubtarget()->hasGWSAutoReplay()) {
6462 return BB;
6463 }
6464
6465 return emitGWSMemViolTestLoop(MI, BB);
6466 case AMDGPU::S_SETREG_B32: {
6467 // Try to optimize cases that only set the denormal mode or rounding mode.
6468 //
6469 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6470 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6471 // instead.
6472 //
6473 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6474 // allow you to have a no side effect instruction in the output of a
6475 // sideeffecting pattern.
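 // E.g. an s_setreg_b32 of a constant that covers exactly the 4-bit rounding
 // mode field of the MODE register becomes s_round_mode with the same
 // immediate, and one that covers the denormal field becomes s_denorm_mode.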
6476 auto [ID, Offset, Width] =
6477 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6478 if (ID != AMDGPU::Hwreg::ID_MODE)
6479 return BB;
6480
6481 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6482 const unsigned SetMask = WidthMask << Offset;
6483
6484 if (getSubtarget()->hasDenormModeInst()) {
6485 unsigned SetDenormOp = 0;
6486 unsigned SetRoundOp = 0;
6487
6488 // The dedicated instructions can only set the whole denorm or round mode
6489 // at once, not a subset of bits in either.
6490 if (SetMask ==
6491 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6492 // If this fully sets both the round and denorm mode, emit the two
6493 // dedicated instructions for these.
6494 SetRoundOp = AMDGPU::S_ROUND_MODE;
6495 SetDenormOp = AMDGPU::S_DENORM_MODE;
6496 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6497 SetRoundOp = AMDGPU::S_ROUND_MODE;
6498 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6499 SetDenormOp = AMDGPU::S_DENORM_MODE;
6500 }
6501
6502 if (SetRoundOp || SetDenormOp) {
6503 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6504 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6505 unsigned ImmVal = Def->getOperand(1).getImm();
6506 if (SetRoundOp) {
6507 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6508 .addImm(ImmVal & 0xf);
6509
6510 // If we also have the denorm mode, get just the denorm mode bits.
6511 ImmVal >>= 4;
6512 }
6513
6514 if (SetDenormOp) {
6515 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6516 .addImm(ImmVal & 0xf);
6517 }
6518
6519 MI.eraseFromParent();
6520 return BB;
6521 }
6522 }
6523 }
6524
6525 // If only FP bits are touched, use the no-side-effects pseudo.
6526 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6527 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6528 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6529
6530 return BB;
6531 }
6532 case AMDGPU::S_INVERSE_BALLOT_U32:
6533 case AMDGPU::S_INVERSE_BALLOT_U64:
6534 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6535 // necessary. After that they are equivalent to a COPY.
6536 MI.setDesc(TII->get(AMDGPU::COPY));
6537 return BB;
6538 case AMDGPU::ENDPGM_TRAP: {
6539 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6540 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6541 MI.addOperand(MachineOperand::CreateImm(0));
6542 return BB;
6543 }
6544
6545 // We need a block split to make the real endpgm a terminator. We also don't
6546 // want to break phis in successor blocks, so we can't just delete to the
6547 // end of the block.
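 // Resulting CFG: BB ends with an s_cbranch_execnz to a new TrapBB containing
 // only s_endpgm, otherwise falls through to SplitBB (the code following this
 // pseudo), and SplitBB is returned as the block to continue inserting into.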
6548
6549 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6550 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6551 MF->push_back(TrapBB);
6552 // clang-format off
6553 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6554 .addImm(0);
6555 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6556 .addMBB(TrapBB);
6557 // clang-format on
6558
6559 BB->addSuccessor(TrapBB);
6560 MI.eraseFromParent();
6561 return SplitBB;
6562 }
6563 case AMDGPU::SIMULATED_TRAP: {
6564 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6565 MachineBasicBlock *SplitBB =
6566 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6567 MI.eraseFromParent();
6568 return SplitBB;
6569 }
6570 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6571 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6573
6574 // During ISel, it's difficult to propagate the original EXEC mask to use as
6575 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6576 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6577 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6578 Register OriginalExec = Setup->getOperand(0).getReg();
6579 MF->getRegInfo().clearKillFlags(OriginalExec);
6580 MI.getOperand(0).setReg(OriginalExec);
6581 return BB;
6582 }
6583 default:
6584 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6585 if (!MI.mayStore())
6587 return BB;
6588 }
6590 }
6591}
6592
6594 // This currently forces unfolding various combinations of fsub into fma with
6595 // free fneg'd operands. As long as we have fast FMA (controlled by
6596 // isFMAFasterThanFMulAndFAdd), we should perform these.
6597
6598 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6599 // most of these combines appear to be cycle neutral but save on instruction
6600 // count / code size.
6601 return true;
6602}
6603
6605
6607 EVT VT) const {
6608 if (!VT.isVector()) {
6609 return MVT::i1;
6610 }
6611 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6612}
6613
6615 // TODO: Should i16 be used always if legal? For now it would force VALU
6616 // shifts.
6617 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6618}
6619
6621 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6622 ? Ty.changeElementSize(16)
6623 : Ty.changeElementSize(32);
6624}
6625
6626 // Answering this is somewhat tricky and depends on the specific device, since
6627 // different devices have different rates for fma and for all f64 operations.
6628//
6629// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6630// regardless of which device (although the number of cycles differs between
6631// devices), so it is always profitable for f64.
6632//
6633// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6634// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6635// which we can always do even without fused FP ops since it returns the same
6636// result as the separate operations and since it is always full
6637// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6638// however does not support denormals, so we do report fma as faster if we have
6639// a fast fma device and require denormals.
6640//
6642 EVT VT) const {
6643 VT = VT.getScalarType();
6644
6645 switch (VT.getSimpleVT().SimpleTy) {
6646 case MVT::f32: {
6647 // If mad is not available this depends only on if f32 fma is full rate.
6648 if (!Subtarget->hasMadMacF32Insts())
6649 return Subtarget->hasFastFMAF32();
6650
6651 // Otherwise f32 mad is always full rate and returns the same result as
6652 // the separate operations, so it should be preferred over fma.
6653 // However, it does not support denormals.
6654 if (!denormalModeIsFlushAllF32(MF))
6655 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6656
6657 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6658 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6659 }
6660 case MVT::f64:
6661 return true;
6662 case MVT::f16:
6663 case MVT::bf16:
6664 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6665 default:
6666 break;
6667 }
6668
6669 return false;
6670}
6671
6673 LLT Ty) const {
6674 switch (Ty.getScalarSizeInBits()) {
6675 case 16:
6676 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6677 case 32:
6678 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6679 case 64:
6680 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6681 default:
6682 break;
6683 }
6684
6685 return false;
6686}
6687
6689 if (!Ty.isScalar())
6690 return false;
6691
6692 if (Ty.getScalarSizeInBits() == 16)
6693 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6694 if (Ty.getScalarSizeInBits() == 32)
6695 return Subtarget->hasMadMacF32Insts() &&
6696 denormalModeIsFlushAllF32(*MI.getMF());
6697
6698 return false;
6699}
6700
6702 const SDNode *N) const {
6703 // TODO: Check future ftz flag
6704 // v_mad_f32/v_mac_f32 do not support denormals.
6705 EVT VT = N->getValueType(0);
6706 if (VT == MVT::f32)
6707 return Subtarget->hasMadMacF32Insts() &&
6708 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6709 if (VT == MVT::f16) {
6710 return Subtarget->hasMadF16() &&
6711 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6712 }
6713
6714 return false;
6715}
6716
6717//===----------------------------------------------------------------------===//
6718// Custom DAG Lowering Operations
6719//===----------------------------------------------------------------------===//
6720
6721// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6722// wider vector type is legal.
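// E.g. a unary operation on v4f16 is split into two v2f16 operations whose
// results are rejoined with CONCAT_VECTORS, rather than being scalarized into
// four f16 operations.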
6724 SelectionDAG &DAG) const {
6725 unsigned Opc = Op.getOpcode();
6726 EVT VT = Op.getValueType();
6727 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6728 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6729 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6730 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6731 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6732 VT == MVT::v32bf16);
6733
6734 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6735
6736 SDLoc SL(Op);
6737 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6738 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6739
6740 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6741}
6742
6743 // Enable lowering of ROTR for vxi32 types. This is a workaround for a
6744 // regression whereby extra unnecessary instructions were added to the codegen
6745 // for rotr operations, caused by legalizing v2i32 or, which resulted in extra
6746 // instructions to extract the result from the vector.
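// E.g. a rotr on v2i32 is unrolled here into two scalar i32 rotr nodes that
// are reassembled with a BUILD_VECTOR.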
6748 [[maybe_unused]] EVT VT = Op.getValueType();
6749
6750 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6751 VT == MVT::v16i32) &&
6752 "Unexpected ValueType.");
6753
6754 return DAG.UnrollVectorOp(Op.getNode());
6755}
6756
6757// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6758// wider vector type is legal.
6760 SelectionDAG &DAG) const {
6761 unsigned Opc = Op.getOpcode();
6762 EVT VT = Op.getValueType();
6763 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6764 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6765 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6766 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6767 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6768 VT == MVT::v32bf16);
6769
6770 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6771 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6772
6773 SDLoc SL(Op);
6774
6775 SDValue OpLo =
6776 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6777 SDValue OpHi =
6778 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6779
6780 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6781}
6782
6783 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6784 SelectionDAG &DAG) const {
6785 unsigned Opc = Op.getOpcode();
6786 EVT VT = Op.getValueType();
6787 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6788 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6789 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6790 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6791 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6792 VT == MVT::v32bf16);
6793
6794 SDValue Op0 = Op.getOperand(0);
6795 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6796 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6797 : std::pair(Op0, Op0);
6798
6799 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6800 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6801
6802 SDLoc SL(Op);
6803 auto ResVT = DAG.GetSplitDestVTs(VT);
6804
6805 SDValue OpLo =
6806 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6807 SDValue OpHi =
6808 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6809
6810 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6811}
6812
6813 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6814 switch (Op.getOpcode()) {
6815 default:
6816 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6817 case ISD::BRCOND:
6818 return LowerBRCOND(Op, DAG);
6819 case ISD::RETURNADDR:
6820 return LowerRETURNADDR(Op, DAG);
6821 case ISD::LOAD: {
6822 SDValue Result = LowerLOAD(Op, DAG);
6823 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6824 "Load should return a value and a chain");
6825 return Result;
6826 }
6827 case ISD::FSQRT: {
6828 EVT VT = Op.getValueType();
6829 if (VT == MVT::f32)
6830 return lowerFSQRTF32(Op, DAG);
6831 if (VT == MVT::f64)
6832 return lowerFSQRTF64(Op, DAG);
6833 return SDValue();
6834 }
6835 case ISD::FSIN:
6836 case ISD::FCOS:
6837 return LowerTrig(Op, DAG);
6838 case ISD::SELECT:
6839 return LowerSELECT(Op, DAG);
6840 case ISD::FDIV:
6841 return LowerFDIV(Op, DAG);
6842 case ISD::FFREXP:
6843 return LowerFFREXP(Op, DAG);
6844 case ISD::ATOMIC_CMP_SWAP:
6845 return LowerATOMIC_CMP_SWAP(Op, DAG);
6846 case ISD::STORE:
6847 return LowerSTORE(Op, DAG);
6848 case ISD::GlobalAddress: {
6849 MachineFunction &MF = DAG.getMachineFunction();
6850 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6851 return LowerGlobalAddress(MFI, Op, DAG);
6852 }
6853 case ISD::INTRINSIC_WO_CHAIN:
6854 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6855 case ISD::INTRINSIC_W_CHAIN:
6856 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6857 case ISD::INTRINSIC_VOID:
6858 return LowerINTRINSIC_VOID(Op, DAG);
6859 case ISD::ADDRSPACECAST:
6860 return lowerADDRSPACECAST(Op, DAG);
6861 case ISD::INSERT_SUBVECTOR:
6862 return lowerINSERT_SUBVECTOR(Op, DAG);
6863 case ISD::INSERT_VECTOR_ELT:
6864 return lowerINSERT_VECTOR_ELT(Op, DAG);
6865 case ISD::EXTRACT_VECTOR_ELT:
6866 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6867 case ISD::VECTOR_SHUFFLE:
6868 return lowerVECTOR_SHUFFLE(Op, DAG);
6869 case ISD::SCALAR_TO_VECTOR:
6870 return lowerSCALAR_TO_VECTOR(Op, DAG);
6871 case ISD::BUILD_VECTOR:
6872 return lowerBUILD_VECTOR(Op, DAG);
6873 case ISD::FP_ROUND:
6874 case ISD::STRICT_FP_ROUND:
6875 return lowerFP_ROUND(Op, DAG);
6876 case ISD::TRAP:
6877 return lowerTRAP(Op, DAG);
6878 case ISD::DEBUGTRAP:
6879 return lowerDEBUGTRAP(Op, DAG);
6880 case ISD::ABS:
6881 case ISD::FABS:
6882 case ISD::FNEG:
6883 case ISD::FCANONICALIZE:
6884 case ISD::BSWAP:
6885 return splitUnaryVectorOp(Op, DAG);
6886 case ISD::FMINNUM:
6887 case ISD::FMAXNUM:
6888 return lowerFMINNUM_FMAXNUM(Op, DAG);
6889 case ISD::FMINIMUMNUM:
6890 case ISD::FMAXIMUMNUM:
6891 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6892 case ISD::FMINIMUM:
6893 case ISD::FMAXIMUM:
6894 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6895 case ISD::FLDEXP:
6896 case ISD::STRICT_FLDEXP:
6897 return lowerFLDEXP(Op, DAG);
6898 case ISD::FMA:
6899 return splitTernaryVectorOp(Op, DAG);
6900 case ISD::FP_TO_SINT:
6901 case ISD::FP_TO_UINT:
6902 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
6903 Op.getValueType() == MVT::i16 &&
6904 Op.getOperand(0).getValueType() == MVT::f32) {
6905 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
6906 return Op;
6907 }
6908 return LowerFP_TO_INT(Op, DAG);
6909 case ISD::SHL:
6910 case ISD::SRA:
6911 case ISD::SRL:
6912 case ISD::ADD:
6913 case ISD::SUB:
6914 case ISD::SMIN:
6915 case ISD::SMAX:
6916 case ISD::UMIN:
6917 case ISD::UMAX:
6918 case ISD::FADD:
6919 case ISD::FMUL:
6920 case ISD::FMINNUM_IEEE:
6921 case ISD::FMAXNUM_IEEE:
6922 case ISD::UADDSAT:
6923 case ISD::USUBSAT:
6924 case ISD::SADDSAT:
6925 case ISD::SSUBSAT:
6926 return splitBinaryVectorOp(Op, DAG);
6927 case ISD::FCOPYSIGN:
6928 return lowerFCOPYSIGN(Op, DAG);
6929 case ISD::MUL:
6930 return lowerMUL(Op, DAG);
6931 case ISD::SMULO:
6932 case ISD::UMULO:
6933 return lowerXMULO(Op, DAG);
6934 case ISD::SMUL_LOHI:
6935 case ISD::UMUL_LOHI:
6936 return lowerXMUL_LOHI(Op, DAG);
6937 case ISD::DYNAMIC_STACKALLOC:
6938 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6939 case ISD::STACKSAVE:
6940 return LowerSTACKSAVE(Op, DAG);
6941 case ISD::GET_ROUNDING:
6942 return lowerGET_ROUNDING(Op, DAG);
6943 case ISD::SET_ROUNDING:
6944 return lowerSET_ROUNDING(Op, DAG);
6945 case ISD::PREFETCH:
6946 return lowerPREFETCH(Op, DAG);
6947 case ISD::FP_EXTEND:
6948 case ISD::STRICT_FP_EXTEND:
6949 return lowerFP_EXTEND(Op, DAG);
6950 case ISD::GET_FPENV:
6951 return lowerGET_FPENV(Op, DAG);
6952 case ISD::SET_FPENV:
6953 return lowerSET_FPENV(Op, DAG);
6954 case ISD::ROTR:
6955 return lowerROTR(Op, DAG);
6956 }
6957 return SDValue();
6958}
6959
6960 // Used for D16: casts the result of an instruction into the right vector and
6961 // packs the values if the load returned them unpacked.
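// Illustrative example (inferred from the code below): with unpacked D16
// memory instructions, a v4f16 load comes back as v4i32 holding one half
// value per 32-bit lane; each lane is truncated to i16, rebuilt into v4i16,
// and bitcast back to v4f16.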
6962 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6963 const SDLoc &DL, SelectionDAG &DAG,
6964 bool Unpacked) {
6965 if (!LoadVT.isVector())
6966 return Result;
6967
6968 // Cast back to the original packed type or to a larger type that is a
6969 // multiple of 32 bits for D16. Widening the return type is required for
6970 // legalization.
6971 EVT FittingLoadVT = LoadVT;
6972 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6973 FittingLoadVT =
6974 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6975 LoadVT.getVectorNumElements() + 1);
6976 }
6977
6978 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6979 // Truncate to v2i16/v4i16.
6980 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6981
6982 // Work around the legalizer not scalarizing the truncate after vector op
6983 // legalization but not creating an intermediate vector trunc.
6984 SmallVector<SDValue, 4> Elts;
6985 DAG.ExtractVectorElements(Result, Elts);
6986 for (SDValue &Elt : Elts)
6987 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6988
6989 // Pad illegal v1i16/v3f16 to v4i16
6990 if ((LoadVT.getVectorNumElements() % 2) == 1)
6991 Elts.push_back(DAG.getPOISON(MVT::i16));
6992
6993 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6994
6995 // Bitcast to original type (v2f16/v4f16).
6996 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6997 }
6998
6999 // Cast back to the original packed type.
7000 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7001}
7002
7003SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7004 SelectionDAG &DAG,
7005 ArrayRef<SDValue> Ops,
7006 bool IsIntrinsic) const {
7007 SDLoc DL(M);
7008
7009 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7010 EVT LoadVT = M->getValueType(0);
7011
7012 EVT EquivLoadVT = LoadVT;
7013 if (LoadVT.isVector()) {
7014 if (Unpacked) {
7015 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7016 LoadVT.getVectorNumElements());
7017 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7018 // Widen v3f16 to legal type
7019 EquivLoadVT =
7020 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7021 LoadVT.getVectorNumElements() + 1);
7022 }
7023 }
7024
7025 // Change from v4f16/v2f16 to EquivLoadVT.
7026 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7027
7029 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7030 M->getMemoryVT(), M->getMemOperand());
7031
7032 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7033
7034 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7035}
7036
7037SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7038 SelectionDAG &DAG,
7039 ArrayRef<SDValue> Ops) const {
7040 SDLoc DL(M);
7041 EVT LoadVT = M->getValueType(0);
7042 EVT EltType = LoadVT.getScalarType();
7043 EVT IntVT = LoadVT.changeTypeToInteger();
7044
7045 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7046
7047 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7048 bool IsTFE = M->getNumValues() == 3;
7049
7050 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7051 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7052 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7053 : AMDGPUISD::BUFFER_LOAD;
7054
7055 if (IsD16) {
7056 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7057 }
7058
7059 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7060 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7061 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7062 IsTFE);
7063
7064 if (isTypeLegal(LoadVT)) {
7065 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7066 M->getMemOperand(), DAG);
7067 }
7068
7069 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7070 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7071 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7072 M->getMemOperand(), DAG);
7073 return DAG.getMergeValues(
7074 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7075 DL);
7076}
7077
7078 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7079 SelectionDAG &DAG) {
7080 EVT VT = N->getValueType(0);
7081 unsigned CondCode = N->getConstantOperandVal(3);
7082 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7083 return DAG.getPOISON(VT);
7084
7085 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7086
7087 SDValue LHS = N->getOperand(1);
7088 SDValue RHS = N->getOperand(2);
7089
7090 SDLoc DL(N);
7091
7092 EVT CmpVT = LHS.getValueType();
7093 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7094 unsigned PromoteOp =
7095 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7096 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7097 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7098 }
7099
7100 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7101
7102 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7103 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7104
7105 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7106 DAG.getCondCode(CCOpcode));
7107 if (VT.bitsEq(CCVT))
7108 return SetCC;
7109 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7110}
7111
7112 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7113 SelectionDAG &DAG) {
7114 EVT VT = N->getValueType(0);
7115
7116 unsigned CondCode = N->getConstantOperandVal(3);
7117 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7118 return DAG.getPOISON(VT);
7119
7120 SDValue Src0 = N->getOperand(1);
7121 SDValue Src1 = N->getOperand(2);
7122 EVT CmpVT = Src0.getValueType();
7123 SDLoc SL(N);
7124
7125 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7126 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7127 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7128 }
7129
7130 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7131 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7132 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7133 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7134 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7135 DAG.getCondCode(CCOpcode));
7136 if (VT.bitsEq(CCVT))
7137 return SetCC;
7138 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7139}
7140
7141 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7142 SelectionDAG &DAG) {
7143 EVT VT = N->getValueType(0);
7144 SDValue Src = N->getOperand(1);
7145 SDLoc SL(N);
7146
7147 if (Src.getOpcode() == ISD::SETCC) {
7148 SDValue Op0 = Src.getOperand(0);
7149 SDValue Op1 = Src.getOperand(1);
7150 // Need to expand bfloat to float for comparison (setcc).
7151 if (Op0.getValueType() == MVT::bf16) {
7152 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7153 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7154 }
7155 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7156 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7157 }
7158 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7159 // (ballot 0) -> 0
7160 if (Arg->isZero())
7161 return DAG.getConstant(0, SL, VT);
7162
7163 // (ballot 1) -> EXEC/EXEC_LO
7164 if (Arg->isOne()) {
7165 Register Exec;
7166 if (VT.getScalarSizeInBits() == 32)
7167 Exec = AMDGPU::EXEC_LO;
7168 else if (VT.getScalarSizeInBits() == 64)
7169 Exec = AMDGPU::EXEC;
7170 else
7171 return SDValue();
7172
7173 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7174 }
7175 }
7176
7177 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7178 // ISD::SETNE)
7179 return DAG.getNode(
7180 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7181 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7182}
7183
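// Lowers lane intrinsics (readlane, writelane, permlane, set.inactive, DPP
// ops) whose value size is not already the width the instruction operates on.
// Illustrative example: a 64-bit readlane is bitcast to v2i32, a 32-bit
// readlane is emitted per element, and the result is bitcast back to the
// original type.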
7184 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7185 SelectionDAG &DAG) {
7186 EVT VT = N->getValueType(0);
7187 unsigned ValSize = VT.getSizeInBits();
7188 unsigned IID = N->getConstantOperandVal(0);
7189 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7190 IID == Intrinsic::amdgcn_permlanex16;
7191 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7192 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7193 SDLoc SL(N);
7194 MVT IntVT = MVT::getIntegerVT(ValSize);
7195 const GCNSubtarget *ST = TLI.getSubtarget();
7196 unsigned SplitSize = 32;
7197 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7198 ST->hasDPALU_DPP() &&
7199 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7200 SplitSize = 64;
7201
7202 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7203 SDValue Src2, MVT ValT) -> SDValue {
7204 SmallVector<SDValue, 8> Operands;
7205 switch (IID) {
7206 case Intrinsic::amdgcn_permlane16:
7207 case Intrinsic::amdgcn_permlanex16:
7208 case Intrinsic::amdgcn_update_dpp:
7209 Operands.push_back(N->getOperand(6));
7210 Operands.push_back(N->getOperand(5));
7211 Operands.push_back(N->getOperand(4));
7212 [[fallthrough]];
7213 case Intrinsic::amdgcn_writelane:
7214 Operands.push_back(Src2);
7215 [[fallthrough]];
7216 case Intrinsic::amdgcn_readlane:
7217 case Intrinsic::amdgcn_set_inactive:
7218 case Intrinsic::amdgcn_set_inactive_chain_arg:
7219 case Intrinsic::amdgcn_mov_dpp8:
7220 Operands.push_back(Src1);
7221 [[fallthrough]];
7222 case Intrinsic::amdgcn_readfirstlane:
7223 case Intrinsic::amdgcn_permlane64:
7224 Operands.push_back(Src0);
7225 break;
7226 default:
7227 llvm_unreachable("unhandled lane op");
7228 }
7229
7230 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7231 std::reverse(Operands.begin(), Operands.end());
7232
7233 if (SDNode *GL = N->getGluedNode()) {
7234 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7235 GL = GL->getOperand(0).getNode();
7236 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7237 SDValue(GL, 0)));
7238 }
7239
7240 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7241 };
7242
7243 SDValue Src0 = N->getOperand(1);
7244 SDValue Src1, Src2;
7245 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7246 IID == Intrinsic::amdgcn_mov_dpp8 ||
7247 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7248 Src1 = N->getOperand(2);
7249 if (IID == Intrinsic::amdgcn_writelane ||
7250 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7251 Src2 = N->getOperand(3);
7252 }
7253
7254 if (ValSize == SplitSize) {
7255 // Already legal
7256 return SDValue();
7257 }
7258
7259 if (ValSize < 32) {
7260 bool IsFloat = VT.isFloatingPoint();
7261 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7262 SL, MVT::i32);
7263
7264 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7265 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7266 SL, MVT::i32);
7267 }
7268
7269 if (IID == Intrinsic::amdgcn_writelane) {
7270 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7271 SL, MVT::i32);
7272 }
7273
7274 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7275 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7276 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7277 }
7278
7279 if (ValSize % SplitSize != 0)
7280 return SDValue();
7281
7282 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7283 EVT VT = N->getValueType(0);
7284 unsigned NE = VT.getVectorNumElements();
7285 EVT EltVT = VT.getVectorElementType();
7286 SmallVector<SDValue, 8> Scalars;
7287 unsigned NumOperands = N->getNumOperands();
7288 SmallVector<SDValue, 4> Operands(NumOperands);
7289 SDNode *GL = N->getGluedNode();
7290
7291 // only handle convergencectrl_glue
7292 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7293
7294 for (unsigned i = 0; i != NE; ++i) {
7295 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7296 ++j) {
7297 SDValue Operand = N->getOperand(j);
7298 EVT OperandVT = Operand.getValueType();
7299 if (OperandVT.isVector()) {
7300 // A vector operand; extract a single element.
7301 EVT OperandEltVT = OperandVT.getVectorElementType();
7302 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7303 Operand, DAG.getVectorIdxConstant(i, SL));
7304 } else {
7305 // A scalar operand; just use it as is.
7306 Operands[j] = Operand;
7307 }
7308 }
7309
7310 if (GL)
7311 Operands[NumOperands - 1] =
7312 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7313 SDValue(GL->getOperand(0).getNode(), 0));
7314
7315 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7316 }
7317
7318 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7319 return DAG.getBuildVector(VecVT, SL, Scalars);
7320 };
7321
7322 if (VT.isVector()) {
7323 switch (MVT::SimpleValueType EltTy =
7324 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7325 case MVT::i32:
7326 case MVT::f32:
7327 if (SplitSize == 32) {
7328 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7329 return unrollLaneOp(LaneOp.getNode());
7330 }
7331 [[fallthrough]];
7332 case MVT::i16:
7333 case MVT::f16:
7334 case MVT::bf16: {
7335 unsigned SubVecNumElt =
7336 SplitSize / VT.getVectorElementType().getSizeInBits();
7337 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7338 SmallVector<SDValue, 4> Pieces;
7339 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7340 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7341 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7342 DAG.getConstant(EltIdx, SL, MVT::i32));
7343
7344 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7345 IsPermLane16)
7346 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7347 DAG.getConstant(EltIdx, SL, MVT::i32));
7348
7349 if (IID == Intrinsic::amdgcn_writelane)
7350 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7351 DAG.getConstant(EltIdx, SL, MVT::i32));
7352
7353 Pieces.push_back(
7354 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7355 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7356 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7357 EltIdx += SubVecNumElt;
7358 }
7359 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7360 }
7361 default:
7362 // Handle all other cases by bitcasting to i32 vectors
7363 break;
7364 }
7365 }
7366
7367 MVT VecVT =
7368 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7369 Src0 = DAG.getBitcast(VecVT, Src0);
7370
7371 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7372 Src1 = DAG.getBitcast(VecVT, Src1);
7373
7374 if (IID == Intrinsic::amdgcn_writelane)
7375 Src2 = DAG.getBitcast(VecVT, Src2);
7376
7377 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7378 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7379 return DAG.getBitcast(VT, UnrolledLaneOp);
7380}
7381
7382 void SITargetLowering::ReplaceNodeResults(SDNode *N,
7383 SmallVectorImpl<SDValue> &Results,
7384 SelectionDAG &DAG) const {
7385 switch (N->getOpcode()) {
7386 case ISD::INSERT_VECTOR_ELT: {
7387 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7388 Results.push_back(Res);
7389 return;
7390 }
7391 case ISD::EXTRACT_VECTOR_ELT: {
7392 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7393 Results.push_back(Res);
7394 return;
7395 }
7396 case ISD::INTRINSIC_WO_CHAIN: {
7397 unsigned IID = N->getConstantOperandVal(0);
7398 switch (IID) {
7399 case Intrinsic::amdgcn_make_buffer_rsrc:
7400 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7401 return;
7402 case Intrinsic::amdgcn_cvt_pkrtz: {
7403 SDValue Src0 = N->getOperand(1);
7404 SDValue Src1 = N->getOperand(2);
7405 SDLoc SL(N);
7406 SDValue Cvt =
7407 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7408 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7409 return;
7410 }
7411 case Intrinsic::amdgcn_cvt_pknorm_i16:
7412 case Intrinsic::amdgcn_cvt_pknorm_u16:
7413 case Intrinsic::amdgcn_cvt_pk_i16:
7414 case Intrinsic::amdgcn_cvt_pk_u16: {
7415 SDValue Src0 = N->getOperand(1);
7416 SDValue Src1 = N->getOperand(2);
7417 SDLoc SL(N);
7418 unsigned Opcode;
7419
7420 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7421 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7422 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7423 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7424 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7425 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7426 else
7427 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7428
7429 EVT VT = N->getValueType(0);
7430 if (isTypeLegal(VT))
7431 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7432 else {
7433 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7435 }
7436 return;
7437 }
7438 case Intrinsic::amdgcn_s_buffer_load: {
7439 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7440 // s_buffer_load_u8 for signed and unsigned load instructions. Next, the DAG
7441 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7442 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7443 // s_buffer_load_i8.
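// In other words (illustrative): only the unsigned SBUFFER_LOAD_UBYTE node
// is created here; the signed form only appears later when the combiner
// folds a sign_extend_inreg into it.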
7444 if (!Subtarget->hasScalarSubwordLoads())
7445 return;
7446 SDValue Op = SDValue(N, 0);
7447 SDValue Rsrc = Op.getOperand(1);
7448 SDValue Offset = Op.getOperand(2);
7449 SDValue CachePolicy = Op.getOperand(3);
7450 EVT VT = Op.getValueType();
7451 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7452 SDLoc DL(Op);
7454 const DataLayout &DataLayout = DAG.getDataLayout();
7455 Align Alignment =
7461 VT.getStoreSize(), Alignment);
7462 SDValue LoadVal;
7463 if (!Offset->isDivergent()) {
7464 SDValue Ops[] = {Rsrc, // source register
7465 Offset, CachePolicy};
7466 SDValue BufferLoad =
7467 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7468 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7469 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7470 } else {
7471 SDValue Ops[] = {
7472 DAG.getEntryNode(), // Chain
7473 Rsrc, // rsrc
7474 DAG.getConstant(0, DL, MVT::i32), // vindex
7475 {}, // voffset
7476 {}, // soffset
7477 {}, // offset
7478 CachePolicy, // cachepolicy
7479 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7480 };
7481 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7482 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7483 }
7484 Results.push_back(LoadVal);
7485 return;
7486 }
7487 case Intrinsic::amdgcn_dead: {
7488 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7489 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7490 return;
7491 }
7492 }
7493 break;
7494 }
7495 case ISD::INTRINSIC_W_CHAIN: {
7496 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7497 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7498 // FIXME: Hacky
7499 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7500 Results.push_back(Res.getOperand(I));
7501 }
7502 } else {
7503 Results.push_back(Res);
7504 Results.push_back(Res.getValue(1));
7505 }
7506 return;
7507 }
7508
7509 break;
7510 }
7511 case ISD::SELECT: {
7512 SDLoc SL(N);
7513 EVT VT = N->getValueType(0);
7514 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7515 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7516 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7517
7518 EVT SelectVT = NewVT;
7519 if (NewVT.bitsLT(MVT::i32)) {
7520 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7521 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7522 SelectVT = MVT::i32;
7523 }
7524
7525 SDValue NewSelect =
7526 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7527
7528 if (NewVT != SelectVT)
7529 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7530 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7531 return;
7532 }
7533 case ISD::FNEG: {
7534 if (N->getValueType(0) != MVT::v2f16)
7535 break;
7536
7537 SDLoc SL(N);
7538 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7539
7540 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7541 DAG.getConstant(0x80008000, SL, MVT::i32));
7542 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7543 return;
7544 }
7545 case ISD::FABS: {
7546 if (N->getValueType(0) != MVT::v2f16)
7547 break;
7548
7549 SDLoc SL(N);
7550 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7551
7552 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7553 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7554 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7555 return;
7556 }
7557 case ISD::FSQRT: {
7558 if (N->getValueType(0) != MVT::f16)
7559 break;
7560 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7561 break;
7562 }
7563 default:
7564 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7565 break;
7566 }
7567}
7568
7569/// Helper function for LowerBRCOND
7570static SDNode *findUser(SDValue Value, unsigned Opcode) {
7571
7572 for (SDUse &U : Value->uses()) {
7573 if (U.get() != Value)
7574 continue;
7575
7576 if (U.getUser()->getOpcode() == Opcode)
7577 return U.getUser();
7578 }
7579 return nullptr;
7580}
7581
7582unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7583 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7584 switch (Intr->getConstantOperandVal(1)) {
7585 case Intrinsic::amdgcn_if:
7586 return AMDGPUISD::IF;
7587 case Intrinsic::amdgcn_else:
7588 return AMDGPUISD::ELSE;
7589 case Intrinsic::amdgcn_loop:
7590 return AMDGPUISD::LOOP;
7591 case Intrinsic::amdgcn_end_cf:
7592 llvm_unreachable("should not occur");
7593 default:
7594 return 0;
7595 }
7596 }
7597
7598 // break, if_break, else_break are all only used as inputs to loop, not
7599 // directly as branch conditions.
7600 return 0;
7601}
7602
7609
7610 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7611 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7612 return false;
7613
7614 // FIXME: Either avoid relying on address space here or change the default
7615 // address space for functions to avoid the explicit check.
7616 return (GV->getValueType()->isFunctionTy() ||
7619}
7620
7621 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7622 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7623}
7624
7625 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7626 if (!GV->hasExternalLinkage())
7627 return true;
7628
7629 const auto OS = getTargetMachine().getTargetTriple().getOS();
7630 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7631}
7632
7633/// This transforms the control flow intrinsics to get the branch destination as
7634 /// the last parameter, and also switches the branch target with BR if the need arises.
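/// Illustrative example (not from the upstream comment): a BRCOND whose
/// condition is produced by llvm.amdgcn.if is replaced by an AMDGPUISD::IF
/// node that takes the branch destination block as its last operand.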
7635SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7636 SDLoc DL(BRCOND);
7637
7638 SDNode *Intr = BRCOND.getOperand(1).getNode();
7639 SDValue Target = BRCOND.getOperand(2);
7640 SDNode *BR = nullptr;
7641 SDNode *SetCC = nullptr;
7642
7643 switch (Intr->getOpcode()) {
7644 case ISD::SETCC: {
7645 // As long as we negate the condition everything is fine
7646 SetCC = Intr;
7647 Intr = SetCC->getOperand(0).getNode();
7648 break;
7649 }
7650 case ISD::XOR: {
7651 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7652 SDValue LHS = Intr->getOperand(0);
7653 SDValue RHS = Intr->getOperand(1);
7654 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7655 Intr = LHS.getNode();
7656 break;
7657 }
7658 [[fallthrough]];
7659 }
7660 default: {
7661 // Get the target from BR if we don't negate the condition
7662 BR = findUser(BRCOND, ISD::BR);
7663 assert(BR && "brcond missing unconditional branch user");
7664 Target = BR->getOperand(1);
7665 }
7666 }
7667
7668 unsigned CFNode = isCFIntrinsic(Intr);
7669 if (CFNode == 0) {
7670 // This is a uniform branch so we don't need to legalize.
7671 return BRCOND;
7672 }
7673
7674 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7675 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7676
7677 assert(!SetCC ||
7678 (SetCC->getConstantOperandVal(1) == 1 &&
7679 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7680 ISD::SETNE));
7681
7682 // operands of the new intrinsic call
7683 SmallVector<SDValue, 8> Ops;
7684 if (HaveChain)
7685 Ops.push_back(BRCOND.getOperand(0));
7686
7687 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7688 Ops.push_back(Target);
7689
7690 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7691
7692 // build the new intrinsic call
7693 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7694
7695 if (!HaveChain) {
7696 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7697
7698 Result = DAG.getMergeValues(Ops, DL).getNode();
7699 }
7700
7701 if (BR) {
7702 // Give the branch instruction our target
7703 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7704 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7705 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7706 }
7707
7708 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7709
7710 // Copy the intrinsic results to registers
7711 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7712 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7713 if (!CopyToReg)
7714 continue;
7715
7716 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7717 SDValue(Result, i - 1), SDValue());
7718
7719 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7720 }
7721
7722 // Remove the old intrinsic from the chain
7723 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7724 Intr->getOperand(0));
7725
7726 return Chain;
7727}
7728
7729SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7730 MVT VT = Op.getSimpleValueType();
7731 SDLoc DL(Op);
7732 // Checking the depth
7733 if (Op.getConstantOperandVal(0) != 0)
7734 return DAG.getConstant(0, DL, VT);
7735
7736 MachineFunction &MF = DAG.getMachineFunction();
7737 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7738 // Check for kernel and shader functions
7739 if (Info->isEntryFunction())
7740 return DAG.getConstant(0, DL, VT);
7741
7742 MachineFrameInfo &MFI = MF.getFrameInfo();
7743 // There is a call to @llvm.returnaddress in this function
7744 MFI.setReturnAddressIsTaken(true);
7745
7746 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7747 // Get the return address reg and mark it as an implicit live-in
7748 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7749 getRegClassFor(VT, Op.getNode()->isDivergent()));
7750
7751 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7752}
7753
7754SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7755 const SDLoc &DL, EVT VT) const {
7756 return Op.getValueType().bitsLE(VT)
7757 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7758 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7759 DAG.getTargetConstant(0, DL, MVT::i32));
7760}
7761
7762SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7763 SelectionDAG &DAG) const {
7764 EVT DstVT = Op.getValueType();
7765 unsigned NumElts = DstVT.getVectorNumElements();
7766 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7767
7768 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7769
7770 SDLoc DL(Op);
7771 unsigned Opc = Op.getOpcode();
7772 SDValue Flags = Op.getOperand(1);
7773 EVT HalfDstVT =
7774 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7775 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7776 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7777
7778 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7779}
7780
7781SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7782 SDValue Src = Op.getOperand(0);
7783 EVT SrcVT = Src.getValueType();
7784 EVT DstVT = Op.getValueType();
7785
7786 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7787 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7788 if (SrcVT.getScalarType() != MVT::f32)
7789 return SDValue();
7790 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7791 }
7792
7793 if (SrcVT.getScalarType() != MVT::f64)
7794 return Op;
7795
7796 SDLoc DL(Op);
7797 if (DstVT == MVT::f16) {
7798 // TODO: Handle strictfp
7799 if (Op.getOpcode() != ISD::FP_ROUND)
7800 return Op;
7801
7802 if (!Subtarget->has16BitInsts()) {
7803 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7804 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7805 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7806 }
7807 if (Op->getFlags().hasApproximateFuncs()) {
7808 SDValue Flags = Op.getOperand(1);
7809 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7810 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7811 }
7812 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7813 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7814 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7815 }
7816
7817 assert(DstVT.getScalarType() == MVT::bf16 &&
7818 "custom lower FP_ROUND for f16 or bf16");
7819 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7820
7821 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7822 // hardware f32 -> bf16 instruction.
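// Rounding f64 to f32 with the default mode and then f32 to bf16 could
// double-round; round-to-odd keeps enough information in the intermediate
// f32 that the final f32 -> bf16 step produces the correctly rounded
// f64 -> bf16 result.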
7823 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32)
7824 : MVT::f32;
7825 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7826 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7827 DAG.getTargetConstant(0, DL, MVT::i32));
7828}
7829
7830SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7831 SelectionDAG &DAG) const {
7832 EVT VT = Op.getValueType();
7833 const MachineFunction &MF = DAG.getMachineFunction();
7834 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7835 bool IsIEEEMode = Info->getMode().IEEE;
7836
7837 // FIXME: Assert during selection that this is only selected for
7838 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7839 // mode functions, but this happens to be OK since it's only done in cases
7840 // where it is known that there is no sNaN.
7841 if (IsIEEEMode)
7842 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7843
7844 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7845 VT == MVT::v16bf16)
7846 return splitBinaryVectorOp(Op, DAG);
7847 return Op;
7848}
7849
7850SDValue
7851SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7852 SelectionDAG &DAG) const {
7853 EVT VT = Op.getValueType();
7854 const MachineFunction &MF = DAG.getMachineFunction();
7855 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7856 bool IsIEEEMode = Info->getMode().IEEE;
7857
7858 if (IsIEEEMode)
7859 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7860
7861 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7862 VT == MVT::v16bf16)
7863 return splitBinaryVectorOp(Op, DAG);
7864 return Op;
7865}
7866
7867SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7868 SelectionDAG &DAG) const {
7869 EVT VT = Op.getValueType();
7870 if (VT.isVector())
7871 return splitBinaryVectorOp(Op, DAG);
7872
7873 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7874 !Subtarget->hasMinimum3Maximum3F16() &&
7875 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7876 "should not need to widen f16 minimum/maximum to v2f16");
7877
7878 // Widen f16 operation to v2f16
7879
7880 // fminimum f16:x, f16:y ->
7881 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7882 // (v2f16 (scalar_to_vector y))), 0
7883 SDLoc SL(Op);
7884 SDValue WideSrc0 =
7885 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7886 SDValue WideSrc1 =
7887 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7888
7889 SDValue Widened =
7890 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7891
7892 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7893 DAG.getConstant(0, SL, MVT::i32));
7894}
7895
7896SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7897 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7898 EVT VT = Op.getValueType();
7899 assert(VT == MVT::f16);
7900
7901 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7902 EVT ExpVT = Exp.getValueType();
7903 if (ExpVT == MVT::i16)
7904 return Op;
7905
7906 SDLoc DL(Op);
7907
7908 // Correct the exponent type for f16 to i16.
7909 // Clamp the range of the exponent to the instruction's range.
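// Note (illustrative reasoning): clamping the exponent to [-32768, 32767]
// cannot change the result, because any exponent of that magnitude already
// overflows or underflows every finite f16 value; the clamp only makes the
// exponent representable as i16.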
7910
7911 // TODO: This should be a generic narrowing legalization, and can easily be
7912 // done for GlobalISel.
7913
7914 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7915 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7916
7917 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7918 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7919
7920 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7921
7922 if (IsStrict) {
7923 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7924 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7925 }
7926
7927 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7928}
7929
7930 static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7931 switch (Op->getOpcode()) {
7932 case ISD::SRA:
7933 case ISD::SMIN:
7934 case ISD::SMAX:
7935 return ISD::SIGN_EXTEND;
7936 case ISD::SRL:
7937 case ISD::UMIN:
7938 case ISD::UMAX:
7939 return ISD::ZERO_EXTEND;
7940 case ISD::ADD:
7941 case ISD::SUB:
7942 case ISD::AND:
7943 case ISD::OR:
7944 case ISD::XOR:
7945 case ISD::SHL:
7946 case ISD::SELECT:
7947 case ISD::MUL:
7948 // operation result won't be influenced by garbage high bits.
7949 // TODO: are all of those cases correct, and are there more?
7950 return ISD::ANY_EXTEND;
7951 case ISD::SETCC: {
7952 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7953 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7954 }
7955 default:
7956 llvm_unreachable("unexpected opcode!");
7957 }
7958}
7959
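// Illustrative example (assumption about the intent): a uniform i16 add is
// rewritten as trunc(add(anyext a to i32, anyext b to i32)) so it can be
// selected as a 32-bit scalar ALU operation rather than a 16-bit VALU one.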
7960SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7961 DAGCombinerInfo &DCI) const {
7962 const unsigned Opc = Op.getOpcode();
7963 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7964 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7965 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7966 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7967 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7968
7969 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7970 : Op->getOperand(0).getValueType();
7971 auto ExtTy = OpTy.changeElementType(MVT::i32);
7972
7973 if (DCI.isBeforeLegalizeOps() ||
7974 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7975 return SDValue();
7976
7977 auto &DAG = DCI.DAG;
7978
7979 SDLoc DL(Op);
7980 SDValue LHS;
7981 SDValue RHS;
7982 if (Opc == ISD::SELECT) {
7983 LHS = Op->getOperand(1);
7984 RHS = Op->getOperand(2);
7985 } else {
7986 LHS = Op->getOperand(0);
7987 RHS = Op->getOperand(1);
7988 }
7989
7990 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7991 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7992
7993 // Special case: for shifts, the RHS always needs a zext.
7994 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7995 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7996 else
7997 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7998
7999 // setcc always returns i1/i1 vec so no need to truncate after.
8000 if (Opc == ISD::SETCC) {
8001 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8002 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8003 }
8004
8005 // For other ops, we extend the operation's return type as well so we need to
8006 // truncate back to the original type.
8007 SDValue NewVal;
8008 if (Opc == ISD::SELECT)
8009 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8010 else
8011 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8012
8013 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8014}
8015
8016SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8017 SDValue Mag = Op.getOperand(0);
8018 EVT MagVT = Mag.getValueType();
8019
8020 if (MagVT.getVectorNumElements() > 2)
8021 return splitBinaryVectorOp(Op, DAG);
8022
8023 SDValue Sign = Op.getOperand(1);
8024 EVT SignVT = Sign.getValueType();
8025
8026 if (MagVT == SignVT)
8027 return Op;
8028
8029 // fcopysign v2f16:mag, v2f32:sign ->
8030 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8031
8032 SDLoc SL(Op);
8033 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8034 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8035
8036 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8037
8038 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8039}
8040
8041// Custom lowering for vector multiplications and s_mul_u64.
8042SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8043 EVT VT = Op.getValueType();
8044
8045 // Split vector operands.
8046 if (VT.isVector())
8047 return splitBinaryVectorOp(Op, DAG);
8048
8049 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8050
8051 // There are four ways to lower s_mul_u64:
8052 //
8053 // 1. If all the operands are uniform, then we lower it as it is.
8054 //
8055 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8056 // multiplications because there is not a vector equivalent of s_mul_u64.
8057 //
8058 // 3. If the cost model decides that it is more efficient to use vector
8059 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8060 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8061 //
8062 // 4. If the cost model decides to use vector registers and both of the
8063 // operands are zero-extended/sign-extended from 32-bits, then we split the
8064 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8065 // possible to check if the operands are zero-extended or sign-extended in
8066 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8067 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8068 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8069 // If the cost model decides that we have to use vector registers, then
8070 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8071 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8072 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8073 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8074 // SIInstrInfo.cpp .
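// Illustrative example for case 4 (not from the upstream comment): for
// (mul i64 (zext i32 %a), (zext i32 %b))
// both operands have at least 32 known leading zero bits, so the code below
// rewrites the multiply to S_MUL_U64_U32_PSEUDO; the sign-extended analogue
// becomes S_MUL_I64_I32_PSEUDO.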
8075
8076 if (Op->isDivergent())
8077 return SDValue();
8078
8079 SDValue Op0 = Op.getOperand(0);
8080 SDValue Op1 = Op.getOperand(1);
8081 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
8082 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8083 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8084 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8085 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8086 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8087 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8088 SDLoc SL(Op);
8089 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8090 return SDValue(
8091 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8092 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8093 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8094 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8095 return SDValue(
8096 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8097 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8098 return Op;
8099}
8100
8101SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8102 EVT VT = Op.getValueType();
8103 SDLoc SL(Op);
8104 SDValue LHS = Op.getOperand(0);
8105 SDValue RHS = Op.getOperand(1);
8106 bool isSigned = Op.getOpcode() == ISD::SMULO;
8107
8108 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8109 const APInt &C = RHSC->getAPIntValue();
8110 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
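// e.g. (illustrative) umulo(%x, 8): Result = %x << 3, and overflow is
// reported when (Result >> 3) != %x, i.e. the shift dropped set high bits.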
8111 if (C.isPowerOf2()) {
8112 // smulo(x, signed_min) is the same as umulo(x, signed_min).
8113 bool UseArithShift = isSigned && !C.isMinSignedValue();
8114 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8115 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8116 SDValue Overflow =
8117 DAG.getSetCC(SL, MVT::i1,
8118 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8119 Result, ShiftAmt),
8120 LHS, ISD::SETNE);
8121 return DAG.getMergeValues({Result, Overflow}, SL);
8122 }
8123 }
8124
8125 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8126 SDValue Top =
8127 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8128
8129 SDValue Sign = isSigned
8130 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8131 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8132 SL, MVT::i32))
8133 : DAG.getConstant(0, SL, VT);
8134 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8135
8136 return DAG.getMergeValues({Result, Overflow}, SL);
8137}
8138
8139SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8140 if (Op->isDivergent()) {
8141 // Select to V_MAD_[IU]64_[IU]32.
8142 return Op;
8143 }
8144 if (Subtarget->hasSMulHi()) {
8145 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8146 return SDValue();
8147 }
8148 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8149 // calculate the high part, so we might as well do the whole thing with
8150 // V_MAD_[IU]64_[IU]32.
8151 return Op;
8152}
8153
8154SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8155 if (!Subtarget->isTrapHandlerEnabled() ||
8156 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8157 return lowerTrapEndpgm(Op, DAG);
8158
8159 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8160 : lowerTrapHsaQueuePtr(Op, DAG);
8161}
8162
8163SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8164 SDLoc SL(Op);
8165 SDValue Chain = Op.getOperand(0);
8166 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8167}
8168
8169SDValue
8170SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8171 const SDLoc &DL, Align Alignment,
8172 ImplicitParameter Param) const {
8173 MachineFunction &MF = DAG.getMachineFunction();
8174 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8175 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8176 MachinePointerInfo PtrInfo =
8178 return DAG.getLoad(
8179 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8180 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8181}
8182
8183SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8184 SelectionDAG &DAG) const {
8185 SDLoc SL(Op);
8186 SDValue Chain = Op.getOperand(0);
8187
8188 SDValue QueuePtr;
8189 // For code object version 5, QueuePtr is passed through implicit kernarg.
8190 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8191 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8192 QueuePtr =
8193 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8194 } else {
8195 MachineFunction &MF = DAG.getMachineFunction();
8196 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8197 Register UserSGPR = Info->getQueuePtrUserSGPR();
8198
8199 if (UserSGPR == AMDGPU::NoRegister) {
8200 // We probably are in a function incorrectly marked with
8201 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8202 // trap, so just use a null pointer.
8203 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8204 } else {
8205 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8206 MVT::i64);
8207 }
8208 }
8209
8210 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8211 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8212
8213 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8214 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8215 ToReg.getValue(1)};
8216 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8217}
8218
8219SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8220 SDLoc SL(Op);
8221 SDValue Chain = Op.getOperand(0);
8222
8223 // We need to simulate the 's_trap 2' instruction on targets that run in
8224 // PRIV=1 (where it is treated as a nop).
8225 if (Subtarget->hasPrivEnabledTrap2NopBug())
8226 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8227
8228 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8229 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8230 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8231}
8232
8233SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8234 SDLoc SL(Op);
8235 SDValue Chain = Op.getOperand(0);
8236 MachineFunction &MF = DAG.getMachineFunction();
8237
8238 if (!Subtarget->isTrapHandlerEnabled() ||
8239 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8240 LLVMContext &Ctx = MF.getFunction().getContext();
8241 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8242 "debugtrap handler not supported",
8243 Op.getDebugLoc(), DS_Warning));
8244 return Chain;
8245 }
8246
8247 uint64_t TrapID =
8248 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8249 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8250 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8251}
8252
8253SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8254 SelectionDAG &DAG) const {
8255 if (Subtarget->hasApertureRegs()) {
8256 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8257 ? AMDGPU::SRC_SHARED_BASE
8258 : AMDGPU::SRC_PRIVATE_BASE;
8259 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8260 !Subtarget->hasGloballyAddressableScratch()) &&
8261 "Cannot use src_private_base with globally addressable scratch!");
8262 // Note: this feature (register) is broken. When used as a 32-bit operand,
8263 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8264 // bits.
8265 //
8266 // To work around the issue, emit a 64 bit copy from this register
8267 // then extract the high bits. Note that this shouldn't even result in a
8268 // shift being emitted and simply become a pair of registers (e.g.):
8269 // s_mov_b64 s[6:7], src_shared_base
8270 // v_mov_b32_e32 v1, s7
8271 SDValue Copy =
8272 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8273 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8274 }
8275
8276 // For code object version 5, private_base and shared_base are passed through
8277 // implicit kernargs.
8278 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8279 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8280 ImplicitParameter Param =
8281 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8282 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8283 }
8284
8285 MachineFunction &MF = DAG.getMachineFunction();
8286 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8287 Register UserSGPR = Info->getQueuePtrUserSGPR();
8288 if (UserSGPR == AMDGPU::NoRegister) {
8289 // We probably are in a function incorrectly marked with
8290 // amdgpu-no-queue-ptr. This is undefined.
8291 return DAG.getPOISON(MVT::i32);
8292 }
8293
8294 SDValue QueuePtr =
8295 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8296
8297 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8298 // private_segment_aperture_base_hi.
8299 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8300
8301 SDValue Ptr =
8302 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8303
8304 // TODO: Use custom target PseudoSourceValue.
8305 // TODO: We should use the value from the IR intrinsic call, but it might not
8306 // be available and how do we get it?
8307 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8308 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8309 commonAlignment(Align(64), StructOffset),
8310 MachineMemOperand::MODereferenceable |
8311 MachineMemOperand::MOInvariant);
8312}
8313
8314/// Return true if the value is a known valid address, such that a null check is
8315/// not necessary.
8316 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8317 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8318 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8319 return true;
8320
8321 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8322 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8323
8324 // TODO: Search through arithmetic, handle arguments and loads
8325 // marked nonnull.
8326 return false;
8327}
8328
8329SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8330 SelectionDAG &DAG) const {
8331 SDLoc SL(Op);
8332
8333 const AMDGPUTargetMachine &TM =
8334 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8335
8336 unsigned DestAS, SrcAS;
8337 SDValue Src;
8338 bool IsNonNull = false;
8339 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8340 SrcAS = ASC->getSrcAddressSpace();
8341 Src = ASC->getOperand(0);
8342 DestAS = ASC->getDestAddressSpace();
8343 } else {
8344 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8345 Op.getConstantOperandVal(0) ==
8346 Intrinsic::amdgcn_addrspacecast_nonnull);
8347 Src = Op->getOperand(1);
8348 SrcAS = Op->getConstantOperandVal(2);
8349 DestAS = Op->getConstantOperandVal(3);
8350 IsNonNull = true;
8351 }
8352
8353 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8354
8355 // flat -> local/private
8356 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8357 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8358 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8359 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8360
8361 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8362 Subtarget->hasGloballyAddressableScratch()) {
8363 // flat -> private with globally addressable scratch: subtract
8364 // src_flat_scratch_base_lo.
8365 SDValue FlatScratchBaseLo(
8366 DAG.getMachineNode(
8367 AMDGPU::S_MOV_B32, SL, MVT::i32,
8368 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8369 0);
8370 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8371 }
8372
8373 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8374 return Ptr;
8375
8376 unsigned NullVal = TM.getNullPointerValue(DestAS);
8377 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8378 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8379
8380 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8381 SegmentNullPtr);
8382 }
8383 }
8384
8385 // local/private -> flat
8386 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8387 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8388 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8389 SDValue CvtPtr;
8390 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8391 Subtarget->hasGloballyAddressableScratch()) {
8392 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8393 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8394 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8395 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8396 ThreadID = DAG.getNode(
8397 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8398 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8399 AllOnes, ThreadID);
8400 if (Subtarget->isWave64())
8401 ThreadID = DAG.getNode(
8402 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8403 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8404 AllOnes, ThreadID);
8405 SDValue ShAmt = DAG.getShiftAmountConstant(
8406 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8407 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8408 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8409 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8410 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8411 // 64-bit hi:lo value.
8412 SDValue FlatScratchBase = {
8413 DAG.getMachineNode(
8414 AMDGPU::S_MOV_B64, SL, MVT::i64,
8415 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8416 0};
8417 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8418 } else {
8419 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8420 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8421 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8422 }
8423
8424 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8425 return CvtPtr;
8426
8427 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8428 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8429
8430 SDValue NonNull =
8431 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8432
8433 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8434 FlatNullPtr);
8435 }
8436 }
8437
8438 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8439 Op.getValueType() == MVT::i64) {
8440 const SIMachineFunctionInfo *Info =
8441 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8442 if (Info->get32BitAddressHighBits() == 0)
8443 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
8444
8445 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8446 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8447 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8448 }
8449
8450 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8451 Src.getValueType() == MVT::i64)
8452 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8453
8454 // global <-> flat are no-ops and never emitted.
8455
8456 // Invalid casts are poison.
8457 return DAG.getPOISON(Op->getValueType(0));
8458}
8459
8460// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8461// the small vector and inserting them into the big vector. That is better than
8462// the default expansion of doing it via a stack slot. Even though the use of
8463// the stack slot would be optimized away afterwards, the stack slot itself
8464// remains.
8465SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8466 SelectionDAG &DAG) const {
8467 SDValue Vec = Op.getOperand(0);
8468 SDValue Ins = Op.getOperand(1);
8469 SDValue Idx = Op.getOperand(2);
8470 EVT VecVT = Vec.getValueType();
8471 EVT InsVT = Ins.getValueType();
8472 EVT EltVT = VecVT.getVectorElementType();
8473 unsigned InsNumElts = InsVT.getVectorNumElements();
8474 unsigned IdxVal = Idx->getAsZExtVal();
8475 SDLoc SL(Op);
8476
8477 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8478 // Insert 32-bit registers at a time.
8479 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8480
8481 unsigned VecNumElts = VecVT.getVectorNumElements();
8482 EVT NewVecVT =
8483 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8484 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8485 : EVT::getVectorVT(*DAG.getContext(),
8486 MVT::i32, InsNumElts / 2);
8487
8488 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8489 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8490
8491 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8492 SDValue Elt;
8493 if (InsNumElts == 2) {
8494 Elt = Ins;
8495 } else {
8496 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8497 DAG.getConstant(I, SL, MVT::i32));
8498 }
8499 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8500 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8501 }
8502
8503 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8504 }
8505
8506 for (unsigned I = 0; I != InsNumElts; ++I) {
8507 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8508 DAG.getConstant(I, SL, MVT::i32));
8509 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8510 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8511 }
8512 return Vec;
8513}
8514
8515SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8516 SelectionDAG &DAG) const {
8517 SDValue Vec = Op.getOperand(0);
8518 SDValue InsVal = Op.getOperand(1);
8519 SDValue Idx = Op.getOperand(2);
8520 EVT VecVT = Vec.getValueType();
8521 EVT EltVT = VecVT.getVectorElementType();
8522 unsigned VecSize = VecVT.getSizeInBits();
8523 unsigned EltSize = EltVT.getSizeInBits();
8524 SDLoc SL(Op);
8525
8526 // Specially handle the case of v4i16 with static indexing.
8527 unsigned NumElts = VecVT.getVectorNumElements();
8528 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8529 if (NumElts == 4 && EltSize == 16 && KIdx) {
8530 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8531
8532 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8533 DAG.getConstant(0, SL, MVT::i32));
8534 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8535 DAG.getConstant(1, SL, MVT::i32));
8536
8537 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8538 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8539
8540 unsigned Idx = KIdx->getZExtValue();
8541 bool InsertLo = Idx < 2;
8542 SDValue InsHalf = DAG.getNode(
8543 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8544 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8545 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8546
8547 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8548
8549 SDValue Concat =
8550 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8551 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8552
8553 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8554 }
8555
8556 // Static indexing does not lower to stack access, and hence there is no need
8557 // for special custom lowering to avoid stack access.
8558 if (isa<ConstantSDNode>(Idx))
8559 return SDValue();
8560
8561 // Avoid stack access for dynamic indexing by custom lowering to
8562 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8563
8564 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8565
8566 MVT IntVT = MVT::getIntegerVT(VecSize);
8567
8568 // Convert vector index to bit-index and get the required bit mask.
8569 assert(isPowerOf2_32(EltSize));
8570 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8571 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8572 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8573 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8574 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
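// For example, inserting into element 2 of a v4i16 (IntVT = i64): EltMask =
// 0xFFFF, ScaledIdx = 32, so BFM = 0xFFFF << 32 selects exactly the bits of
// element 2 within the 64-bit integer.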
8575
8576 // 1. Create a congruent vector with the target value in each element.
8577 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8578 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8579
8580 // 2. Mask off all other indices except the required index within (1).
8581 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8582
8583 // 3. Mask off the required index within the target vector.
8584 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8585 SDValue RHS =
8586 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8587
8588 // 4. Get (2) and (3) ORed into the target vector.
8589 SDValue BFI =
8590 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8591
8592 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8593}
8594
8595SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8596 SelectionDAG &DAG) const {
8597 SDLoc SL(Op);
8598
8599 EVT ResultVT = Op.getValueType();
8600 SDValue Vec = Op.getOperand(0);
8601 SDValue Idx = Op.getOperand(1);
8602 EVT VecVT = Vec.getValueType();
8603 unsigned VecSize = VecVT.getSizeInBits();
8604 EVT EltVT = VecVT.getVectorElementType();
8605
8606 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8607
8608 // Make sure we do any optimizations that will make it easier to fold
8609 // source modifiers before obscuring it with bit operations.
8610
8611 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8612 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8613 return Combined;
8614
8615 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8616 SDValue Lo, Hi;
8617 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8618
8619 if (VecSize == 128) {
8620 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8621 Lo = DAG.getBitcast(LoVT,
8622 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8623 DAG.getConstant(0, SL, MVT::i32)));
8624 Hi = DAG.getBitcast(HiVT,
8625 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8626 DAG.getConstant(1, SL, MVT::i32)));
8627 } else if (VecSize == 256) {
8628 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8629 SDValue Parts[4];
8630 for (unsigned P = 0; P < 4; ++P) {
8631 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8632 DAG.getConstant(P, SL, MVT::i32));
8633 }
8634
8635 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8636 Parts[0], Parts[1]));
8637 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8638 Parts[2], Parts[3]));
8639 } else {
8640 assert(VecSize == 512);
8641
8642 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8643 SDValue Parts[8];
8644 for (unsigned P = 0; P < 8; ++P) {
8645 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8646 DAG.getConstant(P, SL, MVT::i32));
8647 }
8648
8649 Lo = DAG.getBitcast(LoVT,
8650 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8651 Parts[0], Parts[1], Parts[2], Parts[3]));
8652 Hi = DAG.getBitcast(HiVT,
8653 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8654 Parts[4], Parts[5], Parts[6], Parts[7]));
8655 }
8656
8657 EVT IdxVT = Idx.getValueType();
8658 unsigned NElem = VecVT.getVectorNumElements();
8659 assert(isPowerOf2_32(NElem));
8660 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8661 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
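// For example, for a 256-bit v8i32 and Idx = 5: IdxMask = 3, Idx > IdxMask
// selects the Hi half, and NewIdx = 5 & 3 = 1 indexes within that half.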
8662 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8663 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8664 }
8665
8666 assert(VecSize <= 64);
8667
8668 MVT IntVT = MVT::getIntegerVT(VecSize);
8669
8670 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8671 SDValue VecBC = peekThroughBitcasts(Vec);
8672 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8673 SDValue Src = VecBC.getOperand(0);
8674 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8675 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8676 }
8677
8678 unsigned EltSize = EltVT.getSizeInBits();
8679 assert(isPowerOf2_32(EltSize));
8680
8681 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8682
8683 // Convert vector index to bit-index (* EltSize)
8684 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8685
8686 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8687 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
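// For example, extracting element 3 of a 64-bit v4f16: ScaledIdx = 48, the
// i64 bitcast is shifted right by 48, and the low 16 bits are truncated and
// bitcast back to f16 below.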
8688
8689 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8690 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8691 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8692 }
8693
8694 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8695}
8696
8697static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8698 assert(Elt % 2 == 0);
8699 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8700}
8701
8702static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8703 assert(Elt % 2 == 0);
8704 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8705 !(Mask[Elt + 1] & 1);
8706}
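// For example, for the mask <2,3,6,7> the pair at index 0 is contiguous
// (2 then 3, with an even start), while for <3,2,7,6> it is odd-to-even
// (3 then 2).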
8707
8708SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8709 SelectionDAG &DAG) const {
8710 SDLoc SL(Op);
8711 EVT ResultVT = Op.getValueType();
8712 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8713 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8714 const int NewSrcNumElts = 2;
8715 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8716 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8717
8718 // Break up the shuffle into register-sized pieces.
8719 //
8720 // We're trying to form sub-shuffles that the register allocation pipeline
8721 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8722 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8723 // pair of copies into a consecutive register copy, so use the ordinary
8724 // extract_vector_elt lowering unless we can use the shuffle.
8725 //
8726 // TODO: This is a bit of a hack, and we should probably always use
8727 // extract_subvector for the largest possible subvector we can (or at least
8728 // use it for PackVT aligned pieces). However, we have worse support for
8729 // combines on them and don't directly treat extract_subvector / insert_subvector
8730 // as legal. The DAG scheduler also ends up doing a worse job with the
8731 // extract_subvectors.
8732 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8733
8734 // vector_shuffle <0,1,6,7> lhs, rhs
8735 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8736 //
8737 // vector_shuffle <6,7,2,3> lhs, rhs
8738 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8739 //
8740 // vector_shuffle <6,7,0,1> lhs, rhs
8741 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8742
8743 // Avoid scalarizing when both halves are reading from consecutive elements.
8744
8745 // If we're treating 2 element shuffles as legal, also create odd-to-even
8746 // shuffles of neighboring pairs.
8747 //
8748 // vector_shuffle <3,2,7,6> lhs, rhs
8749 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8750 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8751
8752 SmallVector<SDValue, 16> Pieces;
8753 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8754 if (ShouldUseConsecutiveExtract &&
8755 elementPairIsContiguous(SVN->getMask(), I)) {
8756 const int Idx = SVN->getMaskElt(I);
8757 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8758 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8759 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8760 SVN->getOperand(VecIdx),
8761 DAG.getConstant(EltIdx, SL, MVT::i32));
8762 Pieces.push_back(SubVec);
8763 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8764 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8765 int Idx0 = SVN->getMaskElt(I);
8766 int Idx1 = SVN->getMaskElt(I + 1);
8767
8768 SDValue SrcOp0 = SVN->getOperand(0);
8769 SDValue SrcOp1 = SrcOp0;
8770 if (Idx0 >= SrcNumElts) {
8771 SrcOp0 = SVN->getOperand(1);
8772 Idx0 -= SrcNumElts;
8773 }
8774
8775 if (Idx1 >= SrcNumElts) {
8776 SrcOp1 = SVN->getOperand(1);
8777 Idx1 -= SrcNumElts;
8778 }
8779
8780 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8781 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8782
8783 // Extract nearest even aligned piece.
8784 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8785 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8786 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8787 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8788
8789 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8790 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8791
8792 SDValue Result0 = SubVec0;
8793 SDValue Result1 = SubVec0;
8794
8795 if (SubVec0 != SubVec1) {
8796 NewMaskIdx1 += NewSrcNumElts;
8797 Result1 = SubVec1;
8798 } else {
8799 Result1 = DAG.getPOISON(PackVT);
8800 }
8801
8802 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8803 {NewMaskIdx0, NewMaskIdx1});
8804 Pieces.push_back(Shuf);
8805 } else {
8806 const int Idx0 = SVN->getMaskElt(I);
8807 const int Idx1 = SVN->getMaskElt(I + 1);
8808 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8809 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8810 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8811 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8812
8813 SDValue Vec0 = SVN->getOperand(VecIdx0);
8814 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8815 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8816
8817 SDValue Vec1 = SVN->getOperand(VecIdx1);
8818 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8819 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8820 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8821 }
8822 }
8823
8824 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8825}
8826
8827SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8828 SelectionDAG &DAG) const {
8829 SDValue SVal = Op.getOperand(0);
8830 EVT ResultVT = Op.getValueType();
8831 EVT SValVT = SVal.getValueType();
8832 SDValue UndefVal = DAG.getPOISON(SValVT);
8833 SDLoc SL(Op);
8834
8835 SmallVector<SDValue, 8> VElts;
8836 VElts.push_back(SVal);
8837 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8838 VElts.push_back(UndefVal);
8839
8840 return DAG.getBuildVector(ResultVT, SL, VElts);
8841}
8842
8843SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8844 SelectionDAG &DAG) const {
8845 SDLoc SL(Op);
8846 EVT VT = Op.getValueType();
8847
8848 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8849 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8850
8851 SDValue Lo = Op.getOperand(0);
8852 SDValue Hi = Op.getOperand(1);
8853
8854 // Avoid adding defined bits with the zero_extend.
8855 if (Hi.isUndef()) {
8856 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8857 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8858 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8859 }
8860
8861 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8862 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8863
8864 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8865 DAG.getConstant(16, SL, MVT::i32));
8866 if (Lo.isUndef())
8867 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8868
8869 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8870 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8871
8872 SDValue Or =
8873 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
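// The packed result is (zext(Hi) << 16) | zext(Lo); e.g. Lo = 0x1234 and
// Hi = 0xABCD produce the i32 0xABCD1234, bitcast back to the 2 x 16-bit
// result type.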
8874 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8875 }
8876
8877 // Split into 2-element chunks.
8878 const unsigned NumParts = VT.getVectorNumElements() / 2;
8879 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8880 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8881
8882 SmallVector<SDValue, 16> Casts;
8883 for (unsigned P = 0; P < NumParts; ++P) {
8884 SDValue Vec = DAG.getBuildVector(
8885 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8886 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8887 }
8888
8889 SDValue Blend =
8890 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8891 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8892}
8893
8894 bool SITargetLowering::isOffsetFoldingLegal(
8895 const GlobalAddressSDNode *GA) const {
8896 // OSes that use ELF REL relocations (instead of RELA) can only store a
8897 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8898 // which can create arbitrary 64-bit addends. (This is only a problem for
8899 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8900 // the high 32 bits of the addend.)
8901 //
8902 // This should be kept in sync with how HasRelocationAddend is initialized in
8903 // the constructor of ELFAMDGPUAsmBackend.
8904 if (!Subtarget->isAmdHsaOS())
8905 return false;
8906
8907 // We can fold offsets for anything that doesn't require a GOT relocation.
8908 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8909 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8910 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8911 !shouldEmitGOTReloc(GA->getGlobal());
8912}
8913
8914static SDValue
8915 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8916 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8917 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8918 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8919 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8920 // lowered to the following code sequence:
8921 //
8922 // For constant address space:
8923 // s_getpc_b64 s[0:1]
8924 // s_add_u32 s0, s0, $symbol
8925 // s_addc_u32 s1, s1, 0
8926 //
8927 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8928 // a fixup or relocation is emitted to replace $symbol with a literal
8929 // constant, which is a pc-relative offset from the encoding of the $symbol
8930 // operand to the global variable.
8931 //
8932 // For global address space:
8933 // s_getpc_b64 s[0:1]
8934 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8935 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8936 //
8937 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8938 // fixups or relocations are emitted to replace $symbol@*@lo and
8939 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8940 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8941 // operand to the global variable.
8942 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8943 assert(GAFlags != SIInstrInfo::MO_NONE);
8944
8945 SDValue Ptr =
8946 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8947 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8948 }
8949
8950 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8951 SDValue PtrHi;
8952 if (GAFlags == SIInstrInfo::MO_NONE)
8953 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8954 else
8955 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8956 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8957}
8958
8959SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8960 SDValue Op,
8961 SelectionDAG &DAG) const {
8962 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8963 SDLoc DL(GSD);
8964 EVT PtrVT = Op.getValueType();
8965
8966 const GlobalValue *GV = GSD->getGlobal();
8967 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8968 shouldUseLDSConstAddress(GV)) ||
8969 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8970 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8971 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8972 GV->hasExternalLinkage()) {
8973 Type *Ty = GV->getValueType();
8974 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8975 // zero-sized type in other languages to declare the dynamic shared
8976 // memory whose size is not known at compile time. They will be
8977 // allocated by the runtime and placed directly after the statically
8978 // allocated ones. They all share the same offset.
8979 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8980 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8981 // Adjust alignment for that dynamic shared memory array.
8982 Function &F = DAG.getMachineFunction().getFunction();
8983 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8984 MFI->setUsesDynamicLDS(true);
8985 return SDValue(
8986 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8987 }
8988 }
8989 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8990 }
8991
8992 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8993 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8994 SIInstrInfo::MO_ABS32_LO);
8995 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8996 }
8997
8998 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8999 if (Subtarget->has64BitLiterals()) {
9000 SDValue Addr = DAG.getTargetGlobalAddress(
9001 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9002 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9003 0);
9004 }
9005
9006 SDValue AddrLo = DAG.getTargetGlobalAddress(
9007 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9008 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9009
9010 SDValue AddrHi = DAG.getTargetGlobalAddress(
9011 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9012 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9013
9014 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9015 }
9016
9017 if (shouldEmitFixup(GV))
9018 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9019
9020 if (shouldEmitPCReloc(GV))
9021 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9022 SIInstrInfo::MO_REL32);
9023
9024 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9025 SIInstrInfo::MO_GOTPCREL32);
9026 PointerType *PtrTy =
9027 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
9028 const DataLayout &DataLayout = DAG.getDataLayout();
9029 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9030 MachinePointerInfo PtrInfo =
9031 MachinePointerInfo::getGOT(DAG.getMachineFunction());
9032
9033 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9034 MachineMemOperand::MODereferenceable |
9035 MachineMemOperand::MOInvariant);
9036}
9037
9038 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9039 const SDLoc &DL, SDValue V) const {
9040 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9041 // the destination register.
9042 //
9043 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9044 // so we will end up with redundant moves to m0.
9045 //
9046 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9047
9048 // A Null SDValue creates a glue result.
9049 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9050 V, Chain);
9051 return SDValue(M0, 0);
9052}
9053
9054SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9055 MVT VT,
9056 unsigned Offset) const {
9057 SDLoc SL(Op);
9058 SDValue Param = lowerKernargMemParameter(
9059 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9060 // The local size values will have the hi 16-bits as zero.
9061 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9062 DAG.getValueType(VT));
9063}
9064
9065 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9066 EVT VT) {
9067 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9068 DAG.getMachineFunction().getFunction(),
9069 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9070 return DAG.getPOISON(VT);
9071}
9072
9073 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9074 EVT VT) {
9075 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9076 DAG.getMachineFunction().getFunction(),
9077 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9078 return DAG.getPOISON(VT);
9079}
9080
9081 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
9082 ArrayRef<SDValue> Elts) {
9083 assert(!Elts.empty());
9084 MVT Type;
9085 unsigned NumElts = Elts.size();
9086
9087 if (NumElts <= 12) {
9088 Type = MVT::getVectorVT(MVT::f32, NumElts);
9089 } else {
9090 assert(Elts.size() <= 16);
9091 Type = MVT::v16f32;
9092 NumElts = 16;
9093 }
9094
9095 SmallVector<SDValue, 16> VecElts(NumElts);
9096 for (unsigned i = 0; i < Elts.size(); ++i) {
9097 SDValue Elt = Elts[i];
9098 if (Elt.getValueType() != MVT::f32)
9099 Elt = DAG.getBitcast(MVT::f32, Elt);
9100 VecElts[i] = Elt;
9101 }
9102 for (unsigned i = Elts.size(); i < NumElts; ++i)
9103 VecElts[i] = DAG.getPOISON(MVT::f32);
9104
9105 if (NumElts == 1)
9106 return VecElts[0];
9107 return DAG.getBuildVector(Type, DL, VecElts);
9108}
9109
9110static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9111 SDValue Src, int ExtraElts) {
9112 EVT SrcVT = Src.getValueType();
9113
9114 SmallVector<SDValue, 8> Elts;
9115
9116 if (SrcVT.isVector())
9117 DAG.ExtractVectorElements(Src, Elts);
9118 else
9119 Elts.push_back(Src);
9120
9121 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9122 while (ExtraElts--)
9123 Elts.push_back(Undef);
9124
9125 return DAG.getBuildVector(CastVT, DL, Elts);
9126}
9127
9128 // Re-construct the required return value for an image load intrinsic.
9129 // This is more complicated due to the optional use of TexFailCtrl, which means
9130 // the required return type is an aggregate.
9131 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9132 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9133 bool Unpacked, bool IsD16, int DMaskPop,
9134 int NumVDataDwords, bool IsAtomicPacked16Bit,
9135 const SDLoc &DL) {
9136 // Determine the required return type. This is the same regardless of
9137 // IsTexFail flag
9138 EVT ReqRetVT = ResultTypes[0];
9139 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9140 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9141 ? (ReqRetNumElts + 1) / 2
9142 : ReqRetNumElts;
9143
9144 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9145
9146 MVT DataDwordVT =
9147 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9148
9149 MVT MaskPopVT =
9150 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9151
9152 SDValue Data(Result, 0);
9153 SDValue TexFail;
9154
9155 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9156 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9157 if (MaskPopVT.isVector()) {
9158 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9159 SDValue(Result, 0), ZeroIdx);
9160 } else {
9161 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9162 SDValue(Result, 0), ZeroIdx);
9163 }
9164 }
9165
9166 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9167 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9168 NumDataDwords - MaskPopDwords);
9169
9170 if (IsD16)
9171 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9172
9173 EVT LegalReqRetVT = ReqRetVT;
9174 if (!ReqRetVT.isVector()) {
9175 if (!Data.getValueType().isInteger())
9176 Data = DAG.getNode(ISD::BITCAST, DL,
9177 Data.getValueType().changeTypeToInteger(), Data);
9178 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9179 } else {
9180 // We need to widen the return vector to a legal type
9181 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9182 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9183 LegalReqRetVT =
9184 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9185 ReqRetVT.getVectorNumElements() + 1);
9186 }
9187 }
9188 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9189
9190 if (IsTexFail) {
9191 TexFail =
9192 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9193 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9194
9195 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9196 }
9197
9198 if (Result->getNumValues() == 1)
9199 return Data;
9200
9201 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9202}
9203
9204static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9205 SDValue *LWE, bool &IsTexFail) {
9206 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9207
9208 uint64_t Value = TexFailCtrlConst->getZExtValue();
9209 if (Value) {
9210 IsTexFail = true;
9211 }
9212
9213 SDLoc DL(TexFailCtrlConst);
9214 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9215 Value &= ~(uint64_t)0x1;
9216 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9217 Value &= ~(uint64_t)0x2;
9218
9219 return Value == 0;
9220}
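// For example, TexFailCtrl = 3 sets both TFE (bit 0) and LWE (bit 1) and the
// function returns true; any bit above bit 1 makes it return false, so the
// caller rejects the intrinsic.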
9221
9222 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9223 MVT PackVectorVT,
9224 SmallVectorImpl<SDValue> &PackedAddrs,
9225 unsigned DimIdx, unsigned EndIdx,
9226 unsigned NumGradients) {
9227 SDLoc DL(Op);
9228 for (unsigned I = DimIdx; I < EndIdx; I++) {
9229 SDValue Addr = Op.getOperand(I);
9230
9231 // Gradients are packed with undef for each coordinate.
9232 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9233 // 1D: undef,dx/dh; undef,dx/dv
9234 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9235 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9236 if (((I + 1) >= EndIdx) ||
9237 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9238 I == DimIdx + NumGradients - 1))) {
9239 if (Addr.getValueType() != MVT::i16)
9240 Addr = DAG.getBitcast(MVT::i16, Addr);
9241 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9242 } else {
9243 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9244 I++;
9245 }
9246 Addr = DAG.getBitcast(MVT::f32, Addr);
9247 PackedAddrs.push_back(Addr);
9248 }
9249}
9250
9251SDValue SITargetLowering::lowerImage(SDValue Op,
9252 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9253 SelectionDAG &DAG, bool WithChain) const {
9254 SDLoc DL(Op);
9255 MachineFunction &MF = DAG.getMachineFunction();
9256 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9257 unsigned IntrOpcode = Intr->BaseOpcode;
9258 // For image atomic: use no-return opcode if result is unused.
9259 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9260 !Op.getNode()->hasAnyUseOfValue(0))
9261 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9263 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
9264 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9265 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9266 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9267 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9268
9269 SmallVector<EVT, 3> ResultTypes(Op->values());
9270 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9271 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9272 ResultTypes.erase(&ResultTypes[0]);
9273
9274 bool IsD16 = false;
9275 bool IsG16 = false;
9276 bool IsA16 = false;
9277 SDValue VData;
9278 int NumVDataDwords = 0;
9279 bool AdjustRetType = false;
9280 bool IsAtomicPacked16Bit = false;
9281
9282 // Offset of intrinsic arguments
9283 const unsigned ArgOffset = WithChain ? 2 : 1;
9284
9285 unsigned DMask;
9286 unsigned DMaskLanes = 0;
9287
9288 if (BaseOpcode->Atomic) {
9289 VData = Op.getOperand(2);
9290
9291 IsAtomicPacked16Bit =
9292 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9293 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9294 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9295 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9296
9297 bool Is64Bit = VData.getValueSizeInBits() == 64;
9298 if (BaseOpcode->AtomicX2) {
9299 SDValue VData2 = Op.getOperand(3);
9300 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9301 {VData, VData2});
9302 if (Is64Bit)
9303 VData = DAG.getBitcast(MVT::v4i32, VData);
9304
9305 if (!BaseOpcode->NoReturn)
9306 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9307
9308 DMask = Is64Bit ? 0xf : 0x3;
9309 NumVDataDwords = Is64Bit ? 4 : 2;
9310 } else {
9311 DMask = Is64Bit ? 0x3 : 0x1;
9312 NumVDataDwords = Is64Bit ? 2 : 1;
9313 }
9314 } else {
9315 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9316 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9317
9318 if (BaseOpcode->Store) {
9319 VData = Op.getOperand(2);
9320
9321 MVT StoreVT = VData.getSimpleValueType();
9322 if (StoreVT.getScalarType() == MVT::f16) {
9323 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9324 return Op; // D16 is unsupported for this instruction
9325
9326 IsD16 = true;
9327 VData = handleD16VData(VData, DAG, true);
9328 }
9329
9330 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9331 } else if (!BaseOpcode->NoReturn) {
9332 // Work out the num dwords based on the dmask popcount and underlying type
9333 // and whether packing is supported.
9334 MVT LoadVT = ResultTypes[0].getSimpleVT();
9335 if (LoadVT.getScalarType() == MVT::f16) {
9336 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9337 return Op; // D16 is unsupported for this instruction
9338
9339 IsD16 = true;
9340 }
9341
9342 // Confirm that the return type is large enough for the dmask specified
9343 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9344 (!LoadVT.isVector() && DMaskLanes > 1))
9345 return Op;
9346
9347 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9348 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9349 // instructions.
9350 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9351 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9352 NumVDataDwords = (DMaskLanes + 1) / 2;
9353 else
9354 NumVDataDwords = DMaskLanes;
9355
9356 AdjustRetType = true;
9357 }
9358 }
9359
9360 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9361 SmallVector<SDValue, 16> VAddrs;
9362
9363 // Check for 16 bit addresses or derivatives and pack if true.
9364 MVT VAddrVT =
9365 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9366 MVT VAddrScalarVT = VAddrVT.getScalarType();
9367 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9368 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9369
9370 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9371 VAddrScalarVT = VAddrVT.getScalarType();
9372 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9373 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9374
9375 // Push back extra arguments.
9376 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9377 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9378 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9379 // Special handling of bias when A16 is on. Bias is of type half but
9380 // occupies full 32-bit.
9381 SDValue Bias = DAG.getBuildVector(
9382 MVT::v2f16, DL,
9383 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9384 VAddrs.push_back(Bias);
9385 } else {
9386 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9387 "Bias needs to be converted to 16 bit in A16 mode");
9388 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9389 }
9390 }
9391
9392 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9393 // 16 bit gradients are supported, but are tied to the A16 control
9394 // so both gradients and addresses must be 16 bit
9395 LLVM_DEBUG(
9396 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9397 "require 16 bit args for both gradients and addresses");
9398 return Op;
9399 }
9400
9401 if (IsA16) {
9402 if (!ST->hasA16()) {
9403 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9404 "support 16 bit addresses\n");
9405 return Op;
9406 }
9407 }
9408
9409 // We've dealt with incorrect input so we know that if IsA16, IsG16
9410 // are set then we have to compress/pack operands (either address,
9411 // gradient or both)
9412 // In the case where a16 and gradients are tied (no G16 support) then we
9413 // have already verified that both IsA16 and IsG16 are true
9414 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9415 // Activate g16
9416 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9417 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9418 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9419 }
9420
9421 // Add gradients (packed or unpacked)
9422 if (IsG16) {
9423 // Pack the gradients
9424 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9425 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9426 ArgOffset + Intr->GradientStart,
9427 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9428 } else {
9429 for (unsigned I = ArgOffset + Intr->GradientStart;
9430 I < ArgOffset + Intr->CoordStart; I++)
9431 VAddrs.push_back(Op.getOperand(I));
9432 }
9433
9434 // Add addresses (packed or unpacked)
9435 if (IsA16) {
9436 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9437 ArgOffset + Intr->CoordStart, VAddrEnd,
9438 0 /* No gradients */);
9439 } else {
9440 // Add uncompressed address
9441 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9442 VAddrs.push_back(Op.getOperand(I));
9443 }
9444
9445 // If the register allocator cannot place the address registers contiguously
9446 // without introducing moves, then using the non-sequential address encoding
9447 // is always preferable, since it saves VALU instructions and is usually a
9448 // wash in terms of code size or even better.
9449 //
9450 // However, we currently have no way of hinting to the register allocator that
9451 // MIMG addresses should be placed contiguously when it is possible to do so,
9452 // so force non-NSA for the common 2-address case as a heuristic.
9453 //
9454 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9455 // allocation when possible.
9456 //
9457 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9458 // set of the remaining addresses.
9459 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9460 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9461 const bool UseNSA = ST->hasNSAEncoding() &&
9462 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9463 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9464 const bool UsePartialNSA =
9465 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9466
9467 SDValue VAddr;
9468 if (UsePartialNSA) {
9469 VAddr = getBuildDwordsVector(DAG, DL,
9470 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9471 } else if (!UseNSA) {
9472 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9473 }
9474
9475 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9476 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9477 SDValue Unorm;
9478 if (!BaseOpcode->Sampler) {
9479 Unorm = True;
9480 } else {
9481 uint64_t UnormConst =
9482 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9483
9484 Unorm = UnormConst ? True : False;
9485 }
9486
9487 SDValue TFE;
9488 SDValue LWE;
9489 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9490 bool IsTexFail = false;
9491 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9492 return Op;
9493
9494 if (IsTexFail) {
9495 if (!DMaskLanes) {
9496 // Expecting to get an error flag since TFC is on - and dmask is 0
9497 // Force dmask to be at least 1 otherwise the instruction will fail
9498 DMask = 0x1;
9499 DMaskLanes = 1;
9500 NumVDataDwords = 1;
9501 }
9502 NumVDataDwords += 1;
9503 AdjustRetType = true;
9504 }
9505
9506 // Has something earlier tagged that the return type needs adjusting
9507 // This happens if the instruction is a load or has set TexFailCtrl flags
9508 if (AdjustRetType) {
9509 // NumVDataDwords reflects the true number of dwords required in the return
9510 // type
9511 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9512 // This is a no-op load. This can be eliminated
9513 SDValue Undef = DAG.getPOISON(Op.getValueType());
9514 if (isa<MemSDNode>(Op))
9515 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9516 return Undef;
9517 }
9518
9519 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9520 MVT::i32, NumVDataDwords)
9521 : MVT::i32;
9522
9523 ResultTypes[0] = NewVT;
9524 if (ResultTypes.size() == 3) {
9525 // Original result was aggregate type used for TexFailCtrl results
9526 // The actual instruction returns as a vector type which has now been
9527 // created. Remove the aggregate result.
9528 ResultTypes.erase(&ResultTypes[1]);
9529 }
9530 }
9531
9532 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9533 // Keep GLC only when the atomic's result is actually used.
9534 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9535 CPol |= AMDGPU::CPol::GLC;
9536 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9537 AMDGPU::CPol::VOLATILE))
9538 return Op;
9539
9540 SmallVector<SDValue, 26> Ops;
9541 if (BaseOpcode->Store || BaseOpcode->Atomic)
9542 Ops.push_back(VData); // vdata
9543 if (UsePartialNSA) {
9544 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9545 Ops.push_back(VAddr);
9546 } else if (UseNSA)
9547 append_range(Ops, VAddrs);
9548 else
9549 Ops.push_back(VAddr);
9550 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9551 EVT RsrcVT = Rsrc.getValueType();
9552 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9553 return Op;
9554 Ops.push_back(Rsrc);
9555 if (BaseOpcode->Sampler) {
9556 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9557 if (Samp.getValueType() != MVT::v4i32)
9558 return Op;
9559 Ops.push_back(Samp);
9560 }
9561 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9562 if (IsGFX10Plus)
9563 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9564 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9565 Ops.push_back(Unorm);
9566 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9567 Ops.push_back(IsA16 && // r128, a16 for gfx9
9568 ST->hasFeature(AMDGPU::FeatureR128A16)
9569 ? True
9570 : False);
9571 if (IsGFX10Plus)
9572 Ops.push_back(IsA16 ? True : False);
9573
9574 if (!Subtarget->hasGFX90AInsts())
9575 Ops.push_back(TFE); // tfe
9576 else if (TFE->getAsZExtVal()) {
9577 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9579 "TFE is not supported on this GPU", DL.getDebugLoc()));
9580 }
9581
9582 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9583 Ops.push_back(LWE); // lwe
9584 if (!IsGFX10Plus)
9585 Ops.push_back(DimInfo->DA ? True : False);
9586 if (BaseOpcode->HasD16)
9587 Ops.push_back(IsD16 ? True : False);
9588 if (isa<MemSDNode>(Op))
9589 Ops.push_back(Op.getOperand(0)); // chain
9590
9591 int NumVAddrDwords =
9592 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9593 int Opcode = -1;
9594
9595 if (IsGFX12Plus) {
9596 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9597 NumVDataDwords, NumVAddrDwords);
9598 } else if (IsGFX11Plus) {
9599 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9600 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9601 : AMDGPU::MIMGEncGfx11Default,
9602 NumVDataDwords, NumVAddrDwords);
9603 } else if (IsGFX10Plus) {
9604 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9605 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9606 : AMDGPU::MIMGEncGfx10Default,
9607 NumVDataDwords, NumVAddrDwords);
9608 } else {
9609 if (Subtarget->hasGFX90AInsts()) {
9610 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9611 NumVDataDwords, NumVAddrDwords);
9612 if (Opcode == -1) {
9613 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9615 "requested image instruction is not supported on this GPU",
9616 DL.getDebugLoc()));
9617
9618 unsigned Idx = 0;
9619 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9620 for (EVT VT : OrigResultTypes) {
9621 if (VT == MVT::Other)
9622 RetValues[Idx++] = Op.getOperand(0); // Chain
9623 else
9624 RetValues[Idx++] = DAG.getPOISON(VT);
9625 }
9626
9627 return DAG.getMergeValues(RetValues, DL);
9628 }
9629 }
9630 if (Opcode == -1 &&
9631 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9632 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9633 NumVDataDwords, NumVAddrDwords);
9634 if (Opcode == -1)
9635 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9636 NumVDataDwords, NumVAddrDwords);
9637 }
9638 if (Opcode == -1)
9639 return Op;
9640
9641 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9642 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9643 MachineMemOperand *MemRef = MemOp->getMemOperand();
9644 DAG.setNodeMemRefs(NewNode, {MemRef});
9645 }
9646
9647 if (BaseOpcode->NoReturn) {
9648 if (BaseOpcode->Atomic)
9649 return DAG.getMergeValues(
9650 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9651
9652 return SDValue(NewNode, 0);
9653 }
9654
9655 if (BaseOpcode->AtomicX2) {
9656 SmallVector<SDValue, 1> Elt;
9657 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9658 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9659 }
9660
9661 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9662 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9663 NumVDataDwords, IsAtomicPacked16Bit, DL);
9664}
9665
9666SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9667 SDValue Offset, SDValue CachePolicy,
9668 SelectionDAG &DAG) const {
9669 MachineFunction &MF = DAG.getMachineFunction();
9670
9671 const DataLayout &DataLayout = DAG.getDataLayout();
9672 Align Alignment =
9673 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9674
9675 MachineMemOperand *MMO = MF.getMachineMemOperand(
9676 MachinePointerInfo(),
9677 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9678 MachineMemOperand::MOInvariant,
9679 VT.getStoreSize(), Alignment);
9680
9681 if (!Offset->isDivergent()) {
9682 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9683
9684 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9685 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9686 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9687 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9688 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9689 SDValue BufferLoad =
9690 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9691 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9692 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9693 }
9694
9695 // Widen vec3 load to vec4.
9696 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9697 !Subtarget->hasScalarDwordx3Loads()) {
9698 EVT WidenedVT =
9699 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9700 auto WidenedOp = DAG.getMemIntrinsicNode(
9701 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9702 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9703 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9704 DAG.getVectorIdxConstant(0, DL));
9705 return Subvector;
9706 }
9707
9708 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9709 DAG.getVTList(VT), Ops, VT, MMO);
9710 }
9711
9712 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9713 // assume that the buffer is unswizzled.
9714 SDValue Ops[] = {
9715 DAG.getEntryNode(), // Chain
9716 Rsrc, // rsrc
9717 DAG.getConstant(0, DL, MVT::i32), // vindex
9718 {}, // voffset
9719 {}, // soffset
9720 {}, // offset
9721 CachePolicy, // cachepolicy
9722 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9723 };
9724 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9725 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9726 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9727 }
9728
9729 SmallVector<SDValue, 4> Loads;
9730 unsigned NumLoads = 1;
9731 MVT LoadVT = VT.getSimpleVT();
9732 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9733 assert((LoadVT.getScalarType() == MVT::i32 ||
9734 LoadVT.getScalarType() == MVT::f32));
9735
9736 if (NumElts == 8 || NumElts == 16) {
9737 NumLoads = NumElts / 4;
9738 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9739 }
9740
9741 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9742
9743 // Use the alignment to ensure that the required offsets will fit into the
9744 // immediate offsets.
9745 setBufferOffsets(Offset, DAG, &Ops[3],
9746 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9747
9748 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9749 for (unsigned i = 0; i < NumLoads; ++i) {
9750 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9751 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9752 LoadVT, MMO, DAG));
9753 }
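// For example, a v8f32 s_buffer_load with a divergent offset becomes two
// v4f32 BUFFER_LOADs at consecutive 16-byte immediate offsets, concatenated
// back together below.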
9754
9755 if (NumElts == 8 || NumElts == 16)
9756 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9757
9758 return Loads[0];
9759}
9760
9761SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9762 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9763 if (!Subtarget->hasArchitectedSGPRs())
9764 return {};
9765 SDLoc SL(Op);
9766 MVT VT = MVT::i32;
9767 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9768 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9769 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9770}
9771
9772SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9773 AMDGPU::Hwreg::Id HwReg,
9774 unsigned LowBit,
9775 unsigned Width) const {
9776 SDLoc SL(Op);
9777 using namespace AMDGPU::Hwreg;
9778 return {DAG.getMachineNode(
9779 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9780 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9781 SL, MVT::i32)),
9782 0};
9783}
9784
9785SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9786 unsigned Dim,
9787 const ArgDescriptor &Arg) const {
9788 SDLoc SL(Op);
9789 MachineFunction &MF = DAG.getMachineFunction();
9790 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9791 if (MaxID == 0)
9792 return DAG.getConstant(0, SL, MVT::i32);
9793
9794 // It's undefined behavior if a function marked with the amdgpu-no-*
9795 // attributes uses the corresponding intrinsic.
9796 if (!Arg)
9797 return DAG.getPOISON(Op->getValueType(0));
9798
9799 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9800 SDLoc(DAG.getEntryNode()), Arg);
9801
9802 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9803 // masking operations anyway.
9804 //
9805 // TODO: We could assert the top bit is 0 for the source copy.
9806 if (Arg.isMasked())
9807 return Val;
9808
9809 // Preserve the known bits after expansion to a copy.
9810 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9811 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9812 DAG.getValueType(SmallVT));
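// For example, with a maximum workitem id of 1023 the copy is asserted to
// have only its low 10 bits set (SmallVT = i10).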
9813}
9814
9815SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9816 SelectionDAG &DAG) const {
9817 MachineFunction &MF = DAG.getMachineFunction();
9818 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9819
9820 EVT VT = Op.getValueType();
9821 SDLoc DL(Op);
9822 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9823
9824 // TODO: Should this propagate fast-math-flags?
9825
9826 switch (IntrinsicID) {
9827 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9828 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9829 return emitNonHSAIntrinsicError(DAG, DL, VT);
9830 return getPreloadedValue(DAG, *MFI, VT,
9831 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9832 }
9833 case Intrinsic::amdgcn_dispatch_ptr:
9834 case Intrinsic::amdgcn_queue_ptr: {
9835 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9836 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9837 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9838 DL.getDebugLoc()));
9839 return DAG.getPOISON(VT);
9840 }
9841
9842 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9843 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9844 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9845 return getPreloadedValue(DAG, *MFI, VT, RegID);
9846 }
9847 case Intrinsic::amdgcn_implicitarg_ptr: {
9848 if (MFI->isEntryFunction())
9849 return getImplicitArgPtr(DAG, DL);
9850 return getPreloadedValue(DAG, *MFI, VT,
9851 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9852 }
9853 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9854 if (!AMDGPU::isKernel(MF.getFunction())) {
9855 // This only makes sense to call in a kernel, so just lower to null.
9856 return DAG.getConstant(0, DL, VT);
9857 }
9858
9859 return getPreloadedValue(DAG, *MFI, VT,
9860 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9861 }
9862 case Intrinsic::amdgcn_dispatch_id: {
9863 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9864 }
9865 case Intrinsic::amdgcn_rcp:
9866 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9867 case Intrinsic::amdgcn_rsq:
9868 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9869 case Intrinsic::amdgcn_rsq_legacy:
9870 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9871 return emitRemovedIntrinsicError(DAG, DL, VT);
9872 return SDValue();
9873 case Intrinsic::amdgcn_rcp_legacy:
9874 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9875 return emitRemovedIntrinsicError(DAG, DL, VT);
9876 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9877 case Intrinsic::amdgcn_rsq_clamp: {
9878 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9879 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9880
9881 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9882 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9883 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9884
9885 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9886 SDValue Tmp =
9887 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9888 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9889 DAG.getConstantFP(Min, DL, VT));
9890 }
9891 case Intrinsic::r600_read_ngroups_x:
9892 if (Subtarget->isAmdHsaOS())
9893 return emitNonHSAIntrinsicError(DAG, DL, VT);
9894
9895 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9896 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9897 false);
9898 case Intrinsic::r600_read_ngroups_y:
9899 if (Subtarget->isAmdHsaOS())
9900 return emitNonHSAIntrinsicError(DAG, DL, VT);
9901
9902 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9903 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9904 false);
9905 case Intrinsic::r600_read_ngroups_z:
9906 if (Subtarget->isAmdHsaOS())
9907 return emitNonHSAIntrinsicError(DAG, DL, VT);
9908
9909 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9910 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9911 false);
9912 case Intrinsic::r600_read_local_size_x:
9913 if (Subtarget->isAmdHsaOS())
9914 return emitNonHSAIntrinsicError(DAG, DL, VT);
9915
9916 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9917 SI::KernelInputOffsets::LOCAL_SIZE_X);
9918 case Intrinsic::r600_read_local_size_y:
9919 if (Subtarget->isAmdHsaOS())
9920 return emitNonHSAIntrinsicError(DAG, DL, VT);
9921
9922 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9923 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9924 case Intrinsic::r600_read_local_size_z:
9925 if (Subtarget->isAmdHsaOS())
9926 return emitNonHSAIntrinsicError(DAG, DL, VT);
9927
9928 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9929 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9930 case Intrinsic::amdgcn_workgroup_id_x:
9931 return lowerWorkGroupId(DAG, *MFI, VT,
9935 case Intrinsic::amdgcn_workgroup_id_y:
9936 return lowerWorkGroupId(DAG, *MFI, VT,
9940 case Intrinsic::amdgcn_workgroup_id_z:
9941 return lowerWorkGroupId(DAG, *MFI, VT,
9945 case Intrinsic::amdgcn_cluster_id_x:
9946 return Subtarget->hasClusters()
9947 ? getPreloadedValue(DAG, *MFI, VT,
9949 : DAG.getPOISON(VT);
9950 case Intrinsic::amdgcn_cluster_id_y:
9951 return Subtarget->hasClusters()
9952 ? getPreloadedValue(DAG, *MFI, VT,
9954 : DAG.getPOISON(VT);
9955 case Intrinsic::amdgcn_cluster_id_z:
9956 return Subtarget->hasClusters()
9957 ? getPreloadedValue(DAG, *MFI, VT,
9959 : DAG.getPOISON(VT);
9960 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9961 return Subtarget->hasClusters()
9962 ? getPreloadedValue(
9963 DAG, *MFI, VT,
9965 : DAG.getPOISON(VT);
9966 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9967 return Subtarget->hasClusters()
9968 ? getPreloadedValue(
9969 DAG, *MFI, VT,
9971 : DAG.getPOISON(VT);
9972 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9973 return Subtarget->hasClusters()
9974 ? getPreloadedValue(
9975 DAG, *MFI, VT,
9977 : DAG.getPOISON(VT);
9978 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9979 return Subtarget->hasClusters()
9980 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9981 : SDValue();
9982 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9983 return Subtarget->hasClusters()
9984 ? getPreloadedValue(
9985 DAG, *MFI, VT,
9987 : DAG.getPOISON(VT);
9988 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9989 return Subtarget->hasClusters()
9990 ? getPreloadedValue(
9991 DAG, *MFI, VT,
9992 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
9993 : DAG.getPOISON(VT);
9994 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9995 return Subtarget->hasClusters()
9996 ? getPreloadedValue(
9997 DAG, *MFI, VT,
9998 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
9999 : DAG.getPOISON(VT);
10000 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10001 return Subtarget->hasClusters()
10002 ? getPreloadedValue(
10003 DAG, *MFI, VT,
10004 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
10005 : DAG.getPOISON(VT);
10006 case Intrinsic::amdgcn_wave_id:
10007 return lowerWaveID(DAG, Op);
10008 case Intrinsic::amdgcn_lds_kernel_id: {
10009 if (MFI->isEntryFunction())
10010 return getLDSKernelId(DAG, DL);
10011 return getPreloadedValue(DAG, *MFI, VT,
10012 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10013 }
10014 case Intrinsic::amdgcn_workitem_id_x:
10015 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10016 case Intrinsic::amdgcn_workitem_id_y:
10017 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10018 case Intrinsic::amdgcn_workitem_id_z:
10019 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10020 case Intrinsic::amdgcn_wavefrontsize:
10021 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10022 SDLoc(Op), MVT::i32);
10023 case Intrinsic::amdgcn_s_buffer_load: {
10024 unsigned CPol = Op.getConstantOperandVal(3);
10025 // s_buffer_load, because of how it's optimized, can't be volatile
10026 // so reject ones with the volatile bit set.
10027 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10028 ? AMDGPU::CPol::ALL
10029 : AMDGPU::CPol::ALL_pregfx12))
10030 return Op;
10031 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10032 Op.getOperand(3), DAG);
10033 }
10034 case Intrinsic::amdgcn_fdiv_fast:
10035 return lowerFDIV_FAST(Op, DAG);
10036 case Intrinsic::amdgcn_sin:
10037 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10038
10039 case Intrinsic::amdgcn_cos:
10040 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10041
10042 case Intrinsic::amdgcn_mul_u24:
10043 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10044 Op.getOperand(2));
10045 case Intrinsic::amdgcn_mul_i24:
10046 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10047 Op.getOperand(2));
10048
10049 case Intrinsic::amdgcn_log_clamp: {
10050 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10051 return SDValue();
10052
10053 return emitRemovedIntrinsicError(DAG, DL, VT);
10054 }
10055 case Intrinsic::amdgcn_fract:
10056 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10057
10058 case Intrinsic::amdgcn_class:
10059 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10060 Op.getOperand(2));
10061 case Intrinsic::amdgcn_div_fmas:
10062 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10063 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10064
10065 case Intrinsic::amdgcn_div_fixup:
10066 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10067 Op.getOperand(2), Op.getOperand(3));
10068
10069 case Intrinsic::amdgcn_div_scale: {
10070 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10071
10072 // Translate to the operands expected by the machine instruction. The
10073 // first parameter must be the same as the first instruction.
10074 SDValue Numerator = Op.getOperand(1);
10075 SDValue Denominator = Op.getOperand(2);
10076
10077 // Note this order is opposite of the machine instruction's operations,
10078 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10079 // intrinsic has the numerator as the first operand to match a normal
10080 // division operation.
10081
10082 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10083
10084 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10085 Denominator, Numerator);
10086 }
10087 case Intrinsic::amdgcn_icmp: {
10088 // There is a Pat that handles this variant, so return it as-is.
10089 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10090 Op.getConstantOperandVal(2) == 0 &&
10091 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10092 return Op;
10093 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10094 }
10095 case Intrinsic::amdgcn_fcmp: {
10096 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10097 }
10098 case Intrinsic::amdgcn_ballot:
10099 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10100 case Intrinsic::amdgcn_fmed3:
10101 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10102 Op.getOperand(2), Op.getOperand(3));
10103 case Intrinsic::amdgcn_fdot2:
10104 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10105 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10106 case Intrinsic::amdgcn_fmul_legacy:
10107 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10108 Op.getOperand(2));
10109 case Intrinsic::amdgcn_sffbh:
10110 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10111 case Intrinsic::amdgcn_sbfe:
10112 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10113 Op.getOperand(2), Op.getOperand(3));
10114 case Intrinsic::amdgcn_ubfe:
10115 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10116 Op.getOperand(2), Op.getOperand(3));
10117 case Intrinsic::amdgcn_cvt_pkrtz:
10118 case Intrinsic::amdgcn_cvt_pknorm_i16:
10119 case Intrinsic::amdgcn_cvt_pknorm_u16:
10120 case Intrinsic::amdgcn_cvt_pk_i16:
10121 case Intrinsic::amdgcn_cvt_pk_u16: {
10122 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10123 EVT VT = Op.getValueType();
10124 unsigned Opcode;
10125
10126 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10127 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10128 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10129 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10130 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10131 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10132 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10133 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10134 else
10135 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10136
10137 if (isTypeLegal(VT))
10138 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10139
10140 SDValue Node =
10141 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10142 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10143 }
10144 case Intrinsic::amdgcn_fmad_ftz:
10145 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10146 Op.getOperand(2), Op.getOperand(3));
10147
10148 case Intrinsic::amdgcn_if_break:
10149 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10150 Op->getOperand(1), Op->getOperand(2)),
10151 0);
10152
10153 case Intrinsic::amdgcn_groupstaticsize: {
10154 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10155 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10156 return Op;
10157
10158 const Module *M = MF.getFunction().getParent();
10159 const GlobalValue *GV =
10160 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10161 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10162 SIInstrInfo::MO_ABS32_LO);
10163 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10164 }
10165 case Intrinsic::amdgcn_is_shared:
10166 case Intrinsic::amdgcn_is_private: {
10167 SDLoc SL(Op);
10168 SDValue SrcVec =
10169 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10170 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10171 DAG.getConstant(1, SL, MVT::i32));
10172
10173 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10174 ? AMDGPUAS::LOCAL_ADDRESS
10175 : AMDGPUAS::PRIVATE_ADDRESS;
10176 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10177 Subtarget->hasGloballyAddressableScratch()) {
10178 SDValue FlatScratchBaseHi(
10179 DAG.getMachineNode(
10180 AMDGPU::S_MOV_B32, DL, MVT::i32,
10181 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10182 0);
10183 // Test bits 63..58 against the aperture address.
10184 return DAG.getSetCC(
10185 SL, MVT::i1,
10186 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10187 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10188 }
10189
10190 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10191 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10192 }
10193 case Intrinsic::amdgcn_perm:
10194 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10195 Op.getOperand(2), Op.getOperand(3));
10196 case Intrinsic::amdgcn_reloc_constant: {
10197 Module *M = MF.getFunction().getParent();
10198 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10199 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10200 auto *RelocSymbol = cast<GlobalVariable>(
10201 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10202 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10203 SIInstrInfo::MO_ABS32_LO);
10204 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10205 }
10206 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10207 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10208 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10209 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10210 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10211 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10212 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10213 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10214 if (Op.getOperand(4).getValueType() == MVT::i32)
10215 return SDValue();
10216
10217 SDLoc SL(Op);
10218 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10219 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10220 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10221 Op.getOperand(3), IndexKeyi32);
10222 }
10223 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10224 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10225 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10226 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10227 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10228 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10229 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10230 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10231 if (Op.getOperand(4).getValueType() == MVT::i64)
10232 return SDValue();
10233
10234 SDLoc SL(Op);
10235 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10236 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10237 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10238 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10239 Op.getOperand(6)});
10240 }
10241 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10242 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10243 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10244 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10245 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10246 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10247 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10248 ? MVT::i64
10249 : MVT::i32;
10250 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10251 return SDValue();
10252
10253 SDLoc SL(Op);
10254 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10255 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10256 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10257 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10258 IndexKey, Op.getOperand(7),
10259 Op.getOperand(8)}); // No clamp operand
10260 }
10261 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10262 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10263 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10264 if (Op.getOperand(6).getValueType() == MVT::i32)
10265 return SDValue();
10266
10267 SDLoc SL(Op);
10268 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10269 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10270 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10271 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10272 IndexKeyi32, Op.getOperand(7)});
10273 }
10274 case Intrinsic::amdgcn_addrspacecast_nonnull:
10275 return lowerADDRSPACECAST(Op, DAG);
10276 case Intrinsic::amdgcn_readlane:
10277 case Intrinsic::amdgcn_readfirstlane:
10278 case Intrinsic::amdgcn_writelane:
10279 case Intrinsic::amdgcn_permlane16:
10280 case Intrinsic::amdgcn_permlanex16:
10281 case Intrinsic::amdgcn_permlane64:
10282 case Intrinsic::amdgcn_set_inactive:
10283 case Intrinsic::amdgcn_set_inactive_chain_arg:
10284 case Intrinsic::amdgcn_mov_dpp8:
10285 case Intrinsic::amdgcn_update_dpp:
10286 return lowerLaneOp(*this, Op.getNode(), DAG);
10287 case Intrinsic::amdgcn_dead: {
10288 SmallVector<SDValue, 8> Poisons;
10289 for (const EVT ValTy : Op.getNode()->values())
10290 Poisons.push_back(DAG.getPOISON(ValTy));
10291 return DAG.getMergeValues(Poisons, SDLoc(Op));
10292 }
10293 default:
10294 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10295 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10296 return lowerImage(Op, ImageDimIntr, DAG, false);
10297
10298 return Op;
10299 }
10300}
10301
10302// On targets not supporting constant in soffset field, turn zero to
10303// SGPR_NULL to avoid generating an extra s_mov with zero.
10304 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10305 const GCNSubtarget *Subtarget) {
10306 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10307 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10308 return SOffset;
10309}
10310
10311SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10312 SelectionDAG &DAG,
10313 unsigned NewOpcode) const {
10314 SDLoc DL(Op);
10315
10316 SDValue VData = Op.getOperand(2);
10317 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10318 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10319 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10320 SDValue Ops[] = {
10321 Op.getOperand(0), // Chain
10322 VData, // vdata
10323 Rsrc, // rsrc
10324 DAG.getConstant(0, DL, MVT::i32), // vindex
10325 VOffset, // voffset
10326 SOffset, // soffset
10327 Offset, // offset
10328 Op.getOperand(6), // cachepolicy
10329 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10330 };
10331
10332 auto *M = cast<MemSDNode>(Op);
10333
10334 EVT MemVT = VData.getValueType();
10335 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10336 M->getMemOperand());
10337}
10338
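// The struct-buffer variant below additionally carries an explicit vindex operand and sets idxen; otherwise it mirrors the raw-buffer lowering above.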
10339SDValue
10340SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10341 unsigned NewOpcode) const {
10342 SDLoc DL(Op);
10343
10344 SDValue VData = Op.getOperand(2);
10345 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10346 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10347 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10348 SDValue Ops[] = {
10349 Op.getOperand(0), // Chain
10350 VData, // vdata
10351 Rsrc, // rsrc
10352 Op.getOperand(4), // vindex
10353 VOffset, // voffset
10354 SOffset, // soffset
10355 Offset, // offset
10356 Op.getOperand(7), // cachepolicy
10357 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10358 };
10359
10360 auto *M = cast<MemSDNode>(Op);
10361
10362 EVT MemVT = VData.getValueType();
10363 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10364 M->getMemOperand());
10365}
10366
10367SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10368 SelectionDAG &DAG) const {
10369 unsigned IntrID = Op.getConstantOperandVal(1);
10370 SDLoc DL(Op);
10371
10372 switch (IntrID) {
10373 case Intrinsic::amdgcn_ds_ordered_add:
10374 case Intrinsic::amdgcn_ds_ordered_swap: {
10375 MemSDNode *M = cast<MemSDNode>(Op);
10376 SDValue Chain = M->getOperand(0);
10377 SDValue M0 = M->getOperand(2);
10378 SDValue Value = M->getOperand(3);
10379 unsigned IndexOperand = M->getConstantOperandVal(7);
10380 unsigned WaveRelease = M->getConstantOperandVal(8);
10381 unsigned WaveDone = M->getConstantOperandVal(9);
10382
10383 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10384 IndexOperand &= ~0x3f;
10385 unsigned CountDw = 0;
10386
10387 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10388 CountDw = (IndexOperand >> 24) & 0xf;
10389 IndexOperand &= ~(0xf << 24);
10390
10391 if (CountDw < 1 || CountDw > 4) {
10392 const Function &Fn = DAG.getMachineFunction().getFunction();
10393 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10394 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10395 DL.getDebugLoc()));
10396 CountDw = 1;
10397 }
10398 }
10399
10400 if (IndexOperand) {
10401 const Function &Fn = DAG.getMachineFunction().getFunction();
10402 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10403 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10404 }
10405
10406 if (WaveDone && !WaveRelease) {
10407 // TODO: Move this to IR verifier
10408 const Function &Fn = DAG.getMachineFunction().getFunction();
10409 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10410 Fn, "ds_ordered_count: wave_done requires wave_release",
10411 DL.getDebugLoc()));
10412 }
10413
10414 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10415 unsigned ShaderType =
10416 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10417 unsigned Offset0 = OrderedCountIndex << 2;
10418 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10419
10420 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10421 Offset1 |= (CountDw - 1) << 6;
10422
10423 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10424 Offset1 |= ShaderType << 2;
10425
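// Packed immediate layout: bits [7:2] = ordered-count index, bit 8 = wave_release, bit 9 = wave_done, bits [11:10] = shader type (pre-GFX11), bit 12 = add/swap select, bits [15:14] = dword count - 1 (GFX10+).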
10426 unsigned Offset = Offset0 | (Offset1 << 8);
10427
10428 SDValue Ops[] = {
10429 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10430 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10431 };
10432 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10433 M->getVTList(), Ops, M->getMemoryVT(),
10434 M->getMemOperand());
10435 }
10436 case Intrinsic::amdgcn_raw_buffer_load:
10437 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10438 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10439 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10440 case Intrinsic::amdgcn_raw_buffer_load_format:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10442 const bool IsFormat =
10443 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10444 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10445
10446 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10447 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10448 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10449 SDValue Ops[] = {
10450 Op.getOperand(0), // Chain
10451 Rsrc, // rsrc
10452 DAG.getConstant(0, DL, MVT::i32), // vindex
10453 VOffset, // voffset
10454 SOffset, // soffset
10455 Offset, // offset
10456 Op.getOperand(5), // cachepolicy, swizzled buffer
10457 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10458 };
10459
10460 auto *M = cast<MemSDNode>(Op);
10461 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10462 }
10463 case Intrinsic::amdgcn_struct_buffer_load:
10464 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10465 case Intrinsic::amdgcn_struct_buffer_load_format:
10466 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10467 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10468 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10469 const bool IsFormat =
10470 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10471 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10472
10473 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10474 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10475 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10476 SDValue Ops[] = {
10477 Op.getOperand(0), // Chain
10478 Rsrc, // rsrc
10479 Op.getOperand(3), // vindex
10480 VOffset, // voffset
10481 SOffset, // soffset
10482 Offset, // offset
10483 Op.getOperand(6), // cachepolicy, swizzled buffer
10484 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10485 };
10486
10487 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10488 }
10489 case Intrinsic::amdgcn_raw_tbuffer_load:
10490 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10491 MemSDNode *M = cast<MemSDNode>(Op);
10492 EVT LoadVT = Op.getValueType();
10493 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10494 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10495 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10496
10497 SDValue Ops[] = {
10498 Op.getOperand(0), // Chain
10499 Rsrc, // rsrc
10500 DAG.getConstant(0, DL, MVT::i32), // vindex
10501 VOffset, // voffset
10502 SOffset, // soffset
10503 Offset, // offset
10504 Op.getOperand(5), // format
10505 Op.getOperand(6), // cachepolicy, swizzled buffer
10506 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10507 };
10508
10509 if (LoadVT.getScalarType() == MVT::f16)
10510 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10511 Ops);
10512 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10513 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10514 DAG);
10515 }
10516 case Intrinsic::amdgcn_struct_tbuffer_load:
10517 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10518 MemSDNode *M = cast<MemSDNode>(Op);
10519 EVT LoadVT = Op.getValueType();
10520 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10521 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10522 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10523
10524 SDValue Ops[] = {
10525 Op.getOperand(0), // Chain
10526 Rsrc, // rsrc
10527 Op.getOperand(3), // vindex
10528 VOffset, // voffset
10529 SOffset, // soffset
10530 Offset, // offset
10531 Op.getOperand(6), // format
10532 Op.getOperand(7), // cachepolicy, swizzled buffer
10533 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10534 };
10535
10536 if (LoadVT.getScalarType() == MVT::f16)
10537 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10538 Ops);
10539 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10540 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10541 DAG);
10542 }
10543 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10544 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10545 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10546 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10547 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10548 return lowerStructBufferAtomicIntrin(Op, DAG,
10549 AMDGPUISD::BUFFER_ATOMIC_FADD);
10550 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10551 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10552 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10553 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10554 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10555 return lowerStructBufferAtomicIntrin(Op, DAG,
10556 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10557 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10558 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10559 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10560 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10561 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10562 return lowerStructBufferAtomicIntrin(Op, DAG,
10563 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10564 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10565 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10566 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10567 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10568 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10569 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10570 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10571 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10572 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10573 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10574 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10575 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10576 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10577 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10578 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10579 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10580 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10581 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10582 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10583 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10584 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10585 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10586 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10587 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10588 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10589 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10590 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10591 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10593 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10594 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10595 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10596 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10597 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10598 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10599 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10600 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10601 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10602 return lowerStructBufferAtomicIntrin(Op, DAG,
10603 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10604 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10605 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10606 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10607 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10608 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10609 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10610 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10611 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10612 return lowerStructBufferAtomicIntrin(Op, DAG,
10613 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10614 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10615 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10616 return lowerStructBufferAtomicIntrin(Op, DAG,
10617 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10618 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10620 return lowerStructBufferAtomicIntrin(Op, DAG,
10621 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10622 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10623 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10624 return lowerStructBufferAtomicIntrin(Op, DAG,
10625 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10626 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10627 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10628 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10629 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10630 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10631 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10632 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10633 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10634 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10635 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10636 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10637 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10638 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10640 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10641 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10643 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10644 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10645 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10646 return lowerStructBufferAtomicIntrin(Op, DAG,
10647 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10648 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10650 return lowerRawBufferAtomicIntrin(Op, DAG,
10651 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10652 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10653 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10654 return lowerStructBufferAtomicIntrin(Op, DAG,
10655 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10656 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10657 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10658 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10659 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10660 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10661 SDValue Ops[] = {
10662 Op.getOperand(0), // Chain
10663 Op.getOperand(2), // src
10664 Op.getOperand(3), // cmp
10665 Rsrc, // rsrc
10666 DAG.getConstant(0, DL, MVT::i32), // vindex
10667 VOffset, // voffset
10668 SOffset, // soffset
10669 Offset, // offset
10670 Op.getOperand(7), // cachepolicy
10671 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10672 };
10673 EVT VT = Op.getValueType();
10674 auto *M = cast<MemSDNode>(Op);
10675
10676 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10677 Op->getVTList(), Ops, VT,
10678 M->getMemOperand());
10679 }
10680 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10681 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10682 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10683 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10684 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10685 SDValue Ops[] = {
10686 Op.getOperand(0), // Chain
10687 Op.getOperand(2), // src
10688 Op.getOperand(3), // cmp
10689 Rsrc, // rsrc
10690 Op.getOperand(5), // vindex
10691 VOffset, // voffset
10692 SOffset, // soffset
10693 Offset, // offset
10694 Op.getOperand(8), // cachepolicy
10695 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10696 };
10697 EVT VT = Op.getValueType();
10698 auto *M = cast<MemSDNode>(Op);
10699
10700 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10701 Op->getVTList(), Ops, VT,
10702 M->getMemOperand());
10703 }
10704 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10705 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10706 MemSDNode *M = cast<MemSDNode>(Op);
10707 SDValue NodePtr = M->getOperand(2);
10708 SDValue RayExtent = M->getOperand(3);
10709 SDValue InstanceMask = M->getOperand(4);
10710 SDValue RayOrigin = M->getOperand(5);
10711 SDValue RayDir = M->getOperand(6);
10712 SDValue Offsets = M->getOperand(7);
10713 SDValue TDescr = M->getOperand(8);
10714
10715 assert(NodePtr.getValueType() == MVT::i64);
10716 assert(RayDir.getValueType() == MVT::v3f32);
10717
10718 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10719 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10720 return SDValue();
10721 }
10722
10723 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10724 const unsigned NumVDataDwords = 10;
10725 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10726 int Opcode = AMDGPU::getMIMGOpcode(
10727 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10728 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10729 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10730 assert(Opcode != -1);
10731
10733 Ops.push_back(NodePtr);
10734 Ops.push_back(DAG.getBuildVector(
10735 MVT::v2i32, DL,
10736 {DAG.getBitcast(MVT::i32, RayExtent),
10737 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10738 Ops.push_back(RayOrigin);
10739 Ops.push_back(RayDir);
10740 Ops.push_back(Offsets);
10741 Ops.push_back(TDescr);
10742 Ops.push_back(M->getChain());
10743
10744 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10745 MachineMemOperand *MemRef = M->getMemOperand();
10746 DAG.setNodeMemRefs(NewNode, {MemRef});
10747 return SDValue(NewNode, 0);
10748 }
10749 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10750 MemSDNode *M = cast<MemSDNode>(Op);
10751 SDValue NodePtr = M->getOperand(2);
10752 SDValue RayExtent = M->getOperand(3);
10753 SDValue RayOrigin = M->getOperand(4);
10754 SDValue RayDir = M->getOperand(5);
10755 SDValue RayInvDir = M->getOperand(6);
10756 SDValue TDescr = M->getOperand(7);
10757
10758 assert(NodePtr.getValueType() == MVT::i32 ||
10759 NodePtr.getValueType() == MVT::i64);
10760 assert(RayDir.getValueType() == MVT::v3f16 ||
10761 RayDir.getValueType() == MVT::v3f32);
10762
10763 if (!Subtarget->hasGFX10_AEncoding()) {
10764 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10765 return SDValue();
10766 }
10767
10768 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10769 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10770 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10771 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10772 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10773 const unsigned NumVDataDwords = 4;
10774 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10775 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10776 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10777 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10778 IsGFX12Plus;
10779 const unsigned BaseOpcodes[2][2] = {
10780 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10781 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10782 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10783 int Opcode;
10784 if (UseNSA) {
10785 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10786 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10787 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10788 : AMDGPU::MIMGEncGfx10NSA,
10789 NumVDataDwords, NumVAddrDwords);
10790 } else {
10791 assert(!IsGFX12Plus);
10792 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10793 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10794 : AMDGPU::MIMGEncGfx10Default,
10795 NumVDataDwords, NumVAddrDwords);
10796 }
10797 assert(Opcode != -1);
10798
10799 SmallVector<SDValue, 16> Ops;
10800
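// Helper that pushes the three lanes of a vector operand as dwords: 32-bit lanes are pushed individually, while f16 lanes are packed two per dword, either within the operand (aligned) or straddling the previously pushed half (unaligned).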
10801 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10802 SmallVector<SDValue, 3> Lanes;
10803 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10804 if (Lanes[0].getValueSizeInBits() == 32) {
10805 for (unsigned I = 0; I < 3; ++I)
10806 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10807 } else {
10808 if (IsAligned) {
10809 Ops.push_back(DAG.getBitcast(
10810 MVT::i32,
10811 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10812 Ops.push_back(Lanes[2]);
10813 } else {
10814 SDValue Elt0 = Ops.pop_back_val();
10815 Ops.push_back(DAG.getBitcast(
10816 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10817 Ops.push_back(DAG.getBitcast(
10818 MVT::i32,
10819 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10820 }
10821 }
10822 };
10823
10824 if (UseNSA && IsGFX11Plus) {
10825 Ops.push_back(NodePtr);
10826 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10827 Ops.push_back(RayOrigin);
10828 if (IsA16) {
10829 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10830 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10831 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10832 for (unsigned I = 0; I < 3; ++I) {
10833 MergedLanes.push_back(DAG.getBitcast(
10834 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10835 {DirLanes[I], InvDirLanes[I]})));
10836 }
10837 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10838 } else {
10839 Ops.push_back(RayDir);
10840 Ops.push_back(RayInvDir);
10841 }
10842 } else {
10843 if (Is64)
10844 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10845 2);
10846 else
10847 Ops.push_back(NodePtr);
10848
10849 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10850 packLanes(RayOrigin, true);
10851 packLanes(RayDir, true);
10852 packLanes(RayInvDir, false);
10853 }
10854
10855 if (!UseNSA) {
10856 // Build a single vector containing all the operands so far prepared.
10857 if (NumVAddrDwords > 12) {
10858 SDValue Undef = DAG.getPOISON(MVT::i32);
10859 Ops.append(16 - Ops.size(), Undef);
10860 }
10861 assert(Ops.size() >= 8 && Ops.size() <= 12);
10862 SDValue MergedOps =
10863 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10864 Ops.clear();
10865 Ops.push_back(MergedOps);
10866 }
10867
10868 Ops.push_back(TDescr);
10869 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10870 Ops.push_back(M->getChain());
10871
10872 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10873 MachineMemOperand *MemRef = M->getMemOperand();
10874 DAG.setNodeMemRefs(NewNode, {MemRef});
10875 return SDValue(NewNode, 0);
10876 }
10877 case Intrinsic::amdgcn_global_atomic_fmin_num:
10878 case Intrinsic::amdgcn_global_atomic_fmax_num:
10879 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10880 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10881 MemSDNode *M = cast<MemSDNode>(Op);
10882 SDValue Ops[] = {
10883 M->getOperand(0), // Chain
10884 M->getOperand(2), // Ptr
10885 M->getOperand(3) // Value
10886 };
10887 unsigned Opcode = 0;
10888 switch (IntrID) {
10889 case Intrinsic::amdgcn_global_atomic_fmin_num:
10890 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10891 Opcode = ISD::ATOMIC_LOAD_FMIN;
10892 break;
10893 }
10894 case Intrinsic::amdgcn_global_atomic_fmax_num:
10895 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10896 Opcode = ISD::ATOMIC_LOAD_FMAX;
10897 break;
10898 }
10899 default:
10900 llvm_unreachable("unhandled atomic opcode");
10901 }
10902 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10903 Ops, M->getMemOperand());
10904 }
10905 case Intrinsic::amdgcn_s_get_barrier_state:
10906 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10907 SDValue Chain = Op->getOperand(0);
10908 SmallVector<SDValue, 2> Ops;
10909
10910
10911 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10912 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10913 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10914 BarID = (BarID >> 4) & 0x3F;
10915 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10916 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10917 Ops.push_back(K);
10918 Ops.push_back(Chain);
10919 } else {
10920 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10921 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10922 SDValue M0Val;
10923 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10924 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10925 M0Val = SDValue(
10926 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10927 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10928 0);
10929 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10930 } else
10931 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10932 }
10933
10934 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10935 return SDValue(NewMI, 0);
10936 }
10937 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10938 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10939 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10940 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10941 SDValue Chain = Op->getOperand(0);
10942 SDValue Ptr = Op->getOperand(2);
10943 EVT VT = Op->getValueType(0);
10944 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10945 Chain, Ptr, MII->getMemOperand());
10946 }
10947 default:
10948
10949 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10950 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10951 return lowerImage(Op, ImageDimIntr, DAG, true);
10952
10953 return SDValue();
10954 }
10955}
10956
10957// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10958// dwordx4 if on SI and handle TFE loads.
10959SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10960 SDVTList VTList,
10961 ArrayRef<SDValue> Ops, EVT MemVT,
10962 MachineMemOperand *MMO,
10963 SelectionDAG &DAG) const {
10964 LLVMContext &C = *DAG.getContext();
10965 MachineFunction &MF = DAG.getMachineFunction();
10966 EVT VT = VTList.VTs[0];
10967
10968 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10969 bool IsTFE = VTList.NumVTs == 3;
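// A TFE load returns an extra status dword, so perform the load as a vector of NumValueDWords + 1 i32s and then split the value and the status back apart.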
10970 if (IsTFE) {
10971 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10972 unsigned NumOpDWords = NumValueDWords + 1;
10973 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10974 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10975 MachineMemOperand *OpDWordsMMO =
10976 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10977 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10978 OpDWordsVT, OpDWordsMMO, DAG);
10979 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10980 DAG.getVectorIdxConstant(NumValueDWords, DL));
10981 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10982 SDValue ValueDWords =
10983 NumValueDWords == 1
10984 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10985 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10986 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10987 ZeroIdx);
10988 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10989 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10990 }
10991
10992 if (!Subtarget->hasDwordx3LoadStores() &&
10993 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10994 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10995 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10996 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10997 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10998 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10999 WidenedMemVT, WidenedMMO);
11000 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
11001 DAG.getVectorIdxConstant(0, DL));
11002 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11003 }
11004
11005 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11006}
11007
11008SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11009 bool ImageStore) const {
11010 EVT StoreVT = VData.getValueType();
11011
11012 // No change for f16 and legal vector D16 types.
11013 if (!StoreVT.isVector())
11014 return VData;
11015
11016 SDLoc DL(VData);
11017 unsigned NumElements = StoreVT.getVectorNumElements();
11018
11019 if (Subtarget->hasUnpackedD16VMem()) {
11020 // We need to unpack the packed data to store.
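// e.g. a v4f16 store is emitted as four dwords, one zero-extended 16-bit element per dword.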
11021 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11022 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11023
11024 EVT EquivStoreVT =
11025 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11026 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11027 return DAG.UnrollVectorOp(ZExt.getNode());
11028 }
11029
11030 // The sq block of gfx8.1 does not estimate register use correctly for d16
11031 // image store instructions. The data operand is computed as if it were not a
11032 // d16 image instruction.
11033 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11034 // Bitcast to i16
11035 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11036 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11037
11038 // Decompose into scalars
11039 SmallVector<SDValue, 4> Elts;
11040 DAG.ExtractVectorElements(IntVData, Elts);
11041
11042 // Group pairs of i16 into v2i16 and bitcast to i32
11043 SmallVector<SDValue, 4> PackedElts;
11044 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11045 SDValue Pair =
11046 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11047 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11048 PackedElts.push_back(IntPair);
11049 }
11050 if ((NumElements % 2) == 1) {
11051 // Handle v3i16
11052 unsigned I = Elts.size() / 2;
11053 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11054 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11055 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11056 PackedElts.push_back(IntPair);
11057 }
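// e.g. a v4f16 store has been packed into two dwords at this point; it is padded back to four below because the sq register counting treats the operand as if it were not d16.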
11058
11059 // Pad with poison values up to the original element count
11060 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11061
11062 // Build final vector
11063 EVT VecVT =
11064 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11065 return DAG.getBuildVector(VecVT, DL, PackedElts);
11066 }
11067
11068 if (NumElements == 3) {
11069 EVT IntStoreVT =
11070 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11071 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11072
11073 EVT WidenedStoreVT = EVT::getVectorVT(
11074 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11075 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11076 WidenedStoreVT.getStoreSizeInBits());
11077 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11078 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11079 }
11080
11081 assert(isTypeLegal(StoreVT));
11082 return VData;
11083}
11084
11085SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11086 SelectionDAG &DAG) const {
11087 SDLoc DL(Op);
11088 SDValue Chain = Op.getOperand(0);
11089 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11090 MachineFunction &MF = DAG.getMachineFunction();
11091
11092 switch (IntrinsicID) {
11093 case Intrinsic::amdgcn_exp_compr: {
11094 if (!Subtarget->hasCompressedExport()) {
11095 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11096 DAG.getMachineFunction().getFunction(),
11097 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11098 }
11099 SDValue Src0 = Op.getOperand(4);
11100 SDValue Src1 = Op.getOperand(5);
11101 // Hack around illegal type on SI by directly selecting it.
11102 if (isTypeLegal(Src0.getValueType()))
11103 return SDValue();
11104
11105 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11106 SDValue Undef = DAG.getPOISON(MVT::f32);
11107 const SDValue Ops[] = {
11108 Op.getOperand(2), // tgt
11109 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11110 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11111 Undef, // src2
11112 Undef, // src3
11113 Op.getOperand(7), // vm
11114 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11115 Op.getOperand(3), // en
11116 Op.getOperand(0) // Chain
11117 };
11118
11119 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11120 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11121 }
11122
11123 case Intrinsic::amdgcn_struct_tbuffer_store:
11124 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11125 SDValue VData = Op.getOperand(2);
11126 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11127 if (IsD16)
11128 VData = handleD16VData(VData, DAG);
11129 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11130 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11131 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11132 SDValue Ops[] = {
11133 Chain,
11134 VData, // vdata
11135 Rsrc, // rsrc
11136 Op.getOperand(4), // vindex
11137 VOffset, // voffset
11138 SOffset, // soffset
11139 Offset, // offset
11140 Op.getOperand(7), // format
11141 Op.getOperand(8), // cachepolicy, swizzled buffer
11142 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11143 };
11144 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11145 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11146 MemSDNode *M = cast<MemSDNode>(Op);
11147 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11148 M->getMemoryVT(), M->getMemOperand());
11149 }
11150
11151 case Intrinsic::amdgcn_raw_tbuffer_store:
11152 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11153 SDValue VData = Op.getOperand(2);
11154 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11155 if (IsD16)
11156 VData = handleD16VData(VData, DAG);
11157 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11158 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11159 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11160 SDValue Ops[] = {
11161 Chain,
11162 VData, // vdata
11163 Rsrc, // rsrc
11164 DAG.getConstant(0, DL, MVT::i32), // vindex
11165 VOffset, // voffset
11166 SOffset, // soffset
11167 Offset, // offset
11168 Op.getOperand(6), // format
11169 Op.getOperand(7), // cachepolicy, swizzled buffer
11170 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11171 };
11172 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11173 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11174 MemSDNode *M = cast<MemSDNode>(Op);
11175 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11176 M->getMemoryVT(), M->getMemOperand());
11177 }
11178
11179 case Intrinsic::amdgcn_raw_buffer_store:
11180 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11181 case Intrinsic::amdgcn_raw_buffer_store_format:
11182 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11183 const bool IsFormat =
11184 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11185 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11186
11187 SDValue VData = Op.getOperand(2);
11188 EVT VDataVT = VData.getValueType();
11189 EVT EltType = VDataVT.getScalarType();
11190 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11191 if (IsD16) {
11192 VData = handleD16VData(VData, DAG);
11193 VDataVT = VData.getValueType();
11194 }
11195
11196 if (!isTypeLegal(VDataVT)) {
11197 VData =
11198 DAG.getNode(ISD::BITCAST, DL,
11199 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11200 }
11201
11202 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11204 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11205 SDValue Ops[] = {
11206 Chain,
11207 VData,
11208 Rsrc,
11209 DAG.getConstant(0, DL, MVT::i32), // vindex
11210 VOffset, // voffset
11211 SOffset, // soffset
11212 Offset, // offset
11213 Op.getOperand(6), // cachepolicy, swizzled buffer
11214 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11215 };
11216 unsigned Opc =
11217 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11218 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11219 MemSDNode *M = cast<MemSDNode>(Op);
11220
11221 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11222 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11223 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11224
11225 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11226 M->getMemoryVT(), M->getMemOperand());
11227 }
11228
11229 case Intrinsic::amdgcn_struct_buffer_store:
11230 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11231 case Intrinsic::amdgcn_struct_buffer_store_format:
11232 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11233 const bool IsFormat =
11234 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11235 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11236
11237 SDValue VData = Op.getOperand(2);
11238 EVT VDataVT = VData.getValueType();
11239 EVT EltType = VDataVT.getScalarType();
11240 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11241
11242 if (IsD16) {
11243 VData = handleD16VData(VData, DAG);
11244 VDataVT = VData.getValueType();
11245 }
11246
11247 if (!isTypeLegal(VDataVT)) {
11248 VData =
11249 DAG.getNode(ISD::BITCAST, DL,
11250 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11251 }
11252
11253 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11254 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11255 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11256 SDValue Ops[] = {
11257 Chain,
11258 VData,
11259 Rsrc,
11260 Op.getOperand(4), // vindex
11261 VOffset, // voffset
11262 SOffset, // soffset
11263 Offset, // offset
11264 Op.getOperand(7), // cachepolicy, swizzled buffer
11265 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11266 };
11267 unsigned Opc =
11268 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11269 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11270 MemSDNode *M = cast<MemSDNode>(Op);
11271
11272 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11273 EVT VDataType = VData.getValueType().getScalarType();
11274 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11275 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11276
11277 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11278 M->getMemoryVT(), M->getMemOperand());
11279 }
11280 case Intrinsic::amdgcn_raw_buffer_load_lds:
11281 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11282 case Intrinsic::amdgcn_struct_buffer_load_lds:
11283 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11284 if (!Subtarget->hasVMemToLDSLoad())
11285 return SDValue();
11286 unsigned Opc;
11287 bool HasVIndex =
11288 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11289 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11290 unsigned OpOffset = HasVIndex ? 1 : 0;
11291 SDValue VOffset = Op.getOperand(5 + OpOffset);
11292 bool HasVOffset = !isNullConstant(VOffset);
11293 unsigned Size = Op->getConstantOperandVal(4);
11294
11295 switch (Size) {
11296 default:
11297 return SDValue();
11298 case 1:
11299 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11300 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11301 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11302 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11303 break;
11304 case 2:
11305 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11306 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11307 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11308 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11309 break;
11310 case 4:
11311 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11312 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11313 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11314 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11315 break;
11316 case 12:
11317 if (!Subtarget->hasLDSLoadB96_B128())
11318 return SDValue();
11319 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11320 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11321 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11322 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11323 break;
11324 case 16:
11325 if (!Subtarget->hasLDSLoadB96_B128())
11326 return SDValue();
11327 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11328 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11329 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11330 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11331 break;
11332 }
11333
11334 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11335
11336 SmallVector<SDValue, 8> Ops;
11337
11338 if (HasVIndex && HasVOffset)
11339 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11340 {Op.getOperand(5), // VIndex
11341 VOffset}));
11342 else if (HasVIndex)
11343 Ops.push_back(Op.getOperand(5));
11344 else if (HasVOffset)
11345 Ops.push_back(VOffset);
11346
11347 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11348 Ops.push_back(Rsrc);
11349 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11350 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11351 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11352 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11353 Ops.push_back(DAG.getTargetConstant(
11354 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11355 DL, MVT::i8)); // cpol
11356 Ops.push_back(DAG.getTargetConstant(
11357 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11358 ? 1
11359 : 0,
11360 DL, MVT::i8)); // swz
11361 Ops.push_back(M0Val.getValue(0)); // Chain
11362 Ops.push_back(M0Val.getValue(1)); // Glue
11363
11364 auto *M = cast<MemSDNode>(Op);
11365 MachineMemOperand *LoadMMO = M->getMemOperand();
11366 // Don't set the offset value here because the pointer points to the base of
11367 // the buffer.
11368 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11369
11370 MachinePointerInfo StorePtrI = LoadPtrI;
11371 LoadPtrI.V = PoisonValue::get(
11375
11376 auto F = LoadMMO->getFlags() &
11377 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11378 LoadMMO =
11379 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11380 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11381
11382 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11383 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11384 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11385
11386 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11387 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11388
11389 return SDValue(Load, 0);
11390 }
11391 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11392 // for "trust me" that the remaining cases are global pointers until
11393 // such time as we can put two mem operands on an intrinsic.
11394 case Intrinsic::amdgcn_load_to_lds:
11395 case Intrinsic::amdgcn_global_load_lds: {
11396 if (!Subtarget->hasVMemToLDSLoad())
11397 return SDValue();
11398
11399 unsigned Opc;
11400 unsigned Size = Op->getConstantOperandVal(4);
11401 switch (Size) {
11402 default:
11403 return SDValue();
11404 case 1:
11405 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11406 break;
11407 case 2:
11408 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11409 break;
11410 case 4:
11411 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11412 break;
11413 case 12:
11414 if (!Subtarget->hasLDSLoadB96_B128())
11415 return SDValue();
11416 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11417 break;
11418 case 16:
11419 if (!Subtarget->hasLDSLoadB96_B128())
11420 return SDValue();
11421 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11422 break;
11423 }
11424
11425 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11426
11428 SmallVector<SDValue, 6> Ops;
11429 SDValue Addr = Op.getOperand(2); // Global ptr
11430 SDValue VOffset;
11431 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11432 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11433 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11434 SDValue LHS = Addr.getOperand(0);
11435 SDValue RHS = Addr.getOperand(1);
11436
11437 if (LHS->isDivergent())
11438 std::swap(LHS, RHS);
11439
11440 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11441 RHS.getOperand(0).getValueType() == MVT::i32) {
11442 // add (i64 sgpr), (zero_extend (i32 vgpr))
11443 Addr = LHS;
11444 VOffset = RHS.getOperand(0);
11445 }
11446 }
11447
11448 Ops.push_back(Addr);
11449 if (!Addr->isDivergent()) {
11450 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11451 if (!VOffset)
11452 VOffset =
11453 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11454 DAG.getTargetConstant(0, DL, MVT::i32)),
11455 0);
11456 Ops.push_back(VOffset);
11457 }
11458
11459 Ops.push_back(Op.getOperand(5)); // Offset
11460
11461 unsigned Aux = Op.getConstantOperandVal(6);
11462 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11463 MVT::i32)); // CPol
11464
11465 Ops.push_back(M0Val.getValue(0)); // Chain
11466 Ops.push_back(M0Val.getValue(1)); // Glue
11467
11468 auto *M = cast<MemSDNode>(Op);
11469 MachineMemOperand *LoadMMO = M->getMemOperand();
11470 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11471 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11472 MachinePointerInfo StorePtrI = LoadPtrI;
11473 LoadPtrI.V = PoisonValue::get(
11474 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11475 StorePtrI.V = nullptr;
11476 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11477 auto F = LoadMMO->getFlags() &
11478 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11479 LoadMMO =
11480 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11481 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11482 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11483 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11484 LoadMMO->getAAInfo());
11485
11486 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11487 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11488
11489 return SDValue(Load, 0);
11490 }
11491 case Intrinsic::amdgcn_end_cf:
11492 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11493 Op->getOperand(2), Chain),
11494 0);
11495 case Intrinsic::amdgcn_s_barrier_init:
11496 case Intrinsic::amdgcn_s_barrier_signal_var: {
11497 // these two intrinsics have two operands: barrier pointer and member count
11498 SDValue Chain = Op->getOperand(0);
11499 SmallVector<SDValue, 2> Ops;
11500 SDValue BarOp = Op->getOperand(2);
11501 SDValue CntOp = Op->getOperand(3);
11502 SDValue M0Val;
11503 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11504 ? AMDGPU::S_BARRIER_INIT_M0
11505 : AMDGPU::S_BARRIER_SIGNAL_M0;
11506 // extract the BarrierID from bits 4-9 of BarOp
11507 SDValue BarID;
11508 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11509 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11510 BarID =
11511 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11512 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11513 0);
11514 // Member count should be put into M0[ShAmt:+6]
11515 // Barrier ID should be put into M0[5:0]
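// Resulting M0 layout (added note, derived from the masks and ShAmt below):
//   M0[5:0]   = barrier ID (BarOp bits 9:4)
//   M0[21:16] = member count (from CntOp)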
11516 M0Val =
11517 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11518 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11519 0);
11520 constexpr unsigned ShAmt = 16;
11521 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11522 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11523
11524 M0Val = SDValue(
11525 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11526
11527 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11528
11529 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11530 return SDValue(NewMI, 0);
11531 }
11532 case Intrinsic::amdgcn_s_wakeup_barrier: {
11533 if (!Subtarget->hasSWakeupBarrier())
11534 return SDValue();
11535 [[fallthrough]];
11536 }
11537 case Intrinsic::amdgcn_s_barrier_join: {
11538 // these two intrinsics have a single operand: the barrier pointer
11539 SDValue Chain = Op->getOperand(0);
11540 SmallVector<SDValue, 2> Ops;
11541 SDValue BarOp = Op->getOperand(2);
11542 unsigned Opc;
11543
11544 if (isa<ConstantSDNode>(BarOp)) {
11545 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11546 switch (IntrinsicID) {
11547 default:
11548 return SDValue();
11549 case Intrinsic::amdgcn_s_barrier_join:
11550 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11551 break;
11552 case Intrinsic::amdgcn_s_wakeup_barrier:
11553 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11554 break;
11555 }
11556 // extract the BarrierID from bits 4-9 of the immediate
11557 unsigned BarID = (BarVal >> 4) & 0x3F;
11558 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11559 Ops.push_back(K);
11560 Ops.push_back(Chain);
11561 } else {
11562 switch (IntrinsicID) {
11563 default:
11564 return SDValue();
11565 case Intrinsic::amdgcn_s_barrier_join:
11566 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11567 break;
11568 case Intrinsic::amdgcn_s_wakeup_barrier:
11569 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11570 break;
11571 }
11572 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11573 SDValue M0Val;
11574 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11575 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11576 M0Val =
11577 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11578 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11579 0);
11580 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11581 }
11582
11583 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11584 return SDValue(NewMI, 0);
11585 }
11586 case Intrinsic::amdgcn_s_prefetch_data: {
11587 // For non-global address space preserve the chain and remove the call.
11588 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11589 return Op.getOperand(0);
11590 return Op;
11591 }
11592 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11593 SDValue Ops[] = {
11594 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11595 Op.getOperand(3), // offset
11596 Op.getOperand(4), // length
11597 };
11598
11599 MemSDNode *M = cast<MemSDNode>(Op);
11600 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11601 Op->getVTList(), Ops, M->getMemoryVT(),
11602 M->getMemOperand());
11603 }
11604 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11605 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11606 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11607 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11608 SDValue Chain = Op->getOperand(0);
11609 SDValue Ptr = Op->getOperand(2);
11610 SDValue Val = Op->getOperand(3);
11611 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11612 Ptr, MII->getMemOperand());
11613 }
11614 default: {
11615 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11616 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11617 return lowerImage(Op, ImageDimIntr, DAG, true);
11618
11619 return Op;
11620 }
11621 }
11622}
11623
11624// Return whether the operation has NoUnsignedWrap property.
11625static bool isNoUnsignedWrap(SDValue Addr) {
11626 return (Addr.getOpcode() == ISD::ADD &&
11627 Addr->getFlags().hasNoUnsignedWrap()) ||
11628 Addr->getOpcode() == ISD::OR;
11629}
11630
11632 EVT PtrVT) const {
11633 return PtrVT == MVT::i64;
11634}
11635
11637 EVT PtrVT) const {
11638 return true;
11639}
11640
11641// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11642// offset (the offset that is included in bounds checking and swizzling, to be
11643// split between the instruction's voffset and immoffset fields) and soffset
11644// (the offset that is excluded from bounds checking and swizzling, to go in
11645// the instruction's soffset field). This function takes the first kind of
11646// offset and figures out how to split it between voffset and immoffset.
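// Illustrative example (not from the original source): with the common MUBUF
// limit MaxImm == 4095, a combined offset of 4100 is split into ImmOffset = 4
// and a voffset add of 4096 (Overflow = 4100 & ~4095); the large power-of-2
// add stands a better chance of being CSEd across similar accesses.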
11647std::pair<SDValue, SDValue>
11648SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11649 SDLoc DL(Offset);
11650 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11651 SDValue N0 = Offset;
11652 ConstantSDNode *C1 = nullptr;
11653
11654 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11655 N0 = SDValue();
11656 else if (DAG.isBaseWithConstantOffset(N0)) {
11657 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11658 // being added, so we can only safely match a 32-bit addition with no
11659 // unsigned overflow.
11660 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11661 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11662 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11663 N0 = N0.getOperand(0);
11664 }
11665 }
11666
11667 if (C1) {
11668 unsigned ImmOffset = C1->getZExtValue();
11669 // If the immediate value is too big for the immoffset field, put only bits
11670 // that would normally fit in the immoffset field. The remaining value that
11671 // is copied/added for the voffset field is a large power of 2, and it
11672 // stands more chance of being CSEd with the copy/add for another similar
11673 // load/store.
11674 // However, do not do that rounding down if that is a negative
11675 // number, as it appears to be illegal to have a negative offset in the
11676 // vgpr, even if adding the immediate offset makes it positive.
11677 unsigned Overflow = ImmOffset & ~MaxImm;
11678 ImmOffset -= Overflow;
11679 if ((int32_t)Overflow < 0) {
11680 Overflow += ImmOffset;
11681 ImmOffset = 0;
11682 }
11683 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11684 if (Overflow) {
11685 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11686 if (!N0)
11687 N0 = OverflowVal;
11688 else {
11689 SDValue Ops[] = {N0, OverflowVal};
11690 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11691 }
11692 }
11693 }
11694 if (!N0)
11695 N0 = DAG.getConstant(0, DL, MVT::i32);
11696 if (!C1)
11697 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11698 return {N0, SDValue(C1, 0)};
11699}
11700
11701// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11702// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11703// pointed to by Offsets.
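// For example (illustrative): a uniform constant combined offset that
// splitMUBUFOffset() accepts becomes {voffset = 0, soffset = SOffset,
// instoffset = ImmOffset}; anything that cannot be split safely is passed
// entirely in voffset with a zero soffset, as the fallback below does.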
11704void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11705 SelectionDAG &DAG, SDValue *Offsets,
11706 Align Alignment) const {
11707 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11708 SDLoc DL(CombinedOffset);
11709 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11710 uint32_t Imm = C->getZExtValue();
11711 uint32_t SOffset, ImmOffset;
11712 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11713 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11714 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11715 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11716 return;
11717 }
11718 }
11719 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11720 SDValue N0 = CombinedOffset.getOperand(0);
11721 SDValue N1 = CombinedOffset.getOperand(1);
11722 uint32_t SOffset, ImmOffset;
11723 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11724 if (Offset >= 0 &&
11725 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11726 Offsets[0] = N0;
11727 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11728 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11729 return;
11730 }
11731 }
11732
11733 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11734 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11735 : DAG.getConstant(0, DL, MVT::i32);
11736
11737 Offsets[0] = CombinedOffset;
11738 Offsets[1] = SOffsetZero;
11739 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11740}
11741
11742SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11743 SelectionDAG &DAG) const {
11744 if (!MaybePointer.getValueType().isScalarInteger())
11745 return MaybePointer;
11746
11747 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11748 return Rsrc;
11749}
11750
11751// Wrap a global or flat pointer into a buffer intrinsic using the flags
11752// specified in the intrinsic.
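// Sketch of the descriptor produced by the non-45-bit-num_records path below
// (derived from the code, not a hardware spec quote):
//   words 0..1: 48-bit base address, with the 16-bit stride in word1[31:16]
//   word 2:     num_records
//   word 3:     flags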
11753SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11754 SelectionDAG &DAG) const {
11755 SDLoc Loc(Op);
11756
11757 SDValue Pointer = Op->getOperand(1);
11758 SDValue Stride = Op->getOperand(2);
11759 SDValue NumRecords = Op->getOperand(3);
11760 SDValue Flags = Op->getOperand(4);
11761
11762 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11763 SDValue Rsrc;
11764
11765 if (Subtarget->has45BitNumRecordsBufferResource()) {
11766 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11767 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
11768 // num_records.
11769 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11770 SDValue NumRecordsLHS =
11771 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11772 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11773 SDValue LowHalf =
11774 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11775
11776 // Build the higher 64-bit value, which has the higher 38-bit num_records,
11777 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
11778 SDValue NumRecordsRHS =
11779 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11780 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11781 SDValue ShiftedStride =
11782 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11783 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11784 SDValue ExtShiftedStrideVec =
11785 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11786 SDValue ExtShiftedStride =
11787 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11788 SDValue ShiftedFlags =
11789 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11790 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11791 SDValue ExtShiftedFlagsVec =
11792 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11793 SDValue ExtShiftedFlags =
11794 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11795 SDValue CombinedFields =
11796 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11797 SDValue HighHalf =
11798 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11799
11800 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11801 } else {
11802 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11803 auto [LowHalf, HighHalf] =
11804 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11805 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11806 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11807 SDValue ShiftedStride =
11808 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11809 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11810 SDValue NewHighHalf =
11811 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11812
11813 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11814 NumRecords, Flags);
11815 }
11816
11817 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11818 return RsrcPtr;
11819}
11820
11821// Handle 8 bit and 16 bit buffer loads
11822SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11823 EVT LoadVT, SDLoc DL,
11824 ArrayRef<SDValue> Ops,
11825 MachineMemOperand *MMO,
11826 bool IsTFE) const {
11827 EVT IntVT = LoadVT.changeTypeToInteger();
11828
11829 if (IsTFE) {
11830 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11831 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11832 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11833 MachineFunction &MF = DAG.getMachineFunction();
11834 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11835 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11836 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11837 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11838 DAG.getConstant(1, DL, MVT::i32));
11839 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11840 DAG.getConstant(0, DL, MVT::i32));
11841 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11842 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11843 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11844 }
11845
11846 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11847 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11848 : AMDGPUISD::BUFFER_LOAD_USHORT;
11849
11850 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11851 SDValue BufferLoad =
11852 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11853 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11854 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11855
11856 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11857}
11858
11859// Handle 8 bit and 16 bit buffer stores
11860SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11861 EVT VDataType, SDLoc DL,
11862 SDValue Ops[],
11863 MemSDNode *M) const {
11864 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11865 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11866
11867 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11868 Ops[1] = BufferStoreExt;
11869 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11870 : AMDGPUISD::BUFFER_STORE_SHORT;
11871 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11872 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11873 M->getMemOperand());
11874}
11875
11876 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11877 SDValue Op, const SDLoc &SL, EVT VT) {
11878 if (VT.bitsLT(Op.getValueType()))
11879 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11880
11881 switch (ExtType) {
11882 case ISD::SEXTLOAD:
11883 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11884 case ISD::ZEXTLOAD:
11885 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11886 case ISD::EXTLOAD:
11887 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11888 case ISD::NON_EXTLOAD:
11889 return Op;
11890 }
11891
11892 llvm_unreachable("invalid ext type");
11893}
11894
11895// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11896// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
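// Illustrative effect (not from the original source): a 4-byte aligned,
// uniform, invariant sub-dword load (e.g. an i8 extload) is widened to a
// 32-bit load plus the matching extend-in-reg/truncate, so it becomes
// eligible for SMEM selection instead of a VMEM byte load.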
11897SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11898 DAGCombinerInfo &DCI) const {
11899 SelectionDAG &DAG = DCI.DAG;
11900 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11901 return SDValue();
11902
11903 // FIXME: Constant loads should all be marked invariant.
11904 unsigned AS = Ld->getAddressSpace();
11905 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11906 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11907 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11908 return SDValue();
11909
11910 // Don't do this early, since it may interfere with adjacent load merging for
11911 // illegal types. We can avoid losing alignment information for exotic types
11912 // pre-legalize.
11913 EVT MemVT = Ld->getMemoryVT();
11914 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11915 MemVT.getSizeInBits() >= 32)
11916 return SDValue();
11917
11918 SDLoc SL(Ld);
11919
11920 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11921 "unexpected vector extload");
11922
11923 // TODO: Drop only high part of range.
11924 SDValue Ptr = Ld->getBasePtr();
11925 SDValue NewLoad = DAG.getLoad(
11926 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11927 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11928 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11929 nullptr); // Drop ranges
11930
11931 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11932 if (MemVT.isFloatingPoint()) {
11934 "unexpected fp extload");
11935 TruncVT = MemVT.changeTypeToInteger();
11936 }
11937
11938 SDValue Cvt = NewLoad;
11939 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11940 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11941 DAG.getValueType(TruncVT));
11942 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11943 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11944 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11945 } else {
11946 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11947 }
11948
11949 EVT VT = Ld->getValueType(0);
11950 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11951
11952 DCI.AddToWorklist(Cvt.getNode());
11953
11954 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11955 // the appropriate extension from the 32-bit load.
11956 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11957 DCI.AddToWorklist(Cvt.getNode());
11958
11959 // Handle conversion back to floating point if necessary.
11960 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11961
11962 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11963}
11964
11965 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11966 const SIMachineFunctionInfo &Info) {
11967 // TODO: Should check if the address can definitely not access stack.
11968 if (Info.isEntryFunction())
11969 return Info.getUserSGPRInfo().hasFlatScratchInit();
11970 return true;
11971}
11972
11973SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11974 SDLoc DL(Op);
11975 LoadSDNode *Load = cast<LoadSDNode>(Op);
11976 ISD::LoadExtType ExtType = Load->getExtensionType();
11977 EVT MemVT = Load->getMemoryVT();
11978 MachineMemOperand *MMO = Load->getMemOperand();
11979
11980 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11981 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11982 return SDValue();
11983
11984 // FIXME: Copied from PPC
11985 // First, load into 32 bits, then truncate to 1 bit.
11986
11987 SDValue Chain = Load->getChain();
11988 SDValue BasePtr = Load->getBasePtr();
11989
11990 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11991
11992 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11993 RealMemVT, MMO);
11994
11995 if (!MemVT.isVector()) {
11996 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11997 NewLD.getValue(1)};
11998
11999 return DAG.getMergeValues(Ops, DL);
12000 }
12001
12002 SmallVector<SDValue, 3> Elts;
12003 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12004 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12005 DAG.getConstant(I, DL, MVT::i32));
12006
12007 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12008 }
12009
12010 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12011
12012 return DAG.getMergeValues(Ops, DL);
12013 }
12014
12015 if (!MemVT.isVector())
12016 return SDValue();
12017
12018 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12019 "Custom lowering for non-i32 vectors hasn't been implemented.");
12020
12021 Align Alignment = Load->getAlign();
12022 unsigned AS = Load->getAddressSpace();
12023 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12024 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12025 return SplitVectorLoad(Op, DAG);
12026 }
12027
12028 MachineFunction &MF = DAG.getMachineFunction();
12029 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12030 // If there is a possibility that flat instructions access scratch memory
12031 // then we need to use the same legalization rules we use for private.
12032 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12033 !Subtarget->hasMultiDwordFlatScratchAddressing())
12034 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12035 ? AMDGPUAS::PRIVATE_ADDRESS
12036 : AMDGPUAS::GLOBAL_ADDRESS;
12037
12038 unsigned NumElements = MemVT.getVectorNumElements();
12039
12040 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12041 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12042 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12043 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12044 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12045 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12046 Alignment >= Align(4) && NumElements < 32) {
12047 if (MemVT.isPow2VectorType() ||
12048 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12049 return SDValue();
12050 return WidenOrSplitVectorLoad(Op, DAG);
12051 }
12052 // Non-uniform loads will be selected to MUBUF instructions, so they
12053 // have the same legalization requirements as global and private
12054 // loads.
12055 //
12056 }
12057 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12058 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12059 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12060 if (NumElements > 4)
12061 return SplitVectorLoad(Op, DAG);
12062 // v3 loads not supported on SI.
12063 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12064 return WidenOrSplitVectorLoad(Op, DAG);
12065
12066 // v3 and v4 loads are supported for private and global memory.
12067 return SDValue();
12068 }
12069 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12070 // Depending on the setting of the private_element_size field in the
12071 // resource descriptor, we can only make private accesses up to a certain
12072 // size.
12073 switch (Subtarget->getMaxPrivateElementSize()) {
12074 case 4: {
12075 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12076 return DAG.getMergeValues({Op0, Op1}, DL);
12077 }
12078 case 8:
12079 if (NumElements > 2)
12080 return SplitVectorLoad(Op, DAG);
12081 return SDValue();
12082 case 16:
12083 // Same as global/flat
12084 if (NumElements > 4)
12085 return SplitVectorLoad(Op, DAG);
12086 // v3 loads not supported on SI.
12087 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12088 return WidenOrSplitVectorLoad(Op, DAG);
12089
12090 return SDValue();
12091 default:
12092 llvm_unreachable("unsupported private_element_size");
12093 }
12094 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12095 unsigned Fast = 0;
12096 auto Flags = Load->getMemOperand()->getFlags();
12097 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
12098 Load->getAlign(), Flags, &Fast) &&
12099 Fast > 1)
12100 return SDValue();
12101
12102 if (MemVT.isVector())
12103 return SplitVectorLoad(Op, DAG);
12104 }
12105
12106 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12107 MemVT, *Load->getMemOperand())) {
12108 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12109 return DAG.getMergeValues({Op0, Op1}, DL);
12110 }
12111
12112 return SDValue();
12113}
12114
12115SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12116 EVT VT = Op.getValueType();
12117 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12118 VT.getSizeInBits() == 512)
12119 return splitTernaryVectorOp(Op, DAG);
12120
12121 assert(VT.getSizeInBits() == 64);
12122
12123 SDLoc DL(Op);
12124 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12125
12126 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12127 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12128
12129 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12130 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12131
12132 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12133 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12134
12135 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12136
12137 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12138 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12139
12140 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12141
12142 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12143 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12144}
12145
12146// Catch division cases where we can use shortcuts with rcp and rsq
12147// instructions.
12148SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12149 SelectionDAG &DAG) const {
12150 SDLoc SL(Op);
12151 SDValue LHS = Op.getOperand(0);
12152 SDValue RHS = Op.getOperand(1);
12153 EVT VT = Op.getValueType();
12154 const SDNodeFlags Flags = Op->getFlags();
12155
12156 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12157
12158 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12159 // Without !fpmath accuracy information, we can't do more because we don't
12160 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12161 // f16 is always accurate enough
12162 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12163 return SDValue();
12164
12165 if (CLHS->isExactlyValue(1.0)) {
12166 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12167 // the CI documentation they have a worst-case error of 1 ulp.
12168 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12169 // use it as long as we aren't trying to use denormals.
12170 //
12171 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12172
12173 // 1.0 / sqrt(x) -> rsq(x)
12174
12175 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12176 // error seems really high at 2^29 ULP.
12177 // 1.0 / x -> rcp(x)
12178 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12179 }
12180
12181 // Same as for 1.0, but expand the sign out of the constant.
12182 if (CLHS->isExactlyValue(-1.0)) {
12183 // -1.0 / x -> rcp (fneg x)
12184 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12185 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12186 }
12187 }
12188
12189 // For f16 and bf16 require afn or arcp.
12190 // For f32 require afn.
12191 if (!AllowInaccurateRcp &&
12192 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12193 return SDValue();
12194
12195 // Turn into multiply by the reciprocal.
12196 // x / y -> x * (1.0 / y)
12197 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12198 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12199}
12200
12201SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12202 SelectionDAG &DAG) const {
12203 SDLoc SL(Op);
12204 SDValue X = Op.getOperand(0);
12205 SDValue Y = Op.getOperand(1);
12206 EVT VT = Op.getValueType();
12207 const SDNodeFlags Flags = Op->getFlags();
12208
12209 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12210 if (!AllowInaccurateDiv)
12211 return SDValue();
12212
12213 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12214 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12215
12216 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12217 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12218
12219 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12220 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12221 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12222 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12223 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12224 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12225}
12226
12227static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12228 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12229 SDNodeFlags Flags) {
12230 if (GlueChain->getNumValues() <= 1) {
12231 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12232 }
12233
12234 assert(GlueChain->getNumValues() == 3);
12235
12236 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12237 switch (Opcode) {
12238 default:
12239 llvm_unreachable("no chain equivalent for opcode");
12240 case ISD::FMUL:
12241 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12242 break;
12243 }
12244
12245 return DAG.getNode(Opcode, SL, VTList,
12246 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12247 Flags);
12248}
12249
12250static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12251 EVT VT, SDValue A, SDValue B, SDValue C,
12252 SDValue GlueChain, SDNodeFlags Flags) {
12253 if (GlueChain->getNumValues() <= 1) {
12254 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12255 }
12256
12257 assert(GlueChain->getNumValues() == 3);
12258
12259 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12260 switch (Opcode) {
12261 default:
12262 llvm_unreachable("no chain equivalent for opcode");
12263 case ISD::FMA:
12264 Opcode = AMDGPUISD::FMA_W_CHAIN;
12265 break;
12266 }
12267
12268 return DAG.getNode(Opcode, SL, VTList,
12269 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12270 Flags);
12271}
12272
12273SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12274 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12275 return FastLowered;
12276
12277 SDLoc SL(Op);
12278 EVT VT = Op.getValueType();
12279 SDValue LHS = Op.getOperand(0);
12280 SDValue RHS = Op.getOperand(1);
12281
12282 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12283 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12284
12285 if (VT == MVT::bf16) {
12286 SDValue ExtDiv =
12287 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12288 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12289 DAG.getTargetConstant(0, SL, MVT::i32));
12290 }
12291
12292 assert(VT == MVT::f16);
12293
12294 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12295 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12296 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12297 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12298 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12299 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12300 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12301 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12302 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12303 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12304 // q16.u = opx(V_CVT_F16_F32, q32.u);
12305 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
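// Added note: the DAG built below mirrors this sequence using FP_EXTEND for
// the CVTs, RCP, FMUL and FMAD (or FMA) for the refinement steps, an AND that
// keeps only the sign/exponent bits of the error term, FP_ROUND for the
// down-convert, and DIV_FIXUP for the final touch-up.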
12306
12307 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12308 unsigned FMADOpCode =
12309 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12310 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12311 SDValue Rcp =
12312 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12313 SDValue Quot =
12314 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12315 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12316 Op->getFlags());
12317 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12318 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12319 Op->getFlags());
12320 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12321 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12322 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12323 DAG.getConstant(0xff800000, SL, MVT::i32));
12324 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12325 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12326 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12327 DAG.getTargetConstant(0, SL, MVT::i32));
12328 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12329 Op->getFlags());
12330}
12331
12332// Faster 2.5 ULP division that does not support denormals.
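// Rough shape of the expansion below (paraphrased, not from the source): when
// |den| > 0x1p+96, the denominator is pre-scaled by 0x1p-32 before v_rcp_f32
// so its reciprocal stays well away from the denormal range that rcp flushes,
// and the final quotient is multiplied by the same 0x1p-32 to compensate;
// otherwise the scale factor is 1.0 and the result is simply num * rcp(den).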
12333SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12334 SDNodeFlags Flags = Op->getFlags();
12335 SDLoc SL(Op);
12336 SDValue LHS = Op.getOperand(1);
12337 SDValue RHS = Op.getOperand(2);
12338
12339 // TODO: The combiner should probably handle elimination of redundant fabs.
12341 ? RHS
12342 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12343
12344 const APFloat K0Val(0x1p+96f);
12345 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12346
12347 const APFloat K1Val(0x1p-32f);
12348 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12349
12350 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12351
12352 EVT SetCCVT =
12353 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12354
12355 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12356
12357 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12358
12359 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12360
12361 // rcp does not support denormals.
12362 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12363
12364 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12365
12366 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12367}
12368
12369// Returns immediate value for setting the F32 denorm mode when using the
12370// S_DENORM_MODE instruction.
12371 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12372 const SIMachineFunctionInfo *Info,
12373 const GCNSubtarget *ST) {
12374 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12375 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12376 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12377 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12378}
12379
12380SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12381 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12382 return FastLowered;
12383
12384 // The selection matcher assumes anything with a chain selecting to a
12385 // mayRaiseFPException machine instruction. Since we're introducing a chain
12386 // here, we need to explicitly report nofpexcept for the regular fdiv
12387 // lowering.
12388 SDNodeFlags Flags = Op->getFlags();
12389 Flags.setNoFPExcept(true);
12390
12391 SDLoc SL(Op);
12392 SDValue LHS = Op.getOperand(0);
12393 SDValue RHS = Op.getOperand(1);
12394
12395 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12396
12397 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12398
12399 SDValue DenominatorScaled =
12400 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12401 SDValue NumeratorScaled =
12402 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12403
12404 // Denominator is scaled to not be denormal, so using rcp is ok.
12405 SDValue ApproxRcp =
12406 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12407 SDValue NegDivScale0 =
12408 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12409
12410 using namespace AMDGPU::Hwreg;
12411 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12412 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12413
12414 const MachineFunction &MF = DAG.getMachineFunction();
12415 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12416 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12417
12418 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12419 const bool HasDynamicDenormals =
12420 (DenormMode.Input == DenormalMode::Dynamic) ||
12421 (DenormMode.Output == DenormalMode::Dynamic);
12422
12423 SDValue SavedDenormMode;
12424
12425 if (!PreservesDenormals) {
12426 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12427 // lowering. The chain dependence is insufficient, and we need glue. We do
12428 // not need the glue variants in a strictfp function.
12429
12430 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12431
12432 SDValue Glue = DAG.getEntryNode();
12433 if (HasDynamicDenormals) {
12434 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12435 DAG.getVTList(MVT::i32, MVT::Glue),
12436 {BitField, Glue});
12437 SavedDenormMode = SDValue(GetReg, 0);
12438
12439 Glue = DAG.getMergeValues(
12440 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12441 }
12442
12443 SDNode *EnableDenorm;
12444 if (Subtarget->hasDenormModeInst()) {
12445 const SDValue EnableDenormValue =
12447 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12448 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12449 EnableDenormValue)
12450 .getNode();
12451 } else {
12452 const SDValue EnableDenormValue =
12453 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12454 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12455 {EnableDenormValue, BitField, Glue});
12456 }
12457
12458 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12459 SDValue(EnableDenorm, 1)};
12460
12461 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12462 }
12463
12464 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12465 ApproxRcp, One, NegDivScale0, Flags);
12466
12467 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12468 ApproxRcp, Fma0, Flags);
12469
12470 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12471 Fma1, Flags);
12472
12473 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12474 NumeratorScaled, Mul, Flags);
12475
12476 SDValue Fma3 =
12477 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12478
12479 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12480 NumeratorScaled, Fma3, Flags);
12481
12482 if (!PreservesDenormals) {
12483 SDNode *DisableDenorm;
12484 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12485 const SDValue DisableDenormValue = getSPDenormModeValue(
12486 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12487
12488 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12489 DisableDenorm =
12490 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12491 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12492 .getNode();
12493 } else {
12494 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12495 const SDValue DisableDenormValue =
12496 HasDynamicDenormals
12497 ? SavedDenormMode
12498 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12499
12500 DisableDenorm = DAG.getMachineNode(
12501 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12502 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12503 }
12504
12505 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12506 SDValue(DisableDenorm, 0), DAG.getRoot());
12507 DAG.setRoot(OutputChain);
12508 }
12509
12510 SDValue Scale = NumeratorScaled.getValue(1);
12511 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12512 {Fma4, Fma1, Fma3, Scale}, Flags);
12513
12514 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12515}
12516
12517SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12518 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12519 return FastLowered;
12520
12521 SDLoc SL(Op);
12522 SDValue X = Op.getOperand(0);
12523 SDValue Y = Op.getOperand(1);
12524
12525 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12526
12527 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12528
12529 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12530
12531 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12532
12533 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12534
12535 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12536
12537 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12538
12539 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12540
12541 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12542
12543 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12544 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12545
12546 SDValue Fma4 =
12547 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12548
12549 SDValue Scale;
12550
12551 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12552 // Workaround a hardware bug on SI where the condition output from div_scale
12553 // is not usable.
12554
12555 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12556
12557 // Figure out the scale to use for div_fmas.
12558 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12559 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12560 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12561 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12562
12563 SDValue NumHi =
12564 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12565 SDValue DenHi =
12566 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12567
12568 SDValue Scale0Hi =
12569 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12570 SDValue Scale1Hi =
12571 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12572
12573 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12574 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12575 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12576 } else {
12577 Scale = DivScale1.getValue(1);
12578 }
12579
12580 SDValue Fmas =
12581 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12582
12583 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12584}
12585
12586SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12587 EVT VT = Op.getValueType();
12588
12589 if (VT == MVT::f32)
12590 return LowerFDIV32(Op, DAG);
12591
12592 if (VT == MVT::f64)
12593 return LowerFDIV64(Op, DAG);
12594
12595 if (VT == MVT::f16 || VT == MVT::bf16)
12596 return LowerFDIV16(Op, DAG);
12597
12598 llvm_unreachable("Unexpected type for fdiv");
12599}
12600
12601SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12602 SDLoc dl(Op);
12603 SDValue Val = Op.getOperand(0);
12604 EVT VT = Val.getValueType();
12605 EVT ResultExpVT = Op->getValueType(1);
12606 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12607
12608 SDValue Mant = DAG.getNode(
12609 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12610 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12611
12612 SDValue Exp = DAG.getNode(
12613 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12614 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12615
12616 if (Subtarget->hasFractBug()) {
12617 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12618 SDValue Inf =
12619 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12620
12621 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12622 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12623 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12624 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12625 }
12626
12627 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12628 return DAG.getMergeValues({Mant, CastExp}, dl);
12629}
12630
12631SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12632 SDLoc DL(Op);
12633 StoreSDNode *Store = cast<StoreSDNode>(Op);
12634 EVT VT = Store->getMemoryVT();
12635
12636 if (VT == MVT::i1) {
12637 return DAG.getTruncStore(
12638 Store->getChain(), DL,
12639 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12640 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12641 }
12642
12643 assert(VT.isVector() &&
12644 Store->getValue().getValueType().getScalarType() == MVT::i32);
12645
12646 unsigned AS = Store->getAddressSpace();
12647 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12648 Store->getAlign().value() < VT.getStoreSize() &&
12649 VT.getSizeInBits() > 32) {
12650 return SplitVectorStore(Op, DAG);
12651 }
12652
12653 MachineFunction &MF = DAG.getMachineFunction();
12654 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12655 // If there is a possibility that flat instructions access scratch memory
12656 // then we need to use the same legalization rules we use for private.
12657 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12658 !Subtarget->hasMultiDwordFlatScratchAddressing())
12659 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12660 ? AMDGPUAS::PRIVATE_ADDRESS
12661 : AMDGPUAS::GLOBAL_ADDRESS;
12662
12663 unsigned NumElements = VT.getVectorNumElements();
12664 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12665 if (NumElements > 4)
12666 return SplitVectorStore(Op, DAG);
12667 // v3 stores not supported on SI.
12668 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12669 return SplitVectorStore(Op, DAG);
12670
12671 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12672 VT, *Store->getMemOperand()))
12673 return expandUnalignedStore(Store, DAG);
12674
12675 return SDValue();
12676 }
12677 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12678 switch (Subtarget->getMaxPrivateElementSize()) {
12679 case 4:
12680 return scalarizeVectorStore(Store, DAG);
12681 case 8:
12682 if (NumElements > 2)
12683 return SplitVectorStore(Op, DAG);
12684 return SDValue();
12685 case 16:
12686 if (NumElements > 4 ||
12687 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12688 return SplitVectorStore(Op, DAG);
12689 return SDValue();
12690 default:
12691 llvm_unreachable("unsupported private_element_size");
12692 }
12693 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12694 unsigned Fast = 0;
12695 auto Flags = Store->getMemOperand()->getFlags();
12696 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12697 Store->getAlign(), Flags, &Fast) &&
12698 Fast > 1)
12699 return SDValue();
12700
12701 if (VT.isVector())
12702 return SplitVectorStore(Op, DAG);
12703
12704 return expandUnalignedStore(Store, DAG);
12705 }
12706
12707 // Probably an invalid store. If so we'll end up emitting a selection error.
12708 return SDValue();
12709}
12710
12711// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12712SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12713 SDLoc SL(Op);
12714 assert(!Subtarget->has16BitInsts());
12715 SDNodeFlags Flags = Op->getFlags();
12716 SDValue Ext =
12717 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12718
12719 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12720 SDValue Sqrt =
12721 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12722
12723 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12724 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12725}
12726
12727SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12728 SDLoc DL(Op);
12729 SDNodeFlags Flags = Op->getFlags();
12730 MVT VT = Op.getValueType().getSimpleVT();
12731 const SDValue X = Op.getOperand(0);
12732
12733 if (allowApproxFunc(DAG, Flags)) {
12734 // Instruction is 1ulp but ignores denormals.
12735 return DAG.getNode(
12736 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12737 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12738 }
12739
12740 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12741 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12742
12743 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12744
12745 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12746
12747 SDValue SqrtX =
12748 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12749
12750 SDValue SqrtS;
12751 if (needsDenormHandlingF32(DAG, X, Flags)) {
12752 SDValue SqrtID =
12753 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12754 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12755
12756 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12757 SDValue SqrtSNextDownInt =
12758 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12759 DAG.getAllOnesConstant(DL, MVT::i32));
12760 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12761
12762 SDValue NegSqrtSNextDown =
12763 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12764
12765 SDValue SqrtVP =
12766 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12767
12768 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12769 DAG.getConstant(1, DL, MVT::i32));
12770 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12771
12772 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12773 SDValue SqrtVS =
12774 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12775
12776 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12777 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12778
12779 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12780 Flags);
12781
12782 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12783 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12784 Flags);
12785 } else {
12786 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12787
12788 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12789
12790 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12791 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12792 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12793
12794 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12795 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12796 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12797
12798 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12799 SDValue SqrtD =
12800 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12801 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12802 }
12803
12804 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12805
12806 SDValue ScaledDown =
12807 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12808
12809 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12810 SDValue IsZeroOrInf =
12811 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12812 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12813
12814 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12815}
12816
12817SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12818 // For double type, the SQRT and RSQ instructions don't have required
12819 // precision, we apply Goldschmidt's algorithm to improve the result:
12820 //
12821 // y0 = rsq(x)
12822 // g0 = x * y0
12823 // h0 = 0.5 * y0
12824 //
12825 // r0 = 0.5 - h0 * g0
12826 // g1 = g0 * r0 + g0
12827 // h1 = h0 * r0 + h0
12828 //
12829 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12830 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12831 // h2 = h1 * r1 + h1
12832 //
12833 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12834 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12835 //
12836 // sqrt(x) = g3
12837
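// Intuition (added note): g_i converges to sqrt(x) and h_i to 0.5/sqrt(x), so
// each r_i = 0.5 - h_i*g_i (equivalently d_i = x - g_i*g_i) measures the
// remaining error, and the fused multiply-adds below refine both estimates.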
12838 SDNodeFlags Flags = Op->getFlags();
12839
12840 SDLoc DL(Op);
12841
12842 SDValue X = Op.getOperand(0);
12843 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12844
12845 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12846
12847 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12848
12849 // Scale up input if it is too small.
12850 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12851 SDValue ScaleUp =
12852 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12853 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12854
12855 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12856
12857 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12858
12859 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12860 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12861
12862 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12863 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12864
12865 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12866
12867 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12868
12869 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12870 SDValue SqrtD0 =
12871 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12872
12873 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12874
12875 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12876 SDValue SqrtD1 =
12877 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12878
12879 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12880
12881 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12882 SDValue ScaleDown =
12883 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12884 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12885
12886 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12887 // with finite only or nsz because rsq(+/-0) = +/-inf
12888
12889 // TODO: Check for DAZ and expand to subnormals
12890 SDValue IsZeroOrInf =
12891 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12892 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12893
12894 // If x is +INF, +0, or -0, use its original value
12895 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12896 Flags);
12897}
12898
12899SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12900 SDLoc DL(Op);
12901 EVT VT = Op.getValueType();
12902 SDValue Arg = Op.getOperand(0);
12903 SDValue TrigVal;
12904
12905 // Propagate fast-math flags so that the multiply we introduce can be folded
12906 // if Arg is already the result of a multiply by constant.
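  // The hardware SIN/COS take an argument scaled by 1/(2*pi) (i.e. with a
  // period of 1.0), which is why Arg is multiplied by OneOver2Pi below.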
12907 auto Flags = Op->getFlags();
12908
12909 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12910
12911 if (Subtarget->hasTrigReducedRange()) {
12912 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12913 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12914 } else {
12915 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12916 }
12917
12918 switch (Op.getOpcode()) {
12919 case ISD::FCOS:
12920 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12921 case ISD::FSIN:
12922 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12923 default:
12924 llvm_unreachable("Wrong trig opcode");
12925 }
12926}
12927
12928SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12929 SelectionDAG &DAG) const {
12930 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12931 assert(AtomicNode->isCompareAndSwap());
12932 unsigned AS = AtomicNode->getAddressSpace();
12933
12934 // No custom lowering required for local address space
12935  if (AS == AMDGPUAS::LOCAL_ADDRESS)
12936    return Op;
12937
12938 // Non-local address space requires custom lowering for atomic compare
12939 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12940 SDLoc DL(Op);
12941 SDValue ChainIn = Op.getOperand(0);
12942 SDValue Addr = Op.getOperand(1);
12943 SDValue Old = Op.getOperand(2);
12944 SDValue New = Op.getOperand(3);
12945 EVT VT = Op.getValueType();
12946 MVT SimpleVT = VT.getSimpleVT();
12947 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12948
12949 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12950 SDValue Ops[] = {ChainIn, Addr, NewOld};
12951
12952 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12953 Op->getVTList(), Ops, VT,
12954 AtomicNode->getMemOperand());
12955}
12956
12957//===----------------------------------------------------------------------===//
12958// Custom DAG optimizations
12959//===----------------------------------------------------------------------===//
12960
12961SDValue
12962SITargetLowering::performUCharToFloatCombine(SDNode *N,
12963 DAGCombinerInfo &DCI) const {
12964 EVT VT = N->getValueType(0);
12965 EVT ScalarVT = VT.getScalarType();
12966 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12967 return SDValue();
12968
12969 SelectionDAG &DAG = DCI.DAG;
12970 SDLoc DL(N);
12971
12972 SDValue Src = N->getOperand(0);
12973 EVT SrcVT = Src.getValueType();
12974
12975 // TODO: We could try to match extracting the higher bytes, which would be
12976 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12977 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12978 // about in practice.
12979 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12980 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12981 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12982 DCI.AddToWorklist(Cvt.getNode());
12983
12984 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12985 if (ScalarVT != MVT::f32) {
12986 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12987 DAG.getTargetConstant(0, DL, MVT::i32));
12988 }
12989 return Cvt;
12990 }
12991 }
12992
12993 return SDValue();
12994}
12995
12996SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12997 DAGCombinerInfo &DCI) const {
12998 SDValue MagnitudeOp = N->getOperand(0);
12999 SDValue SignOp = N->getOperand(1);
13000
13001 // The generic combine for fcopysign + fp cast is too conservative with
13002 // vectors, and also gets confused by the splitting we will perform here, so
13003 // peek through FP casts.
13004 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13005 SignOp.getOpcode() == ISD::FP_ROUND)
13006 SignOp = SignOp.getOperand(0);
13007
13008 SelectionDAG &DAG = DCI.DAG;
13009 SDLoc DL(N);
13010 EVT SignVT = SignOp.getValueType();
13011
13012 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13013 // lower half with a copy.
13014 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
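  // (The f64 sign bit is bit 63, the top bit of the high 32-bit half, so only
  // the high half needs the copysign.)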
13015 EVT MagVT = MagnitudeOp.getValueType();
13016
13017 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13018
13019 if (MagVT.getScalarType() == MVT::f64) {
13020 EVT F32VT = MagVT.isVector()
13021 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13022 : MVT::v2f32;
13023
13024 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13025
13026    SmallVector<SDValue, 8> NewElts;
13027    for (unsigned I = 0; I != NumElts; ++I) {
13028 SDValue MagLo =
13029 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13030 DAG.getConstant(2 * I, DL, MVT::i32));
13031 SDValue MagHi =
13032 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13033 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13034
13035 SDValue SignOpElt =
13036 MagVT.isVector()
13037              ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
13038                            SignOp, DAG.getConstant(I, DL, MVT::i32))
13039 : SignOp;
13040
13041 SDValue HiOp =
13042 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13043
13044 SDValue Vector =
13045 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13046
13047 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13048 NewElts.push_back(NewElt);
13049 }
13050
13051 if (NewElts.size() == 1)
13052 return NewElts[0];
13053
13054 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13055 }
13056
13057 if (SignVT.getScalarType() != MVT::f64)
13058 return SDValue();
13059
13060  // Reduce the width of the sign operand; we only need the highest bit.
13061 //
13062 // fcopysign f64:x, f64:y ->
13063 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13064 // TODO: In some cases it might make sense to go all the way to f16.
13065
13066 EVT F32VT = MagVT.isVector()
13067 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13068 : MVT::v2f32;
13069
13070 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13071
13072 SmallVector<SDValue, 8> F32Signs;
13073 for (unsigned I = 0; I != NumElts; ++I) {
13074 // Take sign from odd elements of cast vector
13075 SDValue SignAsF32 =
13076 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13077 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13078 F32Signs.push_back(SignAsF32);
13079 }
13080
13081 SDValue NewSign =
13082 NumElts == 1
13083 ? F32Signs.back()
13084          : DAG.getNode(ISD::BUILD_VECTOR, DL,
13085                        EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13086 F32Signs);
13087
13088 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13089 NewSign);
13090}
13091
13092// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13093// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13094// bits
13095
13096// This is a variant of
13097// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13098//
13099// The normal DAG combiner will do this, but only if the add has one use since
13100// that would increase the number of instructions.
13101//
13102// This prevents us from seeing a constant offset that can be folded into a
13103// memory instruction's addressing mode. If we know the resulting add offset of
13104// a pointer can be folded into an addressing offset, we can replace the pointer
13105// operand with the add of new constant offset. This eliminates one of the uses,
13106 // operand with the add of the new constant offset. This eliminates one of the uses,
13107//
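// A worked example with made-up constants: if a load's pointer operand is
//   (shl (add %x, 16), 2)
// this combine rewrites it to
//   (add (shl %x, 2), 64)
// so that the constant 64 can be folded into the memory instruction's
// immediate offset when the addressing mode allows it.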
13108SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13109 EVT MemVT,
13110 DAGCombinerInfo &DCI) const {
13111 SDValue N0 = N->getOperand(0);
13112 SDValue N1 = N->getOperand(1);
13113
13114 // We only do this to handle cases where it's profitable when there are
13115 // multiple uses of the add, so defer to the standard combine.
13116 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13117 return SDValue();
13118
13119 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13120 if (!CN1)
13121 return SDValue();
13122
13123 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13124 if (!CAdd)
13125 return SDValue();
13126
13127 SelectionDAG &DAG = DCI.DAG;
13128
13129 if (N0->getOpcode() == ISD::OR &&
13130 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13131 return SDValue();
13132
13133 // If the resulting offset is too large, we can't fold it into the
13134 // addressing mode offset.
13135 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13136 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13137
13138 AddrMode AM;
13139 AM.HasBaseReg = true;
13140 AM.BaseOffs = Offset.getSExtValue();
13141 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13142 return SDValue();
13143
13144 SDLoc SL(N);
13145 EVT VT = N->getValueType(0);
13146
13147 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13148 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13149
13150 SDNodeFlags Flags;
13151 Flags.setNoUnsignedWrap(
13152 N->getFlags().hasNoUnsignedWrap() &&
13153 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13154
13155 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13156 // be sure that the new left operand is a proper base pointer.
13157 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13158}
13159
13160 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
13161 /// by the chain and intrinsic ID. Theoretically we would also need to check the
13162/// specific intrinsic, but they all place the pointer operand first.
13163static unsigned getBasePtrIndex(const MemSDNode *N) {
13164 switch (N->getOpcode()) {
13165 case ISD::STORE:
13166   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
13167   case ISD::ATOMIC_CMP_SWAP:
13168     return 2;
13169 default:
13170 return 1;
13171 }
13172}
13173
13174SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13175 DAGCombinerInfo &DCI) const {
13176 SelectionDAG &DAG = DCI.DAG;
13177
13178 unsigned PtrIdx = getBasePtrIndex(N);
13179 SDValue Ptr = N->getOperand(PtrIdx);
13180
13181 // TODO: We could also do this for multiplies.
13182 if (Ptr.getOpcode() == ISD::SHL) {
13183 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13184 N->getMemoryVT(), DCI);
13185 if (NewPtr) {
13186 SmallVector<SDValue, 8> NewOps(N->ops());
13187
13188 NewOps[PtrIdx] = NewPtr;
13189 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13190 }
13191 }
13192
13193 return SDValue();
13194}
13195
13196static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13197 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13198 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13199 (Opc == ISD::XOR && Val == 0);
13200}
13201
13202 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor. This
13203// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13204// integer combine opportunities since most 64-bit operations are decomposed
13205// this way. TODO: We won't want this for SALU especially if it is an inline
13206// immediate.
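// For illustration (constant chosen arbitrarily): an i64 `and %x, 0x0000ffff00000000`
// splits into `and lo(%x), 0x00000000`, which reduces to 0, and
// `and hi(%x), 0x0000ffff`, exposing further 32-bit combines.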
13207SDValue SITargetLowering::splitBinaryBitConstantOp(
13208 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13209 const ConstantSDNode *CRHS) const {
13210 uint64_t Val = CRHS->getZExtValue();
13211 uint32_t ValLo = Lo_32(Val);
13212 uint32_t ValHi = Hi_32(Val);
13213 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13214
13215 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13216        bitOpWithConstantIsReducible(Opc, ValHi)) ||
13217       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13218 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13219 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13220 !CRHS->user_begin()->isDivergent())
13221 return SDValue();
13222
13223 // If we need to materialize a 64-bit immediate, it will be split up later
13224 // anyway. Avoid creating the harder to understand 64-bit immediate
13225 // materialization.
13226 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13227 }
13228
13229 return SDValue();
13230}
13231
13232 bool llvm::isBoolSGPR(SDValue V) {
13233   if (V.getValueType() != MVT::i1)
13234 return false;
13235 switch (V.getOpcode()) {
13236 default:
13237 break;
13238 case ISD::SETCC:
13239 case ISD::IS_FPCLASS:
13240 case AMDGPUISD::FP_CLASS:
13241 return true;
13242 case ISD::AND:
13243 case ISD::OR:
13244 case ISD::XOR:
13245 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13246 case ISD::SADDO:
13247 case ISD::UADDO:
13248 case ISD::SSUBO:
13249 case ISD::USUBO:
13250 case ISD::SMULO:
13251 case ISD::UMULO:
13252 return V.getResNo() == 1;
13254 unsigned IntrinsicID = V.getConstantOperandVal(0);
13255 switch (IntrinsicID) {
13256 case Intrinsic::amdgcn_is_shared:
13257 case Intrinsic::amdgcn_is_private:
13258 return true;
13259 default:
13260 return false;
13261 }
13262
13263 return false;
13264 }
13265 }
13266 return false;
13267}
13268
13269// If a constant has all zeroes or all ones within each byte return it.
13270// Otherwise return 0.
13271 static uint32_t getConstantPermuteMask(uint32_t C) {
13272   // 0xff for any zero byte in the mask
13273 uint32_t ZeroByteMask = 0;
13274 if (!(C & 0x000000ff))
13275 ZeroByteMask |= 0x000000ff;
13276 if (!(C & 0x0000ff00))
13277 ZeroByteMask |= 0x0000ff00;
13278 if (!(C & 0x00ff0000))
13279 ZeroByteMask |= 0x00ff0000;
13280 if (!(C & 0xff000000))
13281 ZeroByteMask |= 0xff000000;
13282 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13283 if ((NonZeroByteMask & C) != NonZeroByteMask)
13284 return 0; // Partial bytes selected.
13285 return C;
13286}
13287
13288// Check if a node selects whole bytes from its operand 0 starting at a byte
13289// boundary while masking the rest. Returns select mask as in the v_perm_b32
13290 // or ~0u if it does not succeed.
13291// Note byte select encoding:
13292// value 0-3 selects corresponding source byte;
13293// value 0xc selects zero;
13294// value 0xff selects 0xff.
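// For example (illustrative): `srl x, 8` keeps source bytes 1..3 in result
// bytes 0..2 and zeroes result byte 3, so the returned mask is 0x0c030201.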
13295 static uint32_t getPermuteMask(SDValue V) {
13296   assert(V.getValueSizeInBits() == 32);
13297
13298 if (V.getNumOperands() != 2)
13299 return ~0;
13300
13301 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13302 if (!N1)
13303 return ~0;
13304
13305 uint32_t C = N1->getZExtValue();
13306
13307 switch (V.getOpcode()) {
13308 default:
13309 break;
13310 case ISD::AND:
13311 if (uint32_t ConstMask = getConstantPermuteMask(C))
13312 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13313 break;
13314
13315 case ISD::OR:
13316 if (uint32_t ConstMask = getConstantPermuteMask(C))
13317 return (0x03020100 & ~ConstMask) | ConstMask;
13318 break;
13319
13320 case ISD::SHL:
13321 if (C % 8)
13322 return ~0;
13323
13324 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13325
13326 case ISD::SRL:
13327 if (C % 8)
13328 return ~0;
13329
13330 return uint32_t(0x0c0c0c0c03020100ull >> C);
13331 }
13332
13333 return ~0;
13334}
13335
13336SDValue SITargetLowering::performAndCombine(SDNode *N,
13337 DAGCombinerInfo &DCI) const {
13338 if (DCI.isBeforeLegalize())
13339 return SDValue();
13340
13341 SelectionDAG &DAG = DCI.DAG;
13342 EVT VT = N->getValueType(0);
13343 SDValue LHS = N->getOperand(0);
13344 SDValue RHS = N->getOperand(1);
13345
13346 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13347 if (VT == MVT::i64 && CRHS) {
13348 if (SDValue Split =
13349 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13350 return Split;
13351 }
13352
13353 if (CRHS && VT == MVT::i32) {
13354 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13355 // nb = number of trailing zeroes in mask
13356 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13357 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
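    // For example (illustrative): `and (srl x, 8), 0xff00` has Bits = 8 and
    // nb = 8, so Offset = 16 and the result becomes `shl (bfe x, 16, 8), 8`.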
13358 uint64_t Mask = CRHS->getZExtValue();
13359 unsigned Bits = llvm::popcount(Mask);
13360 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13361 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13362 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13363 unsigned Shift = CShift->getZExtValue();
13364 unsigned NB = CRHS->getAPIntValue().countr_zero();
13365 unsigned Offset = NB + Shift;
13366 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13367 SDLoc SL(N);
13368 SDValue BFE =
13369 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13370 DAG.getConstant(Offset, SL, MVT::i32),
13371 DAG.getConstant(Bits, SL, MVT::i32));
13372 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13373 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13374 DAG.getValueType(NarrowVT));
13375 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13376 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13377 return Shl;
13378 }
13379 }
13380 }
13381
13382 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13383 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13384 isa<ConstantSDNode>(LHS.getOperand(2))) {
13385 uint32_t Sel = getConstantPermuteMask(Mask);
13386 if (!Sel)
13387 return SDValue();
13388
13389 // Select 0xc for all zero bytes
13390 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13391 SDLoc DL(N);
13392 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13393 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13394 }
13395 }
13396
13397 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13398 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13399 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13400 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13401 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13402
13403 SDValue X = LHS.getOperand(0);
13404 SDValue Y = RHS.getOperand(0);
13405 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13406 !isTypeLegal(X.getValueType()))
13407 return SDValue();
13408
13409 if (LCC == ISD::SETO) {
13410 if (X != LHS.getOperand(1))
13411 return SDValue();
13412
13413 if (RCC == ISD::SETUNE) {
13414 const ConstantFPSDNode *C1 =
13415 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13416 if (!C1 || !C1->isInfinity() || C1->isNegative())
13417 return SDValue();
13418
13419         const uint32_t Mask = SIInstrFlags::N_NORMAL |
13420                               SIInstrFlags::S_NORMAL | SIInstrFlags::P_NORMAL |
13421                               SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO |
13422                               SIInstrFlags::N_SUBNORMAL | SIInstrFlags::P_SUBNORMAL;
13423
13424         static_assert(
13425             ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13426                 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13427              0x3ff) == Mask,
13428             "mask not equal");
13429
13430 SDLoc DL(N);
13431 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13432 DAG.getConstant(Mask, DL, MVT::i32));
13433 }
13434 }
13435 }
13436
13437 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13438 std::swap(LHS, RHS);
13439
13440 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13441 RHS.hasOneUse()) {
13442 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13443     // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan|n_nan)
13444     // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan|n_nan)
13446 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13447 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13448 (RHS.getOperand(0) == LHS.getOperand(0) &&
13449 LHS.getOperand(0) == LHS.getOperand(1))) {
13450 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13451 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13452 : Mask->getZExtValue() & OrdMask;
13453
13454 SDLoc DL(N);
13455 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13456 DAG.getConstant(NewMask, DL, MVT::i32));
13457 }
13458 }
13459
13460 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13461 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13462 // and x, (sext cc from i1) => select cc, x, 0
13463 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13464 std::swap(LHS, RHS);
13465 if (isBoolSGPR(RHS.getOperand(0)))
13466 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13467 DAG.getConstant(0, SDLoc(N), MVT::i32));
13468 }
13469
13470 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13471 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13472 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13473 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13474 uint32_t LHSMask = getPermuteMask(LHS);
13475 uint32_t RHSMask = getPermuteMask(RHS);
13476 if (LHSMask != ~0u && RHSMask != ~0u) {
13477 // Canonicalize the expression in an attempt to have fewer unique masks
13478 // and therefore fewer registers used to hold the masks.
13479 if (LHSMask > RHSMask) {
13480 std::swap(LHSMask, RHSMask);
13481 std::swap(LHS, RHS);
13482 }
13483
13484 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13485       // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
13486 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13487 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13488
13489       // Check if we need to combine values from two sources within a byte.
13490 if (!(LHSUsedLanes & RHSUsedLanes) &&
13491 // If we select high and lower word keep it for SDWA.
13492 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13493 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13494         // Each byte in each mask is either a selector value 0-3, or has higher
13495         // bits set in either of the masks: 0xff selects the constant 0xff and
13496         // 0x0c selects zero. If 0x0c is in either mask it shall always be 0x0c.
13497         // Otherwise the mask which is not 0xff wins. By ANDing both masks we get
13498         // a correct result, except that 0x0c shall be corrected to give only 0x0c.
13499 uint32_t Mask = LHSMask & RHSMask;
13500 for (unsigned I = 0; I < 32; I += 8) {
13501 uint32_t ByteSel = 0xff << I;
13502 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13503 Mask &= (0x0c << I) & 0xffffffff;
13504 }
13505
13506 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13507 // or 0x0c.
13508 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13509 SDLoc DL(N);
13510
13511 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13512 RHS.getOperand(0),
13513 DAG.getConstant(Sel, DL, MVT::i32));
13514 }
13515 }
13516 }
13517
13518 return SDValue();
13519}
13520
13521// A key component of v_perm is a mapping between byte position of the src
13522// operands, and the byte position of the dest. To provide such, we need: 1. the
13523// node that provides x byte of the dest of the OR, and 2. the byte of the node
13524// used to provide that x byte. calculateByteProvider finds which node provides
13525// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13526 // and finds an ultimate src and byte position. For example, the supported
13527// LoadCombine pattern for vector loads is as follows
13528// t1
13529// or
13530// / \
13531// t2 t3
13532// zext shl
13533// | | \
13534// t4 t5 16
13535// or anyext
13536// / \ |
13537// t6 t7 t8
13538// srl shl or
13539// / | / \ / \
13540// t9 t10 t11 t12 t13 t14
13541// trunc* 8 trunc* 8 and and
13542// | | / | | \
13543// t15 t16 t17 t18 t19 t20
13544// trunc* 255 srl -256
13545// | / \
13546// t15 t15 16
13547//
13548// *In this example, the truncs are from i32->i16
13549//
13550// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13551// respectively. calculateSrcByte would find (given node) -> ultimate src &
13552 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13553// After finding the mapping, we can combine the tree into vperm t15, t16,
13554// 0x05000407
13555
13556// Find the source and byte position from a node.
13557// \p DestByte is the byte position of the dest of the or that the src
13558// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13559// dest of the or byte. \p Depth tracks how many recursive iterations we have
13560// performed.
13561static const std::optional<ByteProvider<SDValue>>
13562calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13563 unsigned Depth = 0) {
13564 // We may need to recursively traverse a series of SRLs
13565 if (Depth >= 6)
13566 return std::nullopt;
13567
13568 if (Op.getValueSizeInBits() < 8)
13569 return std::nullopt;
13570
13571 if (Op.getValueType().isVector())
13572 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13573
13574 switch (Op->getOpcode()) {
13575 case ISD::TRUNCATE: {
13576 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13577 }
13578
13579 case ISD::SIGN_EXTEND:
13580 case ISD::ZERO_EXTEND:
13581   case ISD::SIGN_EXTEND_INREG: {
13582     SDValue NarrowOp = Op->getOperand(0);
13583 auto NarrowVT = NarrowOp.getValueType();
13584 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13585 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13586 NarrowVT = VTSign->getVT();
13587 }
13588 if (!NarrowVT.isByteSized())
13589 return std::nullopt;
13590 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13591
13592 if (SrcIndex >= NarrowByteWidth)
13593 return std::nullopt;
13594 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13595 }
13596
13597 case ISD::SRA:
13598 case ISD::SRL: {
13599 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13600 if (!ShiftOp)
13601 return std::nullopt;
13602
13603 uint64_t BitShift = ShiftOp->getZExtValue();
13604
13605 if (BitShift % 8 != 0)
13606 return std::nullopt;
13607
13608 SrcIndex += BitShift / 8;
13609
13610 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13611 }
13612
13613 default: {
13614 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13615 }
13616 }
13617 llvm_unreachable("fully handled switch");
13618}
13619
13620// For a byte position in the result of an Or, traverse the tree and find the
13621// node (and the byte of the node) which ultimately provides this {Or,
13622// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13623// the byte position of the Op that corresponds with the originally requested
13624// byte of the Or \p Depth tracks how many recursive iterations we have
13625// performed. \p StartingIndex is the originally requested byte of the Or
13626static const std::optional<ByteProvider<SDValue>>
13627calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13628 unsigned StartingIndex = 0) {
13629 // Finding Src tree of RHS of or typically requires at least 1 additional
13630 // depth
13631 if (Depth > 6)
13632 return std::nullopt;
13633
13634 unsigned BitWidth = Op.getScalarValueSizeInBits();
13635 if (BitWidth % 8 != 0)
13636 return std::nullopt;
13637 if (Index > BitWidth / 8 - 1)
13638 return std::nullopt;
13639
13640 bool IsVec = Op.getValueType().isVector();
13641 switch (Op.getOpcode()) {
13642 case ISD::OR: {
13643 if (IsVec)
13644 return std::nullopt;
13645
13646 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13647 StartingIndex);
13648 if (!RHS)
13649 return std::nullopt;
13650 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13651 StartingIndex);
13652 if (!LHS)
13653 return std::nullopt;
13654 // A well formed Or will have two ByteProviders for each byte, one of which
13655 // is constant zero
13656 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13657 return std::nullopt;
13658 if (!LHS || LHS->isConstantZero())
13659 return RHS;
13660 if (!RHS || RHS->isConstantZero())
13661 return LHS;
13662 return std::nullopt;
13663 }
13664
13665 case ISD::AND: {
13666 if (IsVec)
13667 return std::nullopt;
13668
13669 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13670 if (!BitMaskOp)
13671 return std::nullopt;
13672
13673 uint32_t BitMask = BitMaskOp->getZExtValue();
13674 // Bits we expect for our StartingIndex
13675 uint32_t IndexMask = 0xFF << (Index * 8);
13676
13677 if ((IndexMask & BitMask) != IndexMask) {
13678 // If the result of the and partially provides the byte, then it
13679 // is not well formatted
13680 if (IndexMask & BitMask)
13681 return std::nullopt;
13682       return ByteProvider<SDValue>::getConstantZero();
13683     }
13684
13685 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13686 }
13687
13688 case ISD::FSHR: {
13689 if (IsVec)
13690 return std::nullopt;
13691
13692 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
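    // For example (illustrative): for `fshr i32 X, Y, 8` the concatenation X:Y
    // is 8 bytes wide and ByteShift = 1, so result byte 0 comes from byte 1 of
    // Y and result byte 3 comes from byte 0 of X.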
13693 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13694 if (!ShiftOp || Op.getValueType().isVector())
13695 return std::nullopt;
13696
13697 uint64_t BitsProvided = Op.getValueSizeInBits();
13698 if (BitsProvided % 8 != 0)
13699 return std::nullopt;
13700
13701 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13702 if (BitShift % 8)
13703 return std::nullopt;
13704
13705 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13706 uint64_t ByteShift = BitShift / 8;
13707
13708 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13709 uint64_t BytesProvided = BitsProvided / 8;
13710 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13711 NewIndex %= BytesProvided;
13712 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13713 }
13714
13715 case ISD::SRA:
13716 case ISD::SRL: {
13717 if (IsVec)
13718 return std::nullopt;
13719
13720 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13721 if (!ShiftOp)
13722 return std::nullopt;
13723
13724 uint64_t BitShift = ShiftOp->getZExtValue();
13725 if (BitShift % 8)
13726 return std::nullopt;
13727
13728 auto BitsProvided = Op.getScalarValueSizeInBits();
13729 if (BitsProvided % 8 != 0)
13730 return std::nullopt;
13731
13732 uint64_t BytesProvided = BitsProvided / 8;
13733 uint64_t ByteShift = BitShift / 8;
13734 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13735 // If the byte we are trying to provide (as tracked by index) falls in this
13736 // range, then the SRL provides the byte. The byte of interest of the src of
13737 // the SRL is Index + ByteShift
13738 return BytesProvided - ByteShift > Index
13739 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13740 Index + ByteShift)
13741                : ByteProvider<SDValue>::getConstantZero();
13742   }
13743
13744 case ISD::SHL: {
13745 if (IsVec)
13746 return std::nullopt;
13747
13748 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13749 if (!ShiftOp)
13750 return std::nullopt;
13751
13752 uint64_t BitShift = ShiftOp->getZExtValue();
13753 if (BitShift % 8 != 0)
13754 return std::nullopt;
13755 uint64_t ByteShift = BitShift / 8;
13756
13757 // If we are shifting by an amount greater than (or equal to)
13758 // the index we are trying to provide, then it provides 0s. If not,
13759     // then these bytes are not definitively 0s, and the corresponding byte
13760 // of interest is Index - ByteShift of the src
13761 return Index < ByteShift
13762                ? ByteProvider<SDValue>::getConstantZero()
13763                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13764 Depth + 1, StartingIndex);
13765 }
13766 case ISD::ANY_EXTEND:
13767 case ISD::SIGN_EXTEND:
13768 case ISD::ZERO_EXTEND:
13769   case ISD::SIGN_EXTEND_INREG:
13770   case ISD::AssertZext:
13771 case ISD::AssertSext: {
13772 if (IsVec)
13773 return std::nullopt;
13774
13775 SDValue NarrowOp = Op->getOperand(0);
13776 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13777 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13778 Op->getOpcode() == ISD::AssertZext ||
13779 Op->getOpcode() == ISD::AssertSext) {
13780 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13781 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13782 }
13783 if (NarrowBitWidth % 8 != 0)
13784 return std::nullopt;
13785 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13786
13787 if (Index >= NarrowByteWidth)
13788 return Op.getOpcode() == ISD::ZERO_EXTEND
13789 ? std::optional<ByteProvider<SDValue>>(
13790                        ByteProvider<SDValue>::getConstantZero())
13791                  : std::nullopt;
13792 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13793 }
13794
13795 case ISD::TRUNCATE: {
13796 if (IsVec)
13797 return std::nullopt;
13798
13799 uint64_t NarrowByteWidth = BitWidth / 8;
13800
13801 if (NarrowByteWidth >= Index) {
13802 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13803 StartingIndex);
13804 }
13805
13806 return std::nullopt;
13807 }
13808
13809 case ISD::CopyFromReg: {
13810 if (BitWidth / 8 > Index)
13811 return calculateSrcByte(Op, StartingIndex, Index);
13812
13813 return std::nullopt;
13814 }
13815
13816 case ISD::LOAD: {
13817 auto *L = cast<LoadSDNode>(Op.getNode());
13818
13819 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13820 if (NarrowBitWidth % 8 != 0)
13821 return std::nullopt;
13822 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13823
13824     // If the width of the load does not reach the byte we are trying to provide for
13825 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13826 // question
13827 if (Index >= NarrowByteWidth) {
13828 return L->getExtensionType() == ISD::ZEXTLOAD
13829 ? std::optional<ByteProvider<SDValue>>(
13830                        ByteProvider<SDValue>::getConstantZero())
13831                  : std::nullopt;
13832 }
13833
13834 if (NarrowByteWidth > Index) {
13835 return calculateSrcByte(Op, StartingIndex, Index);
13836 }
13837
13838 return std::nullopt;
13839 }
13840
13841 case ISD::BSWAP: {
13842 if (IsVec)
13843 return std::nullopt;
13844
13845 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13846 Depth + 1, StartingIndex);
13847 }
13848
13849   case ISD::EXTRACT_VECTOR_ELT: {
13850     auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13851 if (!IdxOp)
13852 return std::nullopt;
13853 auto VecIdx = IdxOp->getZExtValue();
13854 auto ScalarSize = Op.getScalarValueSizeInBits();
13855 if (ScalarSize < 32)
13856 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13857 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13858 StartingIndex, Index);
13859 }
13860
13861 case AMDGPUISD::PERM: {
13862 if (IsVec)
13863 return std::nullopt;
13864
13865 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13866 if (!PermMask)
13867 return std::nullopt;
13868
13869 auto IdxMask =
13870 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13871 if (IdxMask > 0x07 && IdxMask != 0x0c)
13872 return std::nullopt;
13873
13874 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13875 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13876
13877 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13878                            : ByteProvider<SDValue>(
13879                                  ByteProvider<SDValue>::getConstantZero());
13880   }
13881
13882 default: {
13883 return std::nullopt;
13884 }
13885 }
13886
13887 llvm_unreachable("fully handled switch");
13888}
13889
13890// Returns true if the Operand is a scalar and is 16 bits
13891static bool isExtendedFrom16Bits(SDValue &Operand) {
13892
13893 switch (Operand.getOpcode()) {
13894 case ISD::ANY_EXTEND:
13895 case ISD::SIGN_EXTEND:
13896 case ISD::ZERO_EXTEND: {
13897 auto OpVT = Operand.getOperand(0).getValueType();
13898 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13899 }
13900 case ISD::LOAD: {
13901 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13902 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13903 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13904 ExtType == ISD::EXTLOAD) {
13905 auto MemVT = L->getMemoryVT();
13906 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13907 }
13908 return L->getMemoryVT().getSizeInBits() == 16;
13909 }
13910 default:
13911 return false;
13912 }
13913}
13914
13915 // Returns true if the mask matches consecutive bytes, and the first byte
13916 // begins at an even (16-bit aligned) byte offset from the 0th byte.
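// For example (illustrative): Mask = 0x0302 (bytes 2 and 3) is consecutive and
// 16-bit aligned, whereas Mask = 0x0201 (bytes 1 and 2) is consecutive but not
// aligned, so the latter is still handled as a byte permute.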
13917static bool addresses16Bits(int Mask) {
13918 int Low8 = Mask & 0xff;
13919 int Hi8 = (Mask & 0xff00) >> 8;
13920
13921 assert(Low8 < 8 && Hi8 < 8);
13922 // Are the bytes contiguous in the order of increasing addresses.
13923 bool IsConsecutive = (Hi8 - Low8 == 1);
13924 // Is the first byte at location that is aligned for 16 bit instructions.
13925 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13926 // In this case, we still need code to extract the 16 bit operand, so it
13927 // is better to use i8 v_perm
13928 bool Is16Aligned = !(Low8 % 2);
13929
13930 return IsConsecutive && Is16Aligned;
13931}
13932
13933// Do not lower into v_perm if the operands are actually 16 bit
13934// and the selected bits (based on PermMask) correspond with two
13935// easily addressable 16 bit operands.
13936 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13937                                 SDValue &OtherOp) {
13938 int Low16 = PermMask & 0xffff;
13939 int Hi16 = (PermMask & 0xffff0000) >> 16;
13940
13941 auto TempOp = peekThroughBitcasts(Op);
13942 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13943
13944 auto OpIs16Bit =
13945 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13946 if (!OpIs16Bit)
13947 return true;
13948
13949 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13950 isExtendedFrom16Bits(TempOtherOp);
13951 if (!OtherOpIs16Bit)
13952 return true;
13953
13954 // Do we cleanly address both
13955 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13956}
13957
13958 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13959                                   unsigned DWordOffset) {
13960 SDValue Ret;
13961
13962 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13963 // ByteProvider must be at least 8 bits
13964 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13965
13966 if (TypeSize <= 32)
13967 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13968
13969 if (Src.getValueType().isVector()) {
13970 auto ScalarTySize = Src.getScalarValueSizeInBits();
13971 auto ScalarTy = Src.getValueType().getScalarType();
13972 if (ScalarTySize == 32) {
13973 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13974 DAG.getConstant(DWordOffset, SL, MVT::i32));
13975 }
13976 if (ScalarTySize > 32) {
13977 Ret = DAG.getNode(
13978 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13979 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13980 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13981 if (ShiftVal)
13982 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13983 DAG.getConstant(ShiftVal, SL, MVT::i32));
13984 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13985 }
13986
13987 assert(ScalarTySize < 32);
13988 auto NumElements = TypeSize / ScalarTySize;
13989 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13990 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13991 auto NumElementsIn32 = 32 / ScalarTySize;
13992 auto NumAvailElements = DWordOffset < Trunc32Elements
13993 ? NumElementsIn32
13994 : NumElements - NormalizedTrunc;
13995
13996     SmallVector<SDValue, 4> VecSrcs;
13997     DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13998 NumAvailElements);
13999
14000 Ret = DAG.getBuildVector(
14001 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14002 VecSrcs);
14003 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14004 }
14005
14006 /// Scalar Type
14007 auto ShiftVal = 32 * DWordOffset;
14008 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14009 DAG.getConstant(ShiftVal, SL, MVT::i32));
14010 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14011}
14012
14013 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14014   SelectionDAG &DAG = DCI.DAG;
14015 [[maybe_unused]] EVT VT = N->getValueType(0);
14016   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14017
14018 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14019 assert(VT == MVT::i32);
14020 for (int i = 0; i < 4; i++) {
14021 // Find the ByteProvider that provides the ith byte of the result of OR
14022 std::optional<ByteProvider<SDValue>> P =
14023 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14024 // TODO support constantZero
14025 if (!P || P->isConstantZero())
14026 return SDValue();
14027
14028 PermNodes.push_back(*P);
14029 }
14030 if (PermNodes.size() != 4)
14031 return SDValue();
14032
14033 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14034 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14035 uint64_t PermMask = 0x00000000;
14036 for (size_t i = 0; i < PermNodes.size(); i++) {
14037 auto PermOp = PermNodes[i];
14038 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14039 // by sizeof(Src2) = 4
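    // For example (illustrative): byte 2 of the first source is encoded as
    // selector 6 in the v_perm mask, while byte 2 of the second source keeps
    // selector 2.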
14040 int SrcByteAdjust = 4;
14041
14042 // If the Src uses a byte from a different DWORD, then it corresponds
14043     // with a different source
14044 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14045 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14046 if (SecondSrc)
14047 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14048 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14049 return SDValue();
14050
14051 // Set the index of the second distinct Src node
14052 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14053 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14054 SrcByteAdjust = 0;
14055 }
14056 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14058 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14059 }
14060 SDLoc DL(N);
14061 SDValue Op = *PermNodes[FirstSrc.first].Src;
14062 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14063 assert(Op.getValueSizeInBits() == 32);
14064
14065 // Check that we are not just extracting the bytes in order from an op
14066 if (!SecondSrc) {
14067 int Low16 = PermMask & 0xffff;
14068 int Hi16 = (PermMask & 0xffff0000) >> 16;
14069
14070 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14071 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14072
14073 // The perm op would really just produce Op. So combine into Op
14074 if (WellFormedLow && WellFormedHi)
14075 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14076 }
14077
14078 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14079
14080 if (SecondSrc) {
14081 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14082 assert(OtherOp.getValueSizeInBits() == 32);
14083 }
14084
14085 // Check that we haven't just recreated the same FSHR node.
14086 if (N->getOpcode() == ISD::FSHR &&
14087 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14088 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14089 return SDValue();
14090
14091 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14092
14093 assert(Op.getValueType().isByteSized() &&
14094 OtherOp.getValueType().isByteSized());
14095
14096 // If the ultimate src is less than 32 bits, then we will only be
14097 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14098 // CalculateByteProvider would not have returned Op as source if we
14099 // used a byte that is outside its ValueType. Thus, we are free to
14100 // ANY_EXTEND as the extended bits are dont-cares.
14101 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14102 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14103
14104 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14105 DAG.getConstant(PermMask, DL, MVT::i32));
14106 }
14107 return SDValue();
14108}
14109
14110SDValue SITargetLowering::performOrCombine(SDNode *N,
14111 DAGCombinerInfo &DCI) const {
14112 SelectionDAG &DAG = DCI.DAG;
14113 SDValue LHS = N->getOperand(0);
14114 SDValue RHS = N->getOperand(1);
14115
14116 EVT VT = N->getValueType(0);
14117 if (VT == MVT::i1) {
14118 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14119 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14120 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14121 SDValue Src = LHS.getOperand(0);
14122 if (Src != RHS.getOperand(0))
14123 return SDValue();
14124
14125 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14126 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14127 if (!CLHS || !CRHS)
14128 return SDValue();
14129
14130 // Only 10 bits are used.
14131 static const uint32_t MaxMask = 0x3ff;
14132
14133 uint32_t NewMask =
14134 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14135 SDLoc DL(N);
14136 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14137 DAG.getConstant(NewMask, DL, MVT::i32));
14138 }
14139
14140 return SDValue();
14141 }
14142
14143 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14144   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14145       LHS.getOpcode() == AMDGPUISD::PERM &&
14146 isa<ConstantSDNode>(LHS.getOperand(2))) {
14147 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14148 if (!Sel)
14149 return SDValue();
14150
14151 Sel |= LHS.getConstantOperandVal(2);
14152 SDLoc DL(N);
14153 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14154 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14155 }
14156
14157 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14158 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14159 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14160 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14161
14162 // If all the uses of an or need to extract the individual elements, do not
14163 // attempt to lower into v_perm
14164 auto usesCombinedOperand = [](SDNode *OrUse) {
14165 // If we have any non-vectorized use, then it is a candidate for v_perm
14166 if (OrUse->getOpcode() != ISD::BITCAST ||
14167 !OrUse->getValueType(0).isVector())
14168 return true;
14169
14170 // If we have any non-vectorized use, then it is a candidate for v_perm
14171 for (auto *VUser : OrUse->users()) {
14172 if (!VUser->getValueType(0).isVector())
14173 return true;
14174
14175 // If the use of a vector is a store, then combining via a v_perm
14176 // is beneficial.
14177 // TODO -- whitelist more uses
14178 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14179 if (VUser->getOpcode() == VectorwiseOp)
14180 return true;
14181 }
14182 return false;
14183 };
14184
14185 if (!any_of(N->users(), usesCombinedOperand))
14186 return SDValue();
14187
14188 uint32_t LHSMask = getPermuteMask(LHS);
14189 uint32_t RHSMask = getPermuteMask(RHS);
14190
14191 if (LHSMask != ~0u && RHSMask != ~0u) {
14192 // Canonicalize the expression in an attempt to have fewer unique masks
14193 // and therefore fewer registers used to hold the masks.
14194 if (LHSMask > RHSMask) {
14195 std::swap(LHSMask, RHSMask);
14196 std::swap(LHS, RHS);
14197 }
14198
14199 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14200       // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
14201 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14202 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14203
14204       // Check if we need to combine values from two sources within a byte.
14205 if (!(LHSUsedLanes & RHSUsedLanes) &&
14206 // If we select high and lower word keep it for SDWA.
14207 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14208 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14209 // Kill zero bytes selected by other mask. Zero value is 0xc.
14210 LHSMask &= ~RHSUsedLanes;
14211 RHSMask &= ~LHSUsedLanes;
14212 // Add 4 to each active LHS lane
14213 LHSMask |= LHSUsedLanes & 0x04040404;
14214 // Combine masks
14215 uint32_t Sel = LHSMask | RHSMask;
14216 SDLoc DL(N);
14217
14218 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14219 RHS.getOperand(0),
14220 DAG.getConstant(Sel, DL, MVT::i32));
14221 }
14222 }
14223 if (LHSMask == ~0u || RHSMask == ~0u) {
14224 if (SDValue Perm = matchPERM(N, DCI))
14225 return Perm;
14226 }
14227 }
14228
14229 // Detect identity v2i32 OR and replace with identity source node.
14230 // Specifically an Or that has operands constructed from the same source node
14231 // via extract_vector_elt and build_vector. I.E.
14232 // v2i32 or(
14233 // v2i32 build_vector(
14234 // i32 extract_elt(%IdentitySrc, 0),
14235 // i32 0
14236 // ),
14237 // v2i32 build_vector(
14238 // i32 0,
14239 // i32 extract_elt(%IdentitySrc, 1)
14240 // ) )
14241 // =>
14242 // v2i32 %IdentitySrc
14243
14244 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14245 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14246
14247 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14248 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14249
14250 // Test for and normalise build vectors.
14251 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14252
14253 // Get the extract_vector_element operands.
14254 SDValue LEVE = LHS->getOperand(0);
14255 SDValue REVE = RHS->getOperand(1);
14256
14257 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14258         REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14259       // Check that different elements from the same vector are
14260 // extracted.
14261 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14262 LEVE->getOperand(1) != REVE->getOperand(1)) {
14263 SDValue IdentitySrc = LEVE.getOperand(0);
14264 return IdentitySrc;
14265 }
14266 }
14267 }
14268 }
14269
14270 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14271 return SDValue();
14272
14273 // TODO: This could be a generic combine with a predicate for extracting the
14274 // high half of an integer being free.
14275
14276 // (or i64:x, (zero_extend i32:y)) ->
14277 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14278 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14279 RHS.getOpcode() != ISD::ZERO_EXTEND)
14280 std::swap(LHS, RHS);
14281
14282 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14283 SDValue ExtSrc = RHS.getOperand(0);
14284 EVT SrcVT = ExtSrc.getValueType();
14285 if (SrcVT == MVT::i32) {
14286 SDLoc SL(N);
14287 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14288 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14289
14290 DCI.AddToWorklist(LowOr.getNode());
14291 DCI.AddToWorklist(HiBits.getNode());
14292
14293 SDValue Vec =
14294 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14295 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14296 }
14297 }
14298
14299 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14300 if (CRHS) {
14301 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14302 N->getOperand(0), CRHS))
14303 return Split;
14304 }
14305
14306 return SDValue();
14307}
14308
14309SDValue SITargetLowering::performXorCombine(SDNode *N,
14310 DAGCombinerInfo &DCI) const {
14311 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14312 return RV;
14313
14314 SDValue LHS = N->getOperand(0);
14315 SDValue RHS = N->getOperand(1);
14316
14317 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14318 SelectionDAG &DAG = DCI.DAG;
14319
14320 EVT VT = N->getValueType(0);
14321 if (CRHS && VT == MVT::i64) {
14322 if (SDValue Split =
14323 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14324 return Split;
14325 }
14326
14327 // v2i32 (xor (vselect cc, x, y), K) ->
14328   // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14329 // replaced with source modifiers when the select is lowered to CNDMASK.
14330 unsigned Opc = LHS.getOpcode();
14331 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14332 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14333 CRHS && CRHS->getAPIntValue().isSignMask()) {
14334 SDValue CC = LHS->getOperand(0);
14335 SDValue TRUE = LHS->getOperand(1);
14336 SDValue FALSE = LHS->getOperand(2);
14337 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14338 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14339 SDValue XSelect =
14340 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14341 return XSelect;
14342 }
14343
14344 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14345 // fneg-like xors into 64-bit select.
14346 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14347 // This looks like an fneg, try to fold as a source modifier.
14348 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14349         shouldFoldFNegIntoSrc(N, LHS)) {
14350       // xor (select c, a, b), 0x80000000 ->
14351 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14352 SDLoc DL(N);
14353 SDValue CastLHS =
14354 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14355 SDValue CastRHS =
14356 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14357 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14358 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14359 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14360 LHS->getOperand(0), FNegLHS, FNegRHS);
14361 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14362 }
14363 }
14364
14365 return SDValue();
14366}
14367
14368SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14369 DAGCombinerInfo &DCI) const {
14370 if (!Subtarget->has16BitInsts() ||
14371 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14372 return SDValue();
14373
14374 EVT VT = N->getValueType(0);
14375 if (VT != MVT::i32)
14376 return SDValue();
14377
14378 SDValue Src = N->getOperand(0);
14379 if (Src.getValueType() != MVT::i16)
14380 return SDValue();
14381
14382 return SDValue();
14383}
14384
14385SDValue
14386SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14387 DAGCombinerInfo &DCI) const {
14388 SDValue Src = N->getOperand(0);
14389 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14390
14391 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14392 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14393 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14394 VTSign->getVT() == MVT::i8) ||
14395 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14396 VTSign->getVT() == MVT::i16))) {
14397 assert(Subtarget->hasScalarSubwordLoads() &&
14398 "s_buffer_load_{u8, i8} are supported "
14399 "in GFX12 (or newer) architectures.");
14400 EVT VT = Src.getValueType();
14401 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14402 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14403 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14404 SDLoc DL(N);
14405 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14406 SDValue Ops[] = {
14407 Src.getOperand(0), // source register
14408 Src.getOperand(1), // offset
14409 Src.getOperand(2) // cachePolicy
14410 };
14411 auto *M = cast<MemSDNode>(Src);
14412 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14413 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14414 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14415 return LoadVal;
14416 }
14417 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14418 VTSign->getVT() == MVT::i8) ||
14419 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14420 VTSign->getVT() == MVT::i16)) &&
14421 Src.hasOneUse()) {
14422 auto *M = cast<MemSDNode>(Src);
14423 SDValue Ops[] = {Src.getOperand(0), // Chain
14424 Src.getOperand(1), // rsrc
14425 Src.getOperand(2), // vindex
14426 Src.getOperand(3), // voffset
14427 Src.getOperand(4), // soffset
14428 Src.getOperand(5), // offset
14429 Src.getOperand(6), Src.getOperand(7)};
14430 // replace with BUFFER_LOAD_BYTE/SHORT
14431 SDVTList ResList =
14432 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14433 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14434 ? AMDGPUISD::BUFFER_LOAD_BYTE
14435 : AMDGPUISD::BUFFER_LOAD_SHORT;
14436 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14437 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14438 return DCI.DAG.getMergeValues(
14439 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14440 }
14441 return SDValue();
14442}
14443
14444SDValue SITargetLowering::performClassCombine(SDNode *N,
14445 DAGCombinerInfo &DCI) const {
14446 SelectionDAG &DAG = DCI.DAG;
14447 SDValue Mask = N->getOperand(1);
14448
14449 // fp_class x, 0 -> false
14450 if (isNullConstant(Mask))
14451 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14452
14453 if (N->getOperand(0).isUndef())
14454 return DAG.getUNDEF(MVT::i1);
14455
14456 return SDValue();
14457}
14458
14459SDValue SITargetLowering::performRcpCombine(SDNode *N,
14460 DAGCombinerInfo &DCI) const {
14461 EVT VT = N->getValueType(0);
14462 SDValue N0 = N->getOperand(0);
14463
14464 if (N0.isUndef()) {
14465 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14466 SDLoc(N), VT);
14467 }
14468
14469 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14470 N0.getOpcode() == ISD::SINT_TO_FP)) {
14471 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14472 N->getFlags());
14473 }
14474
14475 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14476 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14477 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14478 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14479 N->getFlags());
14480 }
14481
14482   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14483 }
14484
14485 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14486                                        unsigned MaxDepth) const {
14487 unsigned Opcode = Op.getOpcode();
14488 if (Opcode == ISD::FCANONICALIZE)
14489 return true;
14490
14491 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14492 const auto &F = CFP->getValueAPF();
14493 if (F.isNaN() && F.isSignaling())
14494 return false;
14495 if (!F.isDenormal())
14496 return true;
14497
14498 DenormalMode Mode =
14499 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14500 return Mode == DenormalMode::getIEEE();
14501 }
14502
14503 // If source is a result of another standard FP operation it is already in
14504 // canonical form.
14505 if (MaxDepth == 0)
14506 return false;
14507
14508 switch (Opcode) {
14509 // These will flush denorms if required.
14510 case ISD::FADD:
14511 case ISD::FSUB:
14512 case ISD::FMUL:
14513 case ISD::FCEIL:
14514 case ISD::FFLOOR:
14515 case ISD::FMA:
14516 case ISD::FMAD:
14517 case ISD::FSQRT:
14518 case ISD::FDIV:
14519 case ISD::FREM:
14520 case ISD::FP_ROUND:
14521 case ISD::FP_EXTEND:
14522 case ISD::FP16_TO_FP:
14523 case ISD::FP_TO_FP16:
14524 case ISD::BF16_TO_FP:
14525 case ISD::FP_TO_BF16:
14526 case ISD::FLDEXP:
14527 case AMDGPUISD::FMUL_LEGACY:
14528 case AMDGPUISD::FMAD_FTZ:
14529 case AMDGPUISD::RCP:
14530 case AMDGPUISD::RSQ:
14531 case AMDGPUISD::RSQ_CLAMP:
14532 case AMDGPUISD::RCP_LEGACY:
14533 case AMDGPUISD::RCP_IFLAG:
14534 case AMDGPUISD::LOG:
14535 case AMDGPUISD::EXP:
14536 case AMDGPUISD::DIV_SCALE:
14537 case AMDGPUISD::DIV_FMAS:
14538 case AMDGPUISD::DIV_FIXUP:
14539 case AMDGPUISD::FRACT:
14540 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14541 case AMDGPUISD::CVT_F32_UBYTE0:
14542 case AMDGPUISD::CVT_F32_UBYTE1:
14543 case AMDGPUISD::CVT_F32_UBYTE2:
14544 case AMDGPUISD::CVT_F32_UBYTE3:
14545 case AMDGPUISD::FP_TO_FP16:
14546 case AMDGPUISD::SIN_HW:
14547 case AMDGPUISD::COS_HW:
14548 return true;
14549
14550 // These can/will be lowered or combined as bit operations, so we need to
14551 // check their inputs recursively.
14552 case ISD::FNEG:
14553 case ISD::FABS:
14554 case ISD::FCOPYSIGN:
14555 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14556
14557 case ISD::AND:
14558 if (Op.getValueType() == MVT::i32) {
14559 // Be careful as we only know it is a bitcast floating point type. It
14560 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14561 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14562 // is valid to optimize for all types.
14563 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14564 if (RHS->getZExtValue() == 0xffff0000) {
14565 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14566 }
14567 }
14568 }
14569 break;
14570
14571 case ISD::FSIN:
14572 case ISD::FCOS:
14573 case ISD::FSINCOS:
14574 return Op.getValueType().getScalarType() != MVT::f16;
14575
14576 case ISD::FMINNUM:
14577 case ISD::FMAXNUM:
14578 case ISD::FMINNUM_IEEE:
14579 case ISD::FMAXNUM_IEEE:
14580 case ISD::FMINIMUM:
14581 case ISD::FMAXIMUM:
14582 case ISD::FMINIMUMNUM:
14583 case ISD::FMAXIMUMNUM:
14584 case AMDGPUISD::CLAMP:
14585 case AMDGPUISD::FMED3:
14586 case AMDGPUISD::FMAX3:
14587 case AMDGPUISD::FMIN3:
14588 case AMDGPUISD::FMAXIMUM3:
14589 case AMDGPUISD::FMINIMUM3: {
14590 // FIXME: Shouldn't treat the generic operations differently based on these.
14591 // However, we aren't really required to flush the result from
14592 // minnum/maxnum.
14593
14594 // snans will be quieted, so we only need to worry about denormals.
14595 if (Subtarget->supportsMinMaxDenormModes() ||
14596 // FIXME: denormalsEnabledForType is broken for dynamic
14597 denormalsEnabledForType(DAG, Op.getValueType()))
14598 return true;
14599
14600 // Flushing may be required.
14601 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
14602 // such targets we need to check their inputs recursively.
14603
14604 // FIXME: Does this apply with clamp? It's implemented with max.
14605 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14606 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14607 return false;
14608 }
14609
14610 return true;
14611 }
14612 case ISD::SELECT: {
14613 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14614 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14615 }
14616 case ISD::BUILD_VECTOR: {
14617 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14618 SDValue SrcOp = Op.getOperand(i);
14619 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14620 return false;
14621 }
14622
14623 return true;
14624 }
14625 case ISD::EXTRACT_VECTOR_ELT:
14626 case ISD::EXTRACT_SUBVECTOR: {
14627 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14628 }
14629 case ISD::INSERT_VECTOR_ELT: {
14630 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14631 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14632 }
14633 case ISD::UNDEF:
14634 // Could be anything.
14635 return false;
14636
14637 case ISD::BITCAST:
14638 // TODO: This is incorrect as it loses track of the operand's type. We may
14639 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14640 // same bits that are canonicalized in one type need not be in the other.
14641 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14642 case ISD::TRUNCATE: {
14643 // Hack around the mess we make when legalizing extract_vector_elt
14644 if (Op.getValueType() == MVT::i16) {
14645 SDValue TruncSrc = Op.getOperand(0);
14646 if (TruncSrc.getValueType() == MVT::i32 &&
14647 TruncSrc.getOpcode() == ISD::BITCAST &&
14648 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14649 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14650 }
14651 }
14652 return false;
14653 }
14654 case ISD::INTRINSIC_WO_CHAIN: {
14655 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14656 // TODO: Handle more intrinsics
14657 switch (IntrinsicID) {
14658 case Intrinsic::amdgcn_cvt_pkrtz:
14659 case Intrinsic::amdgcn_cubeid:
14660 case Intrinsic::amdgcn_frexp_mant:
14661 case Intrinsic::amdgcn_fdot2:
14662 case Intrinsic::amdgcn_rcp:
14663 case Intrinsic::amdgcn_rsq:
14664 case Intrinsic::amdgcn_rsq_clamp:
14665 case Intrinsic::amdgcn_rcp_legacy:
14666 case Intrinsic::amdgcn_rsq_legacy:
14667 case Intrinsic::amdgcn_trig_preop:
14668 case Intrinsic::amdgcn_tanh:
14669 case Intrinsic::amdgcn_log:
14670 case Intrinsic::amdgcn_exp2:
14671 case Intrinsic::amdgcn_sqrt:
14672 return true;
14673 default:
14674 break;
14675 }
14676
14677 break;
14678 }
14679 default:
14680 break;
14681 }
14682
14683 // FIXME: denormalsEnabledForType is broken for dynamic
14684 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14685 DAG.isKnownNeverSNaN(Op);
14686}
14687
14688bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14689                                       unsigned MaxDepth) const {
14690 const MachineRegisterInfo &MRI = MF.getRegInfo();
14691 MachineInstr *MI = MRI.getVRegDef(Reg);
14692 unsigned Opcode = MI->getOpcode();
14693
14694 if (Opcode == AMDGPU::G_FCANONICALIZE)
14695 return true;
14696
14697 std::optional<FPValueAndVReg> FCR;
14698 // Constant splat (can be padded with undef) or scalar constant.
14699 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14700 if (FCR->Value.isSignaling())
14701 return false;
14702 if (!FCR->Value.isDenormal())
14703 return true;
14704
14705 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14706 return Mode == DenormalMode::getIEEE();
14707 }
14708
14709 if (MaxDepth == 0)
14710 return false;
14711
14712 switch (Opcode) {
14713 case AMDGPU::G_FADD:
14714 case AMDGPU::G_FSUB:
14715 case AMDGPU::G_FMUL:
14716 case AMDGPU::G_FCEIL:
14717 case AMDGPU::G_FFLOOR:
14718 case AMDGPU::G_FRINT:
14719 case AMDGPU::G_FNEARBYINT:
14720 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14721 case AMDGPU::G_INTRINSIC_TRUNC:
14722 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14723 case AMDGPU::G_FMA:
14724 case AMDGPU::G_FMAD:
14725 case AMDGPU::G_FSQRT:
14726 case AMDGPU::G_FDIV:
14727 case AMDGPU::G_FREM:
14728 case AMDGPU::G_FPOW:
14729 case AMDGPU::G_FPEXT:
14730 case AMDGPU::G_FLOG:
14731 case AMDGPU::G_FLOG2:
14732 case AMDGPU::G_FLOG10:
14733 case AMDGPU::G_FPTRUNC:
14734 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14735 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14736 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14737 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14738 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14739 return true;
14740 case AMDGPU::G_FNEG:
14741 case AMDGPU::G_FABS:
14742 case AMDGPU::G_FCOPYSIGN:
14743 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14744 case AMDGPU::G_FMINNUM:
14745 case AMDGPU::G_FMAXNUM:
14746 case AMDGPU::G_FMINNUM_IEEE:
14747 case AMDGPU::G_FMAXNUM_IEEE:
14748 case AMDGPU::G_FMINIMUM:
14749 case AMDGPU::G_FMAXIMUM:
14750 case AMDGPU::G_FMINIMUMNUM:
14751 case AMDGPU::G_FMAXIMUMNUM: {
14752 if (Subtarget->supportsMinMaxDenormModes() ||
14753 // FIXME: denormalsEnabledForType is broken for dynamic
14754 denormalsEnabledForType(MRI.getType(Reg), MF))
14755 return true;
14756
14757 [[fallthrough]];
14758 }
14759 case AMDGPU::G_BUILD_VECTOR:
14760 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14761 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14762 return false;
14763 return true;
14764 case AMDGPU::G_INTRINSIC:
14765 case AMDGPU::G_INTRINSIC_CONVERGENT:
14766 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14767 case Intrinsic::amdgcn_fmul_legacy:
14768 case Intrinsic::amdgcn_fmad_ftz:
14769 case Intrinsic::amdgcn_sqrt:
14770 case Intrinsic::amdgcn_fmed3:
14771 case Intrinsic::amdgcn_sin:
14772 case Intrinsic::amdgcn_cos:
14773 case Intrinsic::amdgcn_log:
14774 case Intrinsic::amdgcn_exp2:
14775 case Intrinsic::amdgcn_log_clamp:
14776 case Intrinsic::amdgcn_rcp:
14777 case Intrinsic::amdgcn_rcp_legacy:
14778 case Intrinsic::amdgcn_rsq:
14779 case Intrinsic::amdgcn_rsq_clamp:
14780 case Intrinsic::amdgcn_rsq_legacy:
14781 case Intrinsic::amdgcn_div_scale:
14782 case Intrinsic::amdgcn_div_fmas:
14783 case Intrinsic::amdgcn_div_fixup:
14784 case Intrinsic::amdgcn_fract:
14785 case Intrinsic::amdgcn_cvt_pkrtz:
14786 case Intrinsic::amdgcn_cubeid:
14787 case Intrinsic::amdgcn_cubema:
14788 case Intrinsic::amdgcn_cubesc:
14789 case Intrinsic::amdgcn_cubetc:
14790 case Intrinsic::amdgcn_frexp_mant:
14791 case Intrinsic::amdgcn_fdot2:
14792 case Intrinsic::amdgcn_trig_preop:
14793 case Intrinsic::amdgcn_tanh:
14794 return true;
14795 default:
14796 break;
14797 }
14798
14799 [[fallthrough]];
14800 default:
14801 return false;
14802 }
14803
14804 llvm_unreachable("invalid operation");
14805}
14806
14807// Constant fold canonicalize.
14808SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14809 const SDLoc &SL, EVT VT,
14810 const APFloat &C) const {
14811 // Flush denormals to 0 if not enabled.
14812 if (C.isDenormal()) {
14813 DenormalMode Mode =
14814 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14815 if (Mode == DenormalMode::getPreserveSign()) {
14816 return DAG.getConstantFP(
14817 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14818 }
14819
14820 if (Mode != DenormalMode::getIEEE())
14821 return SDValue();
14822 }
14823
14824 if (C.isNaN()) {
14825 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14826 if (C.isSignaling()) {
14827 // Quiet a signaling NaN.
14828 // FIXME: Is this supposed to preserve payload bits?
14829 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14830 }
14831
14832 // Make sure it is the canonical NaN bitpattern.
14833 //
14834 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14835 // immediate?
14836 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14837 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14838 }
14839
14840 // Already canonical.
14841 return DAG.getConstantFP(C, SL, VT);
14842}
14843
14844static bool vectorEltWillFoldAway(SDValue Op) {
14845 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14846}
14847
14848SDValue
14849SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14850 DAGCombinerInfo &DCI) const {
14851 SelectionDAG &DAG = DCI.DAG;
14852 SDValue N0 = N->getOperand(0);
14853 EVT VT = N->getValueType(0);
14854
14855 // fcanonicalize undef -> qnan
14856 if (N0.isUndef()) {
14857 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14858 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14859 }
14860
14861 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14862 EVT VT = N->getValueType(0);
14863 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14864 }
14865
14866 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14867 // (fcanonicalize k)
14868 //
14869 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14870
14871 // TODO: This could be better with wider vectors that will be split to v2f16,
14872 // and to consider uses since there aren't that many packed operations.
14873 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14874 isTypeLegal(MVT::v2f16)) {
14875 SDLoc SL(N);
14876 SDValue NewElts[2];
14877 SDValue Lo = N0.getOperand(0);
14878 SDValue Hi = N0.getOperand(1);
14879 EVT EltVT = Lo.getValueType();
14880
14881 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
14882 for (unsigned I = 0; I != 2; ++I) {
14883 SDValue Op = N0.getOperand(I);
14884 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14885 NewElts[I] =
14886 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14887 } else if (Op.isUndef()) {
14888 // Handled below based on what the other operand is.
14889 NewElts[I] = Op;
14890 } else {
14891 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14892 }
14893 }
14894
14895 // If one half is undef, and one is constant, prefer a splat vector rather
14896 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14897 // cheaper to use and may be free with a packed operation.
14898 if (NewElts[0].isUndef()) {
14899 if (isa<ConstantFPSDNode>(NewElts[1]))
14900 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14901 ? NewElts[1]
14902 : DAG.getConstantFP(0.0f, SL, EltVT);
14903 }
14904
14905 if (NewElts[1].isUndef()) {
14906 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14907 ? NewElts[0]
14908 : DAG.getConstantFP(0.0f, SL, EltVT);
14909 }
14910
14911 return DAG.getBuildVector(VT, SL, NewElts);
14912 }
14913 }
14914
14915 return SDValue();
14916}
14917
14918static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14919 switch (Opc) {
14920 case ISD::FMAXNUM:
14921 case ISD::FMAXNUM_IEEE:
14922 case ISD::FMAXIMUMNUM:
14923 return AMDGPUISD::FMAX3;
14924 case ISD::FMAXIMUM:
14925 return AMDGPUISD::FMAXIMUM3;
14926 case ISD::SMAX:
14927 return AMDGPUISD::SMAX3;
14928 case ISD::UMAX:
14929 return AMDGPUISD::UMAX3;
14930 case ISD::FMINNUM:
14931 case ISD::FMINNUM_IEEE:
14932 case ISD::FMINIMUMNUM:
14933 return AMDGPUISD::FMIN3;
14934 case ISD::FMINIMUM:
14935 return AMDGPUISD::FMINIMUM3;
14936 case ISD::SMIN:
14937 return AMDGPUISD::SMIN3;
14938 case ISD::UMIN:
14939 return AMDGPUISD::UMIN3;
14940 default:
14941 llvm_unreachable("Not a min/max opcode");
14942 }
14943}
14944
14945SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14946 const SDLoc &SL, SDValue Src,
14947 SDValue MinVal,
14948 SDValue MaxVal,
14949 bool Signed) const {
14950
14951 // med3 comes from
14952 // min(max(x, K0), K1), K0 < K1
14953 // max(min(x, K0), K1), K1 < K0
14954 //
14955 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14956 // min/max op.
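  //
  // For example (signed i32): smin(smax(x, -4), 15) --> smed3(x, -4, 15).
  // med3 returns the middle value of its three operands, so this clamps x to
  // the range [K0, K1] in a single instruction.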
14957 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14958 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14959
14960 if (!MinK || !MaxK)
14961 return SDValue();
14962
14963 if (Signed) {
14964 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14965 return SDValue();
14966 } else {
14967 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14968 return SDValue();
14969 }
14970
14971 EVT VT = MinK->getValueType(0);
14972 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14973 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14974 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14975
14976 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14977 // not available, but this is unlikely to be profitable as constants
14978 // will often need to be materialized & extended, especially on
14979 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14980 return SDValue();
14981}
14982
14983static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14984 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14985 return C;
14986
14987 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14988 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14989 return C;
14990 }
14991
14992 return nullptr;
14993}
14994
14995SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14996 const SDLoc &SL, SDValue Op0,
14997 SDValue Op1) const {
14998 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14999 if (!K1)
15000 return SDValue();
15001
15002 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15003 if (!K0)
15004 return SDValue();
15005
15006 // Ordered >= (although NaN inputs should have folded away by now).
15007 if (K0->getValueAPF() > K1->getValueAPF())
15008 return SDValue();
15009
15010 // med3 with a nan input acts like
15011 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15012 //
15013 // So the result depends on whether the IEEE mode bit is enabled or not with a
15014 // signaling nan input.
15015 // ieee=1
15016 // s0 snan: yields s2
15017 // s1 snan: yields s2
15018 // s2 snan: qnan
15019
15020 // s0 qnan: min(s1, s2)
15021 // s1 qnan: min(s0, s2)
15022 // s2 qnan: min(s0, s1)
15023
15024 // ieee=0
15025 // s0 snan: min(s1, s2)
15026 // s1 snan: min(s0, s2)
15027 // s2 snan: qnan
15028
15029 // s0 qnan: min(s1, s2)
15030 // s1 qnan: min(s0, s2)
15031 // s2 qnan: min(s0, s1)
15032 const MachineFunction &MF = DAG.getMachineFunction();
15033 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15034
15035 // TODO: Check whether the IEEE bit is enabled. With IEEE=0 we can form fmed3
15036 // regardless of whether the input is a signaling nan if op0 is fmaximum or
15037 // fmaximumnum; with IEEE=1 we can only form it if op0 is fmaxnum_ieee.
15038 EVT VT = Op0.getValueType();
15039 if (Info->getMode().DX10Clamp) {
15040 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15041 // hardware fmed3 behavior converting to a min.
15042 // FIXME: Should this be allowing -0.0?
15043 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15044 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15045 }
15046
15047 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15048 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15049 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15050 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15051 // then give the other result, which is different from med3 with a NaN
15052 // input.
15053 SDValue Var = Op0.getOperand(0);
15054 if (!DAG.isKnownNeverSNaN(Var))
15055 return SDValue();
15056
15057 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15058
15059 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15060 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15061 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15062 SDValue(K0, 0), SDValue(K1, 0));
15063 }
15064 }
15065
15066 return SDValue();
15067}
15068
15069/// \return true if the subtarget supports minimum3 and maximum3 with the given
15070/// base min/max opcode \p Opc for type \p VT.
15071static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15072 EVT VT) {
15073 switch (Opc) {
15074 case ISD::FMINNUM:
15075 case ISD::FMAXNUM:
15076 case ISD::FMINNUM_IEEE:
15077 case ISD::FMAXNUM_IEEE:
15078 case ISD::FMINIMUMNUM:
15079 case ISD::FMAXIMUMNUM:
15080 case AMDGPUISD::FMIN_LEGACY:
15081 case AMDGPUISD::FMAX_LEGACY:
15082 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15083 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15084 case ISD::FMINIMUM:
15085 case ISD::FMAXIMUM:
15086 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15087 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15088 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15089 case ISD::SMAX:
15090 case ISD::SMIN:
15091 case ISD::UMAX:
15092 case ISD::UMIN:
15093 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15094 default:
15095 return false;
15096 }
15097
15098 llvm_unreachable("not a min/max opcode");
15099}
15100
15101SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15102 DAGCombinerInfo &DCI) const {
15103 SelectionDAG &DAG = DCI.DAG;
15104
15105 EVT VT = N->getValueType(0);
15106 unsigned Opc = N->getOpcode();
15107 SDValue Op0 = N->getOperand(0);
15108 SDValue Op1 = N->getOperand(1);
15109
15110 // Only do this if the inner op has one use, since otherwise this just
15111 // increases register pressure for no benefit.
15112
15113 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15114 // max(max(a, b), c) -> max3(a, b, c)
15115 // min(min(a, b), c) -> min3(a, b, c)
15116 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15117 SDLoc DL(N);
15118 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15119 Op0.getOperand(0), Op0.getOperand(1), Op1);
15120 }
15121
15122 // Try commuted.
15123 // max(a, max(b, c)) -> max3(a, b, c)
15124 // min(a, min(b, c)) -> min3(a, b, c)
15125 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15126 SDLoc DL(N);
15127 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15128 Op0, Op1.getOperand(0), Op1.getOperand(1));
15129 }
15130 }
15131
15132 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15133 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15134 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15135 if (SDValue Med3 = performIntMed3ImmCombine(
15136 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15137 return Med3;
15138 }
15139 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15140 if (SDValue Med3 = performIntMed3ImmCombine(
15141 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15142 return Med3;
15143 }
15144
15145 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15146 if (SDValue Med3 = performIntMed3ImmCombine(
15147 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15148 return Med3;
15149 }
15150 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15151 if (SDValue Med3 = performIntMed3ImmCombine(
15152 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15153 return Med3;
15154 }
15155
15156 // if !is_snan(x):
15157 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15158 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15159 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15160 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15161 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15162 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15163 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15164 (Opc == AMDGPUISD::FMIN_LEGACY &&
15165 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15166 (VT == MVT::f32 || VT == MVT::f64 ||
15167 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15168 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15169 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15170 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15171 Op0.hasOneUse()) {
15172 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15173 return Res;
15174 }
15175
15176 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15177 // for some types, but at a higher cost since it's implemented with a 3
15178 // operand form.
15179 const SDNodeFlags Flags = N->getFlags();
15180 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15181 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15182 unsigned NewOpc =
15183 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15184 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15185 }
15186
15187 return SDValue();
15188}
15189
15190static bool isClampZeroToOne(SDValue A, SDValue B) {
15191 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15192 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15193 // FIXME: Should this be allowing -0.0?
15194 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15195 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15196 }
15197 }
15198
15199 return false;
15200}
15201
15202// FIXME: Should only worry about snans for version with chain.
15203SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15204 DAGCombinerInfo &DCI) const {
15205 EVT VT = N->getValueType(0);
15206 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15207 // NaNs. With a NaN input, the order of the operands may change the result.
15208
15209 SelectionDAG &DAG = DCI.DAG;
15210 SDLoc SL(N);
15211
15212 SDValue Src0 = N->getOperand(0);
15213 SDValue Src1 = N->getOperand(1);
15214 SDValue Src2 = N->getOperand(2);
15215
15216 if (isClampZeroToOne(Src0, Src1)) {
15217 // const_a, const_b, x -> clamp is safe in all cases including signaling
15218 // nans.
15219 // FIXME: Should this be allowing -0.0?
15220 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15221 }
15222
15223 const MachineFunction &MF = DAG.getMachineFunction();
15224 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15225
15226 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15227 // handling no dx10-clamp?
15228 if (Info->getMode().DX10Clamp) {
15229 // If NaN is clamped to 0, we are free to reorder the inputs.
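    // The swaps below bubble constant operands towards Src1/Src2 so that the
    // clamp pattern can still be matched with the variable operand in Src0.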
15230
15231 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15232 std::swap(Src0, Src1);
15233
15234 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15235 std::swap(Src1, Src2);
15236
15237 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15238 std::swap(Src0, Src1);
15239
15240 if (isClampZeroToOne(Src1, Src2))
15241 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15242 }
15243
15244 return SDValue();
15245}
15246
15247SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15248 DAGCombinerInfo &DCI) const {
15249 SDValue Src0 = N->getOperand(0);
15250 SDValue Src1 = N->getOperand(1);
15251 if (Src0.isUndef() && Src1.isUndef())
15252 return DCI.DAG.getUNDEF(N->getValueType(0));
15253 return SDValue();
15254}
15255
15256// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15257// expanded into a set of cmp/select instructions.
15258bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15259 unsigned NumElem,
15260 bool IsDivergentIdx,
15261 const GCNSubtarget *Subtarget) {
15262 if (UseDivergentRegisterIndexing)
15263 return false;
15264
15265 unsigned VecSize = EltSize * NumElem;
15266
15267 // Sub-dword vectors of two dwords or less have a better implementation.
15268 if (VecSize <= 64 && EltSize < 32)
15269 return false;
15270
15271 // Always expand the remaining sub-dword cases, otherwise they will be
15272 // lowered via memory.
15273 if (EltSize < 32)
15274 return true;
15275
15276 // Always do this if var-idx is divergent, otherwise it will become a loop.
15277 if (IsDivergentIdx)
15278 return true;
15279
15280 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15281 unsigned NumInsts = NumElem /* Number of compares */ +
15282 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
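  // For example, a divergent extract from v8i32 expands to 8 compares plus
  // 8 v_cndmask_b32 (16 instructions), while v8i64 needs 8 compares plus
  // 16 v_cndmask_b32 (24 instructions).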
15283
15284 // On some architectures (GFX9) movrel is not available and it's better
15285 // to expand.
15286 if (Subtarget->useVGPRIndexMode())
15287 return NumInsts <= 16;
15288
15289 // If movrel is available, use it instead of expanding for vector of 8
15290 // elements.
15291 if (Subtarget->hasMovrel())
15292 return NumInsts <= 15;
15293
15294 return true;
15295}
15296
15297bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15298 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15299 if (isa<ConstantSDNode>(Idx))
15300 return false;
15301
15302 SDValue Vec = N->getOperand(0);
15303 EVT VecVT = Vec.getValueType();
15304 EVT EltVT = VecVT.getVectorElementType();
15305 unsigned EltSize = EltVT.getSizeInBits();
15306 unsigned NumElem = VecVT.getVectorNumElements();
15307
15308 return SITargetLowering::shouldExpandVectorDynExt(
15309 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15310}
15311
15312SDValue
15313SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15314 DAGCombinerInfo &DCI) const {
15315 SDValue Vec = N->getOperand(0);
15316 SelectionDAG &DAG = DCI.DAG;
15317
15318 EVT VecVT = Vec.getValueType();
15319 EVT VecEltVT = VecVT.getVectorElementType();
15320 EVT ResVT = N->getValueType(0);
15321
15322 unsigned VecSize = VecVT.getSizeInBits();
15323 unsigned VecEltSize = VecEltVT.getSizeInBits();
15324
15325 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15326 allUsesHaveSourceMods(N)) {
15327 SDLoc SL(N);
15328 SDValue Idx = N->getOperand(1);
15329 SDValue Elt =
15330 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15331 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15332 }
15333
15334 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15335 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15336 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15337 // depending on the shift operand. See e.g. performSraCombine().
15338 // This combine ensures that the optimisation is compatible with v2i32
15339 // legalised AND.
15340 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15341 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15342
15343 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15344 if (!C || C->getZExtValue() != 0x1f)
15345 return SDValue();
15346
15347 SDLoc SL(N);
15348 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15349 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15350 Vec->getOperand(0), N->getOperand(1));
15351 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15352 DAG.ReplaceAllUsesWith(N, A.getNode());
15353 }
15354
15355 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15356 // =>
15357 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15358 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15359 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15360 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15361 SDLoc SL(N);
15362 SDValue Idx = N->getOperand(1);
15363 unsigned Opc = Vec.getOpcode();
15364
15365 switch (Opc) {
15366 default:
15367 break;
15368 // TODO: Support other binary operations.
15369 case ISD::FADD:
15370 case ISD::FSUB:
15371 case ISD::FMUL:
15372 case ISD::ADD:
15373 case ISD::UMIN:
15374 case ISD::UMAX:
15375 case ISD::SMIN:
15376 case ISD::SMAX:
15377 case ISD::FMAXNUM:
15378 case ISD::FMINNUM:
15379 case ISD::FMAXNUM_IEEE:
15380 case ISD::FMINNUM_IEEE:
15381 case ISD::FMAXIMUM:
15382 case ISD::FMINIMUM: {
15383 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15384 Vec.getOperand(0), Idx);
15385 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15386 Vec.getOperand(1), Idx);
15387
15388 DCI.AddToWorklist(Elt0.getNode());
15389 DCI.AddToWorklist(Elt1.getNode());
15390 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15391 }
15392 }
15393 }
15394
15395 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15396 if (shouldExpandVectorDynExt(N)) {
15397 SDLoc SL(N);
15398 SDValue Idx = N->getOperand(1);
15399 SDValue V;
15400 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15401 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15402 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15403 if (I == 0)
15404 V = Elt;
15405 else
15406 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15407 }
15408 return V;
15409 }
15410
15411 if (!DCI.isBeforeLegalize())
15412 return SDValue();
15413
15414 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15415 // elements. This exposes more load reduction opportunities by replacing
15416 // multiple small extract_vector_elements with a single 32-bit extract.
15417 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15418 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15419 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15420 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15421
15422 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15423 unsigned EltIdx = BitIndex / 32;
15424 unsigned LeftoverBitIdx = BitIndex % 32;
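    // For example, extracting element 5 of a loaded v8i8 becomes an i32
    // extract of dword 1 (BitIndex = 40, EltIdx = 1) followed by a shift
    // right by LeftoverBitIdx = 8 and a truncate to i8.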
15425 SDLoc SL(N);
15426
15427 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15428 DCI.AddToWorklist(Cast.getNode());
15429
15430 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15431 DAG.getConstant(EltIdx, SL, MVT::i32));
15432 DCI.AddToWorklist(Elt.getNode());
15433 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15434 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15435 DCI.AddToWorklist(Srl.getNode());
15436
15437 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15438 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15439 DCI.AddToWorklist(Trunc.getNode());
15440
15441 if (VecEltVT == ResVT) {
15442 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15443 }
15444
15445 assert(ResVT.isScalarInteger());
15446 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15447 }
15448
15449 return SDValue();
15450}
15451
15452SDValue
15453SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15454 DAGCombinerInfo &DCI) const {
15455 SDValue Vec = N->getOperand(0);
15456 SDValue Idx = N->getOperand(2);
15457 EVT VecVT = Vec.getValueType();
15458 EVT EltVT = VecVT.getVectorElementType();
15459
15460 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15461 // => BUILD_VECTOR n x select (e, const-idx)
15462 if (isa<ConstantSDNode>(Idx) || !shouldExpandVectorDynExt(N))
15463 return SDValue();
15464
15465 SelectionDAG &DAG = DCI.DAG;
15466 SDLoc SL(N);
15467 SDValue Ins = N->getOperand(1);
15468 EVT IdxVT = Idx.getValueType();
15469
15471 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15472 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15473 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15474 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15475 Ops.push_back(V);
15476 }
15477
15478 return DAG.getBuildVector(VecVT, SL, Ops);
15479}
15480
15481/// Return the source of an fp_extend from f16 to f32, or a converted FP
15482/// constant.
15483static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15484 if (Src.getOpcode() == ISD::FP_EXTEND &&
15485 Src.getOperand(0).getValueType() == MVT::f16) {
15486 return Src.getOperand(0);
15487 }
15488
15489 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15490 APFloat Val = CFP->getValueAPF();
15491 bool LosesInfo = true;
15492 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15493 if (!LosesInfo)
15494 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15495 }
15496
15497 return SDValue();
15498}
15499
15500SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15501 DAGCombinerInfo &DCI) const {
15502 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15503 "combine only useful on gfx8");
15504
15505 SDValue TruncSrc = N->getOperand(0);
15506 EVT VT = N->getValueType(0);
15507 if (VT != MVT::f16)
15508 return SDValue();
15509
15510 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15511 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15512 return SDValue();
15513
15514 SelectionDAG &DAG = DCI.DAG;
15515 SDLoc SL(N);
15516
15517 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15518 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15519 // casting back.
15520
15521 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15522 // fmin(fmax(a, b), fmax(fmin(a, b), c))
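  // This relies on the median-of-three identity
  //   med3(a, b, c) == min(max(a, b), max(min(a, b), c)).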
15523 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15524 if (!A)
15525 return SDValue();
15526
15527 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15528 if (!B)
15529 return SDValue();
15530
15531 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15532 if (!C)
15533 return SDValue();
15534
15535 // This changes signaling nan behavior. If an input is a signaling nan, it
15536 // would have been quieted by the fpext originally. We don't care because
15537 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15538 // we would be worse off than just doing the promotion.
15539 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15540 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15541 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15542 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15543}
15544
15545unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15546 const SDNode *N0,
15547 const SDNode *N1) const {
15548 EVT VT = N0->getValueType(0);
15549
15550 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15551 // support denormals ever.
15552 if (((VT == MVT::f32 &&
15553 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15554 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15555 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15556 isOperationLegal(ISD::FMAD, VT))
15557 return ISD::FMAD;
15558
15559 const TargetOptions &Options = DAG.getTarget().Options;
15560 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15561 (N0->getFlags().hasAllowContract() &&
15562 N1->getFlags().hasAllowContract())) &&
15563 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15564 return ISD::FMA;
15565 }
15566
15567 return 0;
15568}
15569
15570// For a reassociatable opcode perform:
15571// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
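// Grouping the two uniform operands lets the inner op be selected to a scalar
// (SALU) instruction, leaving only the outer op to execute per-lane on the
// VALU.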
15572SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15573 SelectionDAG &DAG) const {
15574 EVT VT = N->getValueType(0);
15575 if (VT != MVT::i32 && VT != MVT::i64)
15576 return SDValue();
15577
15578 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15579 return SDValue();
15580
15581 unsigned Opc = N->getOpcode();
15582 SDValue Op0 = N->getOperand(0);
15583 SDValue Op1 = N->getOperand(1);
15584
15585 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15586 return SDValue();
15587
15588 if (Op0->isDivergent())
15589 std::swap(Op0, Op1);
15590
15591 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15592 return SDValue();
15593
15594 SDValue Op2 = Op1.getOperand(1);
15595 Op1 = Op1.getOperand(0);
15596 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15597 return SDValue();
15598
15599 if (Op1->isDivergent())
15600 std::swap(Op1, Op2);
15601
15602 SDLoc SL(N);
15603 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15604 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15605}
15606
15607static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15608 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15609 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15610 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15611 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15612 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15613}
15614
15615// Fold
15616// y = lshr i64 x, 32
15617// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15618// with Const.hi == -1
15619// To
15620// res = mad_u64_u32 y.lo, Const.lo, x.lo
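//
// This holds modulo 2^64: with y = x >> 32 and Const.hi == -1,
//   y * Const + x == x.hi * Const.lo + 2^32 * x.hi * Const.hi + x
//                 == x.hi * Const.lo - 2^32 * x.hi + (2^32 * x.hi + x.lo)
//                 == x.hi * Const.lo + x.lo,
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)).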
15621static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15622 SDValue MulLHS, SDValue MulRHS,
15623 SDValue AddRHS) {
15624 if (MulRHS.getOpcode() == ISD::SRL)
15625 std::swap(MulLHS, MulRHS);
15626
15627 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15628 return SDValue();
15629
15630 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15631 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15632 MulLHS.getOperand(0) != AddRHS)
15633 return SDValue();
15634
15635 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
15636 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15637 return SDValue();
15638
15639 SDValue ConstMul =
15640 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15641 return getMad64_32(DAG, SL, MVT::i64,
15642 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15643 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15644}
15645
15646// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15647// multiplies, if any.
15648//
15649// Full 64-bit multiplies that feed into an addition are lowered here instead
15650// of using the generic expansion. The generic expansion ends up with
15651// a tree of ADD nodes that prevents us from using the "add" part of the
15652// MAD instruction. The expansion produced here results in a chain of ADDs
15653// instead of a tree.
15654SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15655 DAGCombinerInfo &DCI) const {
15656 assert(N->isAnyAdd());
15657
15658 SelectionDAG &DAG = DCI.DAG;
15659 EVT VT = N->getValueType(0);
15660 SDLoc SL(N);
15661 SDValue LHS = N->getOperand(0);
15662 SDValue RHS = N->getOperand(1);
15663
15664 if (VT.isVector())
15665 return SDValue();
15666
15667 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15668 // result in scalar registers for uniform values.
15669 if (!N->isDivergent() && Subtarget->hasSMulHi())
15670 return SDValue();
15671
15672 unsigned NumBits = VT.getScalarSizeInBits();
15673 if (NumBits <= 32 || NumBits > 64)
15674 return SDValue();
15675
15676 if (LHS.getOpcode() != ISD::MUL) {
15677 assert(RHS.getOpcode() == ISD::MUL);
15678 std::swap(LHS, RHS);
15679 }
15680
15681 // Avoid the fold if it would unduly increase the number of multiplies due to
15682 // multiple uses, except on hardware with full-rate multiply-add (which is
15683 // part of full-rate 64-bit ops).
15684 if (!Subtarget->hasFullRate64Ops()) {
15685 unsigned NumUsers = 0;
15686 for (SDNode *User : LHS->users()) {
15687 // There is a use that does not feed into addition, so the multiply can't
15688 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15689 if (!User->isAnyAdd())
15690 return SDValue();
15691
15692 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15693 // MUL + 3xADD + 3xADDC over 3xMAD.
15694 ++NumUsers;
15695 if (NumUsers >= 3)
15696 return SDValue();
15697 }
15698 }
15699
15700 SDValue MulLHS = LHS.getOperand(0);
15701 SDValue MulRHS = LHS.getOperand(1);
15702 SDValue AddRHS = RHS;
15703
15704 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15705 return FoldedMAD;
15706
15707 // Always check whether operands are small unsigned values, since that
15708 // knowledge is useful in more cases. Check for small signed values only if
15709 // doing so can unlock a shorter code sequence.
15710 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15711 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15712
15713 bool MulSignedLo = false;
15714 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15715 MulSignedLo =
15716 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15717 }
15718
15719 // The operands and final result all have the same number of bits. If
15720 // operands need to be extended, they can be extended with garbage. The
15721 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15722 // truncated away in the end.
15723 if (VT != MVT::i64) {
15724 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15725 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15726 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15727 }
15728
15729 // The basic code generated is conceptually straightforward. Pseudo code:
15730 //
15731 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15732 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15733 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15734 //
15735 // The second and third lines are optional, depending on whether the factors
15736 // are {sign,zero}-extended or not.
15737 //
15738 // The actual DAG is noisier than the pseudo code, but only due to
15739 // instructions that disassemble values into low and high parts, and
15740 // assemble the final result.
15741 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15742
15743 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15744 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15745 SDValue Accum =
15746 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15747
15748 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15749 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15750
15751 if (!MulLHSUnsigned32) {
15752 auto MulLHSHi =
15753 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15754 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15755 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15756 }
15757
15758 if (!MulRHSUnsigned32) {
15759 auto MulRHSHi =
15760 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15761 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15762 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15763 }
15764
15765 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15766 Accum = DAG.getBitcast(MVT::i64, Accum);
15767 }
15768
15769 if (VT != MVT::i64)
15770 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15771 return Accum;
15772}
15773
15774SDValue
15775SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15776 DAGCombinerInfo &DCI) const {
15777 SDValue RHS = N->getOperand(1);
15778 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15779 if (!CRHS)
15780 return SDValue();
15781
15782 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15783 // common.
15784 uint64_t Val = CRHS->getZExtValue();
15785 if (countr_zero(Val) >= 32) {
15786 SelectionDAG &DAG = DCI.DAG;
15787 SDLoc SL(N);
15788 SDValue LHS = N->getOperand(0);
15789
15790 // Avoid carry machinery if we know the low half of the add does not
15791 // contribute to the final result.
15792 //
15793 // add i64:x, K if computeTrailingZeros(K) >= 32
15794 // => build_pair (add x.hi, K.hi), x.lo
15795
15796 // Breaking the 64-bit add here with this strange constant is unlikely
15797 // to interfere with addressing mode patterns.
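    // For example, with K = 0x500000000 only the high half changes:
    //   add i64:x, K ==> build_pair x.lo, (add x.hi, 5)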
15798
15799 SDValue Hi = getHiHalf64(LHS, DAG);
15800 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15801 unsigned Opcode = N->getOpcode();
15802 if (Opcode == ISD::PTRADD)
15803 Opcode = ISD::ADD;
15804 SDValue AddHi =
15805 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15806
15807 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15808 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15809 }
15810
15811 return SDValue();
15812}
15813
15814// Collect the ultimate src of each of the mul node's operands, and confirm
15815// each operand is an 8-bit value (only its low byte may be non-zero).
15816static std::optional<ByteProvider<SDValue>>
15817handleMulOperand(const SDValue &MulOperand) {
15818 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15819 if (!Byte0 || Byte0->isConstantZero()) {
15820 return std::nullopt;
15821 }
15822 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15823 if (Byte1 && !Byte1->isConstantZero()) {
15824 return std::nullopt;
15825 }
15826 return Byte0;
15827}
15828
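// Merge two v_perm_b32 byte-select masks. A select byte of 0x0c reads the
// constant zero, so wherever one mask selects a real byte and the other
// selects zero, the real selection is kept; 0x0c survives only where both
// masks select zero.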
15829static unsigned addPermMasks(unsigned First, unsigned Second) {
15830 unsigned FirstCs = First & 0x0c0c0c0c;
15831 unsigned SecondCs = Second & 0x0c0c0c0c;
15832 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15833 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15834
15835 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15836 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15837 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15838 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15839
15840 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15841}
15842
15843struct DotSrc {
15844 SDValue SrcOp;
15845 int64_t PermMask;
15846 int64_t DWordOffset;
15847};
15848
15849static void placeSources(ByteProvider<SDValue> &Src0,
15850 ByteProvider<SDValue> &Src1,
15851 SmallVectorImpl<DotSrc> &Src0s,
15852 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15853
15854 assert(Src0.Src.has_value() && Src1.Src.has_value());
15855 // Src0s and Src1s are empty, just place arbitrarily.
15856 if (Step == 0) {
15857 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15858 Src0.SrcOffset / 4});
15859 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15860 Src1.SrcOffset / 4});
15861 return;
15862 }
15863
15864 for (int BPI = 0; BPI < 2; BPI++) {
15865 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15866 if (BPI == 1) {
15867 BPP = {Src1, Src0};
15868 }
15869 unsigned ZeroMask = 0x0c0c0c0c;
15870 unsigned FMask = 0xFF << (8 * (3 - Step));
15871
15872 unsigned FirstMask =
15873 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15874 unsigned SecondMask =
15875 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15876 // Attempt to find the Src vector which contains our SDValue; if found, add
15877 // our perm mask to the existing one. If we are unable to find a match for
15878 // the first SDValue, attempt to find a match for the second.
15879 int FirstGroup = -1;
15880 for (int I = 0; I < 2; I++) {
15881 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15882 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15883 return IterElt.SrcOp == *BPP.first.Src &&
15884 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15885 };
15886
15887 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15888 if (Match != Srcs.end()) {
15889 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15890 FirstGroup = I;
15891 break;
15892 }
15893 }
15894 if (FirstGroup != -1) {
15895 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15896 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15897 return IterElt.SrcOp == *BPP.second.Src &&
15898 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15899 };
15900 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15901 if (Match != Srcs.end()) {
15902 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15903 } else
15904 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15905 return;
15906 }
15907 }
15908
15909 // If we have made it here, then we could not find a match in Src0s or Src1s
15910 // for either Src0 or Src1, so just place them arbitrarily.
15911
15912 unsigned ZeroMask = 0x0c0c0c0c;
15913 unsigned FMask = 0xFF << (8 * (3 - Step));
15914
15915 Src0s.push_back(
15916 {*Src0.Src,
15917 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15918 Src0.SrcOffset / 4});
15919 Src1s.push_back(
15920 {*Src1.Src,
15921 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15922 Src1.SrcOffset / 4});
15923}
15924
15925static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15926 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15927 bool IsAny) {
15928
15929 // If we have only one source, just permute it accordingly.
15930 if (Srcs.size() == 1) {
15931 auto *Elt = Srcs.begin();
15932 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15933
15934 // v_perm will produce the original value
15935 if (Elt->PermMask == 0x3020100)
15936 return EltOp;
15937
15938 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15939 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15940 }
15941
15942 auto *FirstElt = Srcs.begin();
15943 auto *SecondElt = std::next(FirstElt);
15944
15945 SmallVector<SDValue, 2> Perms;
15946
15947 // If we have multiple sources in the chain, combine them via perms (using
15948 // calculated perm mask) and Ors.
15949 while (true) {
15950 auto FirstMask = FirstElt->PermMask;
15951 auto SecondMask = SecondElt->PermMask;
15952
15953 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15954 unsigned FirstPlusFour = FirstMask | 0x04040404;
15955 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15956 // original 0x0C.
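    // Rebase FirstElt's byte selects by 4 so that the two inputs address
    // disjoint byte ranges of the combined v_perm sources; selects that were
    // 0x0c (constant zero) are restored by or'ing FirstCs back in.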
15957 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15958
15959 auto PermMask = addPermMasks(FirstMask, SecondMask);
15960 auto FirstVal =
15961 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15962 auto SecondVal =
15963 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15964
15965 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15966 SecondVal,
15967 DAG.getConstant(PermMask, SL, MVT::i32)));
15968
15969 FirstElt = std::next(SecondElt);
15970 if (FirstElt == Srcs.end())
15971 break;
15972
15973 SecondElt = std::next(FirstElt);
15974 // If we only have a FirstElt, then just combine that into the cumulative
15975 // source node.
15976 if (SecondElt == Srcs.end()) {
15977 auto EltOp =
15978 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15979
15980 Perms.push_back(
15981 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15982 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15983 break;
15984 }
15985 }
15986
15987 assert(Perms.size() == 1 || Perms.size() == 2);
15988 return Perms.size() == 2
15989 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15990 : Perms[0];
15991}
15992
15993static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15994 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15995 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15996 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15997 EntryMask += ZeroMask;
15998 }
15999}
16000
16001static bool isMul(const SDValue Op) {
16002 auto Opcode = Op.getOpcode();
16003
16004 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16005 Opcode == AMDGPUISD::MUL_I24);
16006}
16007
16008static std::optional<bool>
16009checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16010 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16011 const SDValue &S1Op, const SelectionDAG &DAG) {
16012 // If both ops are i8s (pre legalize-dag), then the signedness semantics
16013 // of the dot4 are irrelevant.
16014 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16015 return false;
16016
16017 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16018 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16019 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16020 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16021 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16022 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16023
16024 assert(!(S0IsUnsigned && S0IsSigned));
16025 assert(!(S1IsUnsigned && S1IsSigned));
16026
16027 // There are 9 possible permutations of
16028 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16029
16030 // In two permutations, the sign bits are known to be the same for both Ops,
16031 // so simply return Signed / Unsigned corresponding to the MSB
16032
16033 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16034 return S0IsSigned;
16035
16036 // In another two permutations, the sign bits are known to be opposite. In
16037 // this case return std::nullopt to indicate a bad match.
16038
16039 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16040 return std::nullopt;
16041
16042 // In the remaining five permutations, we don't know the value of the sign
16043 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16044 // the upper bits must be extension bits. Thus, the only ways for the sign
16045 // bit to be unknown are if it was sign extended from an unknown value, or if
16046 // it was any extended. In either case, it is correct to use the signed
16047 // version of the dot4 signedness semantics.
16048
16049 // In two of these permutations, we know the sign bit is set for
16050 // one op, and the other is unknown. It is okay to use the signed version of
16051 // dot4.
16052 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16053 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16054 return true;
16055
16056 // In one such permutation, we don't know either of the sign bits. It is okay
16057 // to use the signed version of dot4.
16058 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16059 return true;
16060
16061 // In two of these permutations, we know the sign bit is unset for
16062 // one op, and the other is unknown. Return std::nullopt to indicate a
16063 // bad match.
16064 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16065 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16066 return std::nullopt;
16067
16068 llvm_unreachable("Fully covered condition");
16069}
16070
16071SDValue SITargetLowering::performAddCombine(SDNode *N,
16072 DAGCombinerInfo &DCI) const {
16073 SelectionDAG &DAG = DCI.DAG;
16074 EVT VT = N->getValueType(0);
16075 SDLoc SL(N);
16076 SDValue LHS = N->getOperand(0);
16077 SDValue RHS = N->getOperand(1);
16078
16079 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16080 if (Subtarget->hasMad64_32()) {
16081 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16082 return Folded;
16083 }
16084 }
16085
16086 if (SDValue V = reassociateScalarOps(N, DAG)) {
16087 return V;
16088 }
16089
16090 if (VT == MVT::i64) {
16091 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16092 return Folded;
16093 }
16094
16095 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16096 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16097 SDValue TempNode(N, 0);
16098 std::optional<bool> IsSigned;
16099 SmallVector<DotSrc, 4> Src0s;
16100 SmallVector<DotSrc, 4> Src1s;
16101 SmallVector<SDValue, 4> Src2s;
16102
16103 // Match the v_dot4 tree, while collecting src nodes.
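    // The expected shape is a chain such as
    //   add (mul a0, b0), (add (mul a1, b1), (add (mul a2, b2), (mul a3, b3)))
    // where each ai/bi is a single byte; the leftover addend (if any) becomes
    // the dot4 accumulator.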
16104 int ChainLength = 0;
16105 for (int I = 0; I < 4; I++) {
16106 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16107 if (MulIdx == -1)
16108 break;
16109 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16110 if (!Src0)
16111 break;
16112 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16113 if (!Src1)
16114 break;
16115
16116 auto IterIsSigned = checkDot4MulSignedness(
16117 TempNode->getOperand(MulIdx), *Src0, *Src1,
16118 TempNode->getOperand(MulIdx)->getOperand(0),
16119 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16120 if (!IterIsSigned)
16121 break;
16122 if (!IsSigned)
16123 IsSigned = *IterIsSigned;
16124 if (*IterIsSigned != *IsSigned)
16125 break;
16126 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16127 auto AddIdx = 1 - MulIdx;
16128 // Allow the special case where add (add (mul24, 0), mul24) was folded
16129 // into add (mul24, mul24).
16130 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16131 Src2s.push_back(TempNode->getOperand(AddIdx));
16132 auto Src0 =
16133 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16134 if (!Src0)
16135 break;
16136 auto Src1 =
16137 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16138 if (!Src1)
16139 break;
16140 auto IterIsSigned = checkDot4MulSignedness(
16141 TempNode->getOperand(AddIdx), *Src0, *Src1,
16142 TempNode->getOperand(AddIdx)->getOperand(0),
16143 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16144 if (!IterIsSigned)
16145 break;
16146 assert(IsSigned);
16147 if (*IterIsSigned != *IsSigned)
16148 break;
16149 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16150 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16151 ChainLength = I + 2;
16152 break;
16153 }
16154
16155 TempNode = TempNode->getOperand(AddIdx);
16156 Src2s.push_back(TempNode);
16157 ChainLength = I + 1;
16158 if (TempNode->getNumOperands() < 2)
16159 break;
16160 LHS = TempNode->getOperand(0);
16161 RHS = TempNode->getOperand(1);
16162 }
16163
16164 if (ChainLength < 2)
16165 return SDValue();
16166
16167 // Masks were constructed with the assumption that we would find a chain of
16168 // length 4. If not, then we need to zero out the most significant bytes (via
16169 // a perm mask of 0x0c) so they do not affect the dot calculation.
16170 if (ChainLength < 4) {
16171 fixMasks(Src0s, ChainLength);
16172 fixMasks(Src1s, ChainLength);
16173 }
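// Note (based on the comment above): a perm select byte of 0x0c yields the
// constant 0x00 in v_perm_b32, so pointing the unused byte lanes at 0x0c
// zeroes them in the packed dword and they contribute nothing to the dot.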
16174
16175 SDValue Src0, Src1;
16176
16177 // If we are just using a single source for both, and have permuted the
16178 // bytes consistently, we can just use the sources without permuting
16179 // (commutation).
16180 bool UseOriginalSrc = false;
16181 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16182 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16183 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16184 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16185 SmallVector<unsigned, 4> SrcBytes;
16186 auto Src0Mask = Src0s.begin()->PermMask;
16187 SrcBytes.push_back(Src0Mask & 0xFF000000);
16188 bool UniqueEntries = true;
16189 for (auto I = 1; I < 4; I++) {
16190 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16191
16192 if (is_contained(SrcBytes, NextByte)) {
16193 UniqueEntries = false;
16194 break;
16195 }
16196 SrcBytes.push_back(NextByte);
16197 }
16198
16199 if (UniqueEntries) {
16200 UseOriginalSrc = true;
16201
16202 auto *FirstElt = Src0s.begin();
16203 auto FirstEltOp =
16204 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16205
16206 auto *SecondElt = Src1s.begin();
16207 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16208 SecondElt->DWordOffset);
16209
16210 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16211 MVT::getIntegerVT(32));
16212 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16213 MVT::getIntegerVT(32));
16214 }
16215 }
16216
16217 if (!UseOriginalSrc) {
16218 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16219 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16220 }
16221
16222 assert(IsSigned);
16223 SDValue Src2 =
16224 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16225
16226 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16227 : Intrinsic::amdgcn_udot4,
16228 SL, MVT::i64);
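// Semantics of the selected intrinsic (informal sketch):
//   sdot4(a, b, acc, clamp) = acc + sum_i sext(a.byte[i]) * sext(b.byte[i])
//   udot4(a, b, acc, clamp) = acc + sum_i zext(a.byte[i]) * zext(b.byte[i])
// The trailing i1 operand built below is the clamp bit, left at 0 here.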
16229
16230 assert(!VT.isVector());
16231 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16232 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16233
16234 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16235 }
16236
16237 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16238 return SDValue();
16239
16240 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16241 // add x, sext (setcc) => usubo_carry x, 0, setcc
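// Why this is sound (note): zext(setcc) is 0 or 1, so adding it is exactly
// adding a carry-in (uaddo_carry x, 0, cc); sext(setcc) is 0 or -1, so adding
// it is exactly subtracting a borrow (usubo_carry x, 0, cc).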
16242 unsigned Opc = LHS.getOpcode();
16243 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16244 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16245 std::swap(RHS, LHS);
16246
16247 Opc = RHS.getOpcode();
16248 switch (Opc) {
16249 default:
16250 break;
16251 case ISD::ZERO_EXTEND:
16252 case ISD::SIGN_EXTEND:
16253 case ISD::ANY_EXTEND: {
16254 auto Cond = RHS.getOperand(0);
16255 // If this won't be a real VOPC output, we would still need to insert an
16256 // extra instruction anyway.
16257 if (!isBoolSGPR(Cond))
16258 break;
16259 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16260 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16261 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16262 return DAG.getNode(Opc, SL, VTList, Args);
16263 }
16264 case ISD::UADDO_CARRY: {
16265 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16266 if (!isNullConstant(RHS.getOperand(1)))
16267 break;
16268 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16269 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16270 }
16271 }
16272 return SDValue();
16273}
16274
16275SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16276 DAGCombinerInfo &DCI) const {
16277 SelectionDAG &DAG = DCI.DAG;
16278 SDLoc DL(N);
16279 EVT VT = N->getValueType(0);
16280 SDValue N0 = N->getOperand(0);
16281 SDValue N1 = N->getOperand(1);
16282
16283 // The following folds transform PTRADDs into regular arithmetic in cases
16284 // where the PTRADD wouldn't be folded as an immediate offset into memory
16285 // instructions anyway. They are target-specific in that other targets might
16286 // prefer to not lose information about the pointer arithmetic.
16287
16288 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16289 // Adapted from DAGCombiner::visitADDLikeCommutative.
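  // For example (sketch): (ptradd p, (shl (sub 0, idx), 3)) becomes
  // (sub p, (shl idx, 3)), i.e. p - idx * 8, which would not have been folded
  // into an immediate offset anyway.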
16290 SDValue V, K;
16291 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16292 SDNodeFlags ShlFlags = N1->getFlags();
16293 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16294 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16295 // preserved.
16296 SDNodeFlags NewShlFlags =
16297 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16298 ? SDNodeFlags::NoSignedWrap
16299 : SDNodeFlags();
16300 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16301 DCI.AddToWorklist(Inner.getNode());
16302 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16303 }
16304
16305 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16306 // performAddCombine.
16307 if (N1.getOpcode() == ISD::MUL) {
16308 if (Subtarget->hasMad64_32()) {
16309 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16310 return Folded;
16311 }
16312 }
16313
16314 // If the 32 low bits of the constant are all zero, there is nothing to fold
16315 // into an immediate offset, so it's better to eliminate the unnecessary
16316 // addition for the lower 32 bits than to preserve the PTRADD.
16317 // Analogous to a fold in performAddCombine.
16318 if (VT == MVT::i64) {
16319 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16320 return Folded;
16321 }
16322
16323 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16324 return SDValue();
16325
16326 SDValue X = N0;
16327 SDValue Y = N1.getOperand(0);
16328 SDValue Z = N1.getOperand(1);
16329 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16330 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16331
16332 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16333 Y->isDivergent() != Z->isDivergent()) {
16334 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16335 // y are uniform and z isn't.
16336 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16337 // z are uniform and y isn't.
16338 // The goal is to push uniform operands up in the computation, so that they
16339 // can be handled with scalar operations. We can't use reassociateScalarOps
16340 // for this since it requires two identical commutative operations to
16341 // reassociate.
16342 if (Y->isDivergent())
16343 std::swap(Y, Z);
16344 // If both additions in the original were NUW, reassociation preserves that.
16345 SDNodeFlags ReassocFlags =
16346 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16347 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16348 DCI.AddToWorklist(UniformInner.getNode());
16349 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16350 }
16351
16352 return SDValue();
16353}
16354
16355SDValue SITargetLowering::performSubCombine(SDNode *N,
16356 DAGCombinerInfo &DCI) const {
16357 SelectionDAG &DAG = DCI.DAG;
16358 EVT VT = N->getValueType(0);
16359
16360 if (VT == MVT::i64) {
16361 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16362 return Folded;
16363 }
16364
16365 if (VT != MVT::i32)
16366 return SDValue();
16367
16368 SDLoc SL(N);
16369 SDValue LHS = N->getOperand(0);
16370 SDValue RHS = N->getOperand(1);
16371
16372 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16373 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
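  // As in performAddCombine (note): subtracting zext(setcc) (0 or 1) is
  // exactly subtracting a borrow, and subtracting sext(setcc) (0 or -1) is
  // exactly adding a carry.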
16374 unsigned Opc = RHS.getOpcode();
16375 switch (Opc) {
16376 default:
16377 break;
16378 case ISD::ZERO_EXTEND:
16379 case ISD::SIGN_EXTEND:
16380 case ISD::ANY_EXTEND: {
16381 auto Cond = RHS.getOperand(0);
16382 // If this won't be a real VOPC output, we would still need to insert an
16383 // extra instruction anyway.
16384 if (!isBoolSGPR(Cond))
16385 break;
16386 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16387 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16388 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16389 return DAG.getNode(Opc, SL, VTList, Args);
16390 }
16391 }
16392
16393 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16394 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16395 if (!isNullConstant(LHS.getOperand(1)))
16396 return SDValue();
16397 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16398 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16399 }
16400 return SDValue();
16401}
16402
16403SDValue
16404SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16405 DAGCombinerInfo &DCI) const {
16406
16407 if (N->getValueType(0) != MVT::i32)
16408 return SDValue();
16409
16410 if (!isNullConstant(N->getOperand(1)))
16411 return SDValue();
16412
16413 SelectionDAG &DAG = DCI.DAG;
16414 SDValue LHS = N->getOperand(0);
16415
16416 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16417 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16418 unsigned LHSOpc = LHS.getOpcode();
16419 unsigned Opc = N->getOpcode();
16420 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16421 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16422 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16423 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16424 }
16425 return SDValue();
16426}
16427
16428SDValue SITargetLowering::performFAddCombine(SDNode *N,
16429 DAGCombinerInfo &DCI) const {
16430 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16431 return SDValue();
16432
16433 SelectionDAG &DAG = DCI.DAG;
16434 EVT VT = N->getValueType(0);
16435
16436 SDLoc SL(N);
16437 SDValue LHS = N->getOperand(0);
16438 SDValue RHS = N->getOperand(1);
16439
16440 // These should really be instruction patterns, but writing patterns with
16441 // source modifiers is a pain.
16442
16443 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16444 if (LHS.getOpcode() == ISD::FADD) {
16445 SDValue A = LHS.getOperand(0);
16446 if (A == LHS.getOperand(1)) {
16447 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16448 if (FusedOp != 0) {
16449 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16450 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16451 }
16452 }
16453 }
16454
16455 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16456 if (RHS.getOpcode() == ISD::FADD) {
16457 SDValue A = RHS.getOperand(0);
16458 if (A == RHS.getOperand(1)) {
16459 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16460 if (FusedOp != 0) {
16461 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16462 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16463 }
16464 }
16465 }
16466
16467 return SDValue();
16468}
16469
16470SDValue SITargetLowering::performFSubCombine(SDNode *N,
16471 DAGCombinerInfo &DCI) const {
16472 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16473 return SDValue();
16474
16475 SelectionDAG &DAG = DCI.DAG;
16476 SDLoc SL(N);
16477 EVT VT = N->getValueType(0);
16478 assert(!VT.isVector());
16479
16480 // Try to get the fneg to fold into the source modifier. This undoes generic
16481 // DAG combines and folds them into the mad.
16482 //
16483 // Only do this if we are not trying to support denormals. v_mad_f32 does
16484 // not support denormals ever.
16485 SDValue LHS = N->getOperand(0);
16486 SDValue RHS = N->getOperand(1);
16487 if (LHS.getOpcode() == ISD::FADD) {
16488 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16489 SDValue A = LHS.getOperand(0);
16490 if (A == LHS.getOperand(1)) {
16491 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16492 if (FusedOp != 0) {
16493 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16494 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16495
16496 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16497 }
16498 }
16499 }
16500
16501 if (RHS.getOpcode() == ISD::FADD) {
16502 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16503
16504 SDValue A = RHS.getOperand(0);
16505 if (A == RHS.getOperand(1)) {
16506 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16507 if (FusedOp != 0) {
16508 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16509 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16510 }
16511 }
16512 }
16513
16514 return SDValue();
16515}
16516
16517SDValue SITargetLowering::performFDivCombine(SDNode *N,
16518 DAGCombinerInfo &DCI) const {
16519 SelectionDAG &DAG = DCI.DAG;
16520 SDLoc SL(N);
16521 EVT VT = N->getValueType(0);
16522
16523 // fsqrt legality correlates to rsq availability.
16524 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16525 return SDValue();
16526
16527 SDValue LHS = N->getOperand(0);
16528 SDValue RHS = N->getOperand(1);
16529
16530 SDNodeFlags Flags = N->getFlags();
16531 SDNodeFlags RHSFlags = RHS->getFlags();
16532 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16533 !RHS->hasOneUse())
16534 return SDValue();
16535
16536 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16537 bool IsNegative = false;
16538 if (CLHS->isExactlyValue(1.0) ||
16539 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16540 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16541 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16542 if (RHS.getOpcode() == ISD::FSQRT) {
16543 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16544 SDValue Rsq =
16545 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16546 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16547 }
16548 }
16549 }
16550
16551 return SDValue();
16552}
16553
16554SDValue SITargetLowering::performFMulCombine(SDNode *N,
16555 DAGCombinerInfo &DCI) const {
16556 SelectionDAG &DAG = DCI.DAG;
16557 EVT VT = N->getValueType(0);
16558 EVT ScalarVT = VT.getScalarType();
16559 EVT IntVT = VT.changeElementType(MVT::i32);
16560
16561 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16562 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16563 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16564 return SDValue();
16565 }
16566
16567 SDValue LHS = N->getOperand(0);
16568 SDValue RHS = N->getOperand(1);
16569
16570 // It is cheaper to materialize i32 inline constants than to materialize
16571 // f16 or f64 (or even non-inline f32) values, which is possible via
16572 // ldexp, as shown below:
16573 //
16574 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16575 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16576 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
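  // Concrete instance (sketch): fmul x, (select y, 8.0, 0.5)
  //   -> fldexp(x, (select i32 y, 3, -1))        since 8.0 = 2^3, 0.5 = 2^-1
  // and fmul x, (select y, -8.0, -0.5)
  //   -> fldexp(fneg x, (select i32 y, 3, -1))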
16577 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16578 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16579 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16580 if (!TrueNode)
16581 return SDValue();
16582 const ConstantFPSDNode *FalseNode =
16583 isConstOrConstSplatFP(RHS.getOperand(2));
16584 if (!FalseNode)
16585 return SDValue();
16586
16587 if (TrueNode->isNegative() != FalseNode->isNegative())
16588 return SDValue();
16589
16590 // For f32, only non-inline constants should be transformed.
16591 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16592 if (ScalarVT == MVT::f32 &&
16593 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16594 TII->isInlineConstant(FalseNode->getValueAPF()))
16595 return SDValue();
16596
16597 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16598 if (TrueNodeExpVal == INT_MIN)
16599 return SDValue();
16600 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16601 if (FalseNodeExpVal == INT_MIN)
16602 return SDValue();
16603
16604 SDLoc SL(N);
16605 SDValue SelectNode =
16606 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16607 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16608 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16609
16610 LHS = TrueNode->isNegative()
16611 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16612 : LHS;
16613
16614 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16615 }
16616
16617 return SDValue();
16618}
16619
16620SDValue SITargetLowering::performFMACombine(SDNode *N,
16621 DAGCombinerInfo &DCI) const {
16622 SelectionDAG &DAG = DCI.DAG;
16623 EVT VT = N->getValueType(0);
16624 SDLoc SL(N);
16625
16626 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16627 return SDValue();
16628
16629 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16630 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
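// In other words, the two extracted lanes of the multiplies must together
// cover both halves of the same v2f16 sources (Idx1 != Idx2 below), so the
// whole expression is S0.x*S1.x + S0.y*S1.y + z.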
16631 SDValue Op1 = N->getOperand(0);
16632 SDValue Op2 = N->getOperand(1);
16633 SDValue FMA = N->getOperand(2);
16634
16635 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16636 Op2.getOpcode() != ISD::FP_EXTEND)
16637 return SDValue();
16638
16639 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16640 // regardless of the denorm mode setting. Therefore,
16641 // fp-contract is sufficient to allow generating fdot2.
16642 const TargetOptions &Options = DAG.getTarget().Options;
16643 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16644 (N->getFlags().hasAllowContract() &&
16645 FMA->getFlags().hasAllowContract())) {
16646 Op1 = Op1.getOperand(0);
16647 Op2 = Op2.getOperand(0);
16648 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16649 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16650 return SDValue();
16651
16652 SDValue Vec1 = Op1.getOperand(0);
16653 SDValue Idx1 = Op1.getOperand(1);
16654 SDValue Vec2 = Op2.getOperand(0);
16655
16656 SDValue FMAOp1 = FMA.getOperand(0);
16657 SDValue FMAOp2 = FMA.getOperand(1);
16658 SDValue FMAAcc = FMA.getOperand(2);
16659
16660 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16661 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16662 return SDValue();
16663
16664 FMAOp1 = FMAOp1.getOperand(0);
16665 FMAOp2 = FMAOp2.getOperand(0);
16666 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16667 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16668 return SDValue();
16669
16670 SDValue Vec3 = FMAOp1.getOperand(0);
16671 SDValue Vec4 = FMAOp2.getOperand(0);
16672 SDValue Idx2 = FMAOp1.getOperand(1);
16673
16674 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16675 // Idx1 and Idx2 cannot be the same.
16676 Idx1 == Idx2)
16677 return SDValue();
16678
16679 if (Vec1 == Vec2 || Vec3 == Vec4)
16680 return SDValue();
16681
16682 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16683 return SDValue();
16684
16685 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16686 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16687 DAG.getTargetConstant(0, SL, MVT::i1));
16688 }
16689 }
16690 return SDValue();
16691}
16692
16693SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16694 DAGCombinerInfo &DCI) const {
16695 SelectionDAG &DAG = DCI.DAG;
16696 SDLoc SL(N);
16697
16698 SDValue LHS = N->getOperand(0);
16699 SDValue RHS = N->getOperand(1);
16700 EVT VT = LHS.getValueType();
16701 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16702
16703 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16704 if (!CRHS) {
16705 CRHS = dyn_cast<ConstantSDNode>(LHS);
16706 if (CRHS) {
16707 std::swap(LHS, RHS);
16708 CC = getSetCCSwappedOperands(CC);
16709 }
16710 }
16711
16712 if (CRHS) {
16713 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16714 isBoolSGPR(LHS.getOperand(0))) {
16715 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16716 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16717 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16718 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16719 if ((CRHS->isAllOnes() &&
16720 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16721 (CRHS->isZero() &&
16722 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16723 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16724 DAG.getAllOnesConstant(SL, MVT::i1));
16725 if ((CRHS->isAllOnes() &&
16726 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16727 (CRHS->isZero() &&
16728 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16729 return LHS.getOperand(0);
16730 }
16731
16732 const APInt &CRHSVal = CRHS->getAPIntValue();
16733 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16734 LHS.getOpcode() == ISD::SELECT &&
16735 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16736 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16737 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16738 isBoolSGPR(LHS.getOperand(0))) {
16739 // Given CT != FT:
16740 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16741 // setcc (select cc, CT, CF), CF, ne => cc
16742 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16743 // setcc (select cc, CT, CF), CT, eq => cc
16744 const APInt &CT = LHS.getConstantOperandAPInt(1);
16745 const APInt &CF = LHS.getConstantOperandAPInt(2);
16746
16747 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16748 (CT == CRHSVal && CC == ISD::SETNE))
16749 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16750 DAG.getAllOnesConstant(SL, MVT::i1));
16751 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16752 (CT == CRHSVal && CC == ISD::SETEQ))
16753 return LHS.getOperand(0);
16754 }
16755 }
16756
16757 // Eliminate setcc by using carryout from add/sub instruction
16758
16759 // LHS = ADD i64 RHS, Z          LHSlo = UADDO       i32 RHSlo, Zlo
16760 // setcc LHS ult RHS     ->      LHSHi = UADDO_CARRY i32 RHShi, Zhi
16761 // similarly for subtraction
16762
16763 // LHS = ADD i64 Y, 1            LHSlo = UADDO       i32 Ylo, 1
16764 // setcc LHS eq 0        ->      LHSHi = UADDO_CARRY i32 Yhi, 0
16765
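  // Intuition (sketch): for a 64-bit a + b, the check (a + b) ult a is true
  // exactly when the addition wraps, which is the carry-out of the split
  // 32-bit UADDO / UADDO_CARRY pair built below; the setcc result is therefore
  // just the i1 overflow value of the high half.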
16766 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16768 (CC == ISD::SETUGT &&
16770 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16771 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16772 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16773
16774 SDValue Op0 = LHS.getOperand(0);
16775 SDValue Op1 = LHS.getOperand(1);
16776
16777 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16778 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16779
16780 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16781 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16782
16783 SDValue NodeLo =
16784 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16785 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16786
16787 SDValue CarryInHi = NodeLo.getValue(1);
16788 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16789 SL, DAG.getVTList(MVT::i32, MVT::i1),
16790 {Op0Hi, Op1Hi, CarryInHi});
16791
16792 SDValue ResultLo = NodeLo.getValue(0);
16793 SDValue ResultHi = NodeHi.getValue(0);
16794
16795 SDValue JoinedResult =
16796 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16797
16798 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16799 SDValue Overflow = NodeHi.getValue(1);
16800 DCI.CombineTo(LHS.getNode(), Result);
16801 return Overflow;
16802 }
16803
16804 if (VT != MVT::f32 && VT != MVT::f64 &&
16805 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16806 return SDValue();
16807
16808 // Match isinf/isfinite pattern
16809 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16810 // (fcmp one (fabs x), inf) -> (fp_class x,
16811 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
16812 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16813 LHS.getOpcode() == ISD::FABS) {
16814 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16815 if (!CRHS)
16816 return SDValue();
16817
16818 const APFloat &APF = CRHS->getValueAPF();
16819 if (APF.isInfinity() && !APF.isNegative()) {
16820 const unsigned IsInfMask =
16821 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16822 const unsigned IsFiniteMask =
16823 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16824 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16825 SIInstrFlags::P_SUBNORMAL;
16826 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16827 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16828 DAG.getConstant(Mask, SL, MVT::i32));
16829 }
16830 }
16831
16832 return SDValue();
16833}
16834
16835SDValue
16836SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16837 DAGCombinerInfo &DCI) const {
16838 SelectionDAG &DAG = DCI.DAG;
16839 SDLoc SL(N);
16840 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16841
16842 SDValue Src = N->getOperand(0);
16843 SDValue Shift = N->getOperand(0);
16844
16845 // TODO: Extend type shouldn't matter (assuming legal types).
16846 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16847 Shift = Shift.getOperand(0);
16848
16849 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16850 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16851 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16852 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16853 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16854 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
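    // Worked example: for cvt_f32_ubyte1 (srl x, 16), Offset is 1 and the
    // shift adds 16 bits, so ShiftOffset = 8 * 1 + 16 = 24 and the node is
    // rebuilt as cvt_f32_ubyte3 of the unshifted value.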
16855 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16856 SDValue Shifted = DAG.getZExtOrTrunc(
16857 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16858
16859 unsigned ShiftOffset = 8 * Offset;
16860 if (Shift.getOpcode() == ISD::SHL)
16861 ShiftOffset -= C->getZExtValue();
16862 else
16863 ShiftOffset += C->getZExtValue();
16864
16865 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16866 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16867 MVT::f32, Shifted);
16868 }
16869 }
16870 }
16871
16872 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16873 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16874 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16875 // We simplified Src. If this node is not dead, visit it again so it is
16876 // folded properly.
16877 if (N->getOpcode() != ISD::DELETED_NODE)
16878 DCI.AddToWorklist(N);
16879 return SDValue(N, 0);
16880 }
16881
16882 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16883 if (SDValue DemandedSrc =
16884 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16885 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16886
16887 return SDValue();
16888}
16889
16890SDValue SITargetLowering::performClampCombine(SDNode *N,
16891 DAGCombinerInfo &DCI) const {
16892 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16893 if (!CSrc)
16894 return SDValue();
16895
16896 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16897 const APFloat &F = CSrc->getValueAPF();
16898 APFloat Zero = APFloat::getZero(F.getSemantics());
16899 if (F < Zero ||
16900 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16901 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16902 }
16903
16904 APFloat One(F.getSemantics(), "1.0");
16905 if (F > One)
16906 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16907
16908 return SDValue(CSrc, 0);
16909}
16910
16911SDValue SITargetLowering::performSelectCombine(SDNode *N,
16912 DAGCombinerInfo &DCI) const {
16913
16914 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16915 // integer).
16916 // Detect when CMP and SELECT use the same constant and fold them to avoid
16917 // loading the constant twice. Specifically handles patterns like:
16918 // %cmp = icmp eq i32 %val, 4242
16919 // %sel = select i1 %cmp, i32 4242, i32 %other
16920 // It can be optimized to reuse %val instead of 4242 in select.
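  // That is, the example above becomes (sketch):
  //   %sel = select i1 %cmp, i32 %val, i32 %other
  // which is equivalent because %cmp being true implies %val == 4242.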
16921 SDValue Cond = N->getOperand(0);
16922 SDValue TrueVal = N->getOperand(1);
16923 SDValue FalseVal = N->getOperand(2);
16924
16925 // Check if condition is a comparison.
16926 if (Cond.getOpcode() != ISD::SETCC)
16927 return SDValue();
16928
16929 SDValue LHS = Cond.getOperand(0);
16930 SDValue RHS = Cond.getOperand(1);
16931 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16932
16933 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16934 bool isInteger = LHS.getValueType().isInteger();
16935
16936 // Handle simple floating-point and integer types only.
16937 if (!isFloatingPoint && !isInteger)
16938 return SDValue();
16939
16940 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16941 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16942 if (!isEquality && !isNonEquality)
16943 return SDValue();
16944
16945 SDValue ArgVal, ConstVal;
16946 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16947 (isInteger && isa<ConstantSDNode>(RHS))) {
16948 ConstVal = RHS;
16949 ArgVal = LHS;
16950 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16951 (isInteger && isa<ConstantSDNode>(LHS))) {
16952 ConstVal = LHS;
16953 ArgVal = RHS;
16954 } else {
16955 return SDValue();
16956 }
16957
16958 // Skip optimization for inlinable immediates.
16959 if (isFloatingPoint) {
16960 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16961 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16962 return SDValue();
16963 } else {
16964 if (AMDGPU::isInlinableIntLiteral(
16965 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16966 return SDValue();
16967 }
16968
16969 // For equality and non-equality comparisons, patterns:
16970 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16971 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16972 if (!(isEquality && TrueVal == ConstVal) &&
16973 !(isNonEquality && FalseVal == ConstVal))
16974 return SDValue();
16975
16976 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16977 SDValue SelectRHS =
16978 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16979 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16980 SelectLHS, SelectRHS);
16981}
16982
16983 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16984 DAGCombinerInfo &DCI) const {
16985 switch (N->getOpcode()) {
16986 case ISD::ADD:
16987 case ISD::SUB:
16988 case ISD::SHL:
16989 case ISD::SRL:
16990 case ISD::SRA:
16991 case ISD::AND:
16992 case ISD::OR:
16993 case ISD::XOR:
16994 case ISD::MUL:
16995 case ISD::SETCC:
16996 case ISD::SELECT:
16997 case ISD::SMIN:
16998 case ISD::SMAX:
16999 case ISD::UMIN:
17000 case ISD::UMAX:
17001 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17002 return Res;
17003 break;
17004 default:
17005 break;
17006 }
17007
17008 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17009 return SDValue();
17010
17011 switch (N->getOpcode()) {
17012 case ISD::ADD:
17013 return performAddCombine(N, DCI);
17014 case ISD::PTRADD:
17015 return performPtrAddCombine(N, DCI);
17016 case ISD::SUB:
17017 return performSubCombine(N, DCI);
17018 case ISD::UADDO_CARRY:
17019 case ISD::USUBO_CARRY:
17020 return performAddCarrySubCarryCombine(N, DCI);
17021 case ISD::FADD:
17022 return performFAddCombine(N, DCI);
17023 case ISD::FSUB:
17024 return performFSubCombine(N, DCI);
17025 case ISD::FDIV:
17026 return performFDivCombine(N, DCI);
17027 case ISD::FMUL:
17028 return performFMulCombine(N, DCI);
17029 case ISD::SETCC:
17030 return performSetCCCombine(N, DCI);
17031 case ISD::SELECT:
17032 if (auto Res = performSelectCombine(N, DCI))
17033 return Res;
17034 break;
17035 case ISD::FMAXNUM:
17036 case ISD::FMINNUM:
17037 case ISD::FMAXNUM_IEEE:
17038 case ISD::FMINNUM_IEEE:
17039 case ISD::FMAXIMUM:
17040 case ISD::FMINIMUM:
17041 case ISD::FMAXIMUMNUM:
17042 case ISD::FMINIMUMNUM:
17043 case ISD::SMAX:
17044 case ISD::SMIN:
17045 case ISD::UMAX:
17046 case ISD::UMIN:
17047 case AMDGPUISD::FMIN_LEGACY:
17048 case AMDGPUISD::FMAX_LEGACY:
17049 return performMinMaxCombine(N, DCI);
17050 case ISD::FMA:
17051 return performFMACombine(N, DCI);
17052 case ISD::AND:
17053 return performAndCombine(N, DCI);
17054 case ISD::OR:
17055 return performOrCombine(N, DCI);
17056 case ISD::FSHR: {
17057 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17058 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17059 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17060 return matchPERM(N, DCI);
17061 }
17062 break;
17063 }
17064 case ISD::XOR:
17065 return performXorCombine(N, DCI);
17066 case ISD::ZERO_EXTEND:
17067 return performZeroExtendCombine(N, DCI);
17068 case ISD::SIGN_EXTEND_INREG:
17069 return performSignExtendInRegCombine(N, DCI);
17070 case AMDGPUISD::FP_CLASS:
17071 return performClassCombine(N, DCI);
17072 case ISD::FCANONICALIZE:
17073 return performFCanonicalizeCombine(N, DCI);
17074 case AMDGPUISD::RCP:
17075 return performRcpCombine(N, DCI);
17076 case ISD::FLDEXP:
17077 case AMDGPUISD::FRACT:
17078 case AMDGPUISD::RSQ:
17079 case AMDGPUISD::RCP_LEGACY:
17080 case AMDGPUISD::RCP_IFLAG:
17081 case AMDGPUISD::RSQ_CLAMP: {
17082 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17083 SDValue Src = N->getOperand(0);
17084 if (Src.isUndef())
17085 return Src;
17086 break;
17087 }
17088 case ISD::SINT_TO_FP:
17089 case ISD::UINT_TO_FP:
17090 return performUCharToFloatCombine(N, DCI);
17091 case ISD::FCOPYSIGN:
17092 return performFCopySignCombine(N, DCI);
17093 case AMDGPUISD::CVT_F32_UBYTE0:
17094 case AMDGPUISD::CVT_F32_UBYTE1:
17095 case AMDGPUISD::CVT_F32_UBYTE2:
17096 case AMDGPUISD::CVT_F32_UBYTE3:
17097 return performCvtF32UByteNCombine(N, DCI);
17098 case AMDGPUISD::FMED3:
17099 return performFMed3Combine(N, DCI);
17100 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17101 return performCvtPkRTZCombine(N, DCI);
17102 case AMDGPUISD::CLAMP:
17103 return performClampCombine(N, DCI);
17104 case ISD::SCALAR_TO_VECTOR: {
17105 SelectionDAG &DAG = DCI.DAG;
17106 EVT VT = N->getValueType(0);
17107
17108 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17109 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17110 SDLoc SL(N);
17111 SDValue Src = N->getOperand(0);
17112 EVT EltVT = Src.getValueType();
17113 if (EltVT != MVT::i16)
17114 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17115
17116 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17117 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17118 }
17119
17120 break;
17121 }
17122 case ISD::EXTRACT_VECTOR_ELT:
17123 return performExtractVectorEltCombine(N, DCI);
17124 case ISD::INSERT_VECTOR_ELT:
17125 return performInsertVectorEltCombine(N, DCI);
17126 case ISD::FP_ROUND:
17127 return performFPRoundCombine(N, DCI);
17128 case ISD::LOAD: {
17129 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17130 return Widened;
17131 [[fallthrough]];
17132 }
17133 default: {
17134 if (!DCI.isBeforeLegalize()) {
17135 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17136 return performMemSDNodeCombine(MemNode, DCI);
17137 }
17138
17139 break;
17140 }
17141 }
17142
17143 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17144}
17145
17146/// Helper function for adjustWritemask
17147static unsigned SubIdx2Lane(unsigned Idx) {
17148 switch (Idx) {
17149 default:
17150 return ~0u;
17151 case AMDGPU::sub0:
17152 return 0;
17153 case AMDGPU::sub1:
17154 return 1;
17155 case AMDGPU::sub2:
17156 return 2;
17157 case AMDGPU::sub3:
17158 return 3;
17159 case AMDGPU::sub4:
17160 return 4; // Possible with TFE/LWE
17161 }
17162}
17163
17164/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17165SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17166 SelectionDAG &DAG) const {
17167 unsigned Opcode = Node->getMachineOpcode();
17168
17169 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17170 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17171 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17172 return Node; // not implemented for D16
17173
17174 SDNode *Users[5] = {nullptr};
17175 unsigned Lane = 0;
17176 unsigned DmaskIdx =
17177 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17178 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17179 unsigned NewDmask = 0;
17180 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17181 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17182 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17183 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17184 unsigned TFCLane = 0;
17185 bool HasChain = Node->getNumValues() > 1;
17186
17187 if (OldDmask == 0) {
17188 // These are folded out, but on the chance it happens don't assert.
17189 return Node;
17190 }
17191
17192 unsigned OldBitsSet = llvm::popcount(OldDmask);
17193 // Work out which is the TFE/LWE lane if that is enabled.
17194 if (UsesTFC) {
17195 TFCLane = OldBitsSet;
17196 }
17197
17198 // Try to figure out the used register components
17199 for (SDUse &Use : Node->uses()) {
17200
17201 // Don't look at users of the chain.
17202 if (Use.getResNo() != 0)
17203 continue;
17204
17205 SDNode *User = Use.getUser();
17206
17207 // Abort if we can't understand the usage
17208 if (!User->isMachineOpcode() ||
17209 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17210 return Node;
17211
17212 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17213 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17214 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17215 // set, etc.
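    // Example (sketch): with OldDmask = 0b1101 (components x, z, w enabled), a
    // user extracting sub1 corresponds to Lane == 1, i.e. the second set bit,
    // so it maps to component z and NewDmask gets bit 2 set below.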
17216 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17217 if (Lane == ~0u)
17218 return Node;
17219
17220 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17221 if (UsesTFC && Lane == TFCLane) {
17222 Users[Lane] = User;
17223 } else {
17224 // Set which texture component corresponds to the lane.
17225 unsigned Comp;
17226 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17227 Comp = llvm::countr_zero(Dmask);
17228 Dmask &= ~(1 << Comp);
17229 }
17230
17231 // Abort if we have more than one user per component.
17232 if (Users[Lane])
17233 return Node;
17234
17235 Users[Lane] = User;
17236 NewDmask |= 1 << Comp;
17237 }
17238 }
17239
17240 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17241 bool NoChannels = !NewDmask;
17242 if (NoChannels) {
17243 if (!UsesTFC) {
17244 // No uses of the result and not using TFC. Then do nothing.
17245 return Node;
17246 }
17247 // If the original dmask has one channel - then nothing to do
17248 if (OldBitsSet == 1)
17249 return Node;
17250 // Use an arbitrary dmask - required for the instruction to work
17251 NewDmask = 1;
17252 }
17253 // Abort if there's no change
17254 if (NewDmask == OldDmask)
17255 return Node;
17256
17257 unsigned BitsSet = llvm::popcount(NewDmask);
17258
17259 // Check for TFE or LWE - increase the number of channels by one to account
17260 // for the extra return value
17261 // This will need adjustment for D16 if this is also included in
17262 // adjustWritemask (this function), but at present D16 is excluded.
17263 unsigned NewChannels = BitsSet + UsesTFC;
17264
17265 int NewOpcode =
17266 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17267 assert(NewOpcode != -1 &&
17268 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17269 "failed to find equivalent MIMG op");
17270
17271 // Adjust the writemask in the node
17272 SmallVector<SDValue, 12> Ops;
17273 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17274 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17275 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17276
17277 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17278
17279 MVT ResultVT = NewChannels == 1
17280 ? SVT
17281 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17282 : NewChannels == 5 ? 8
17283 : NewChannels);
17284 SDVTList NewVTList =
17285 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17286
17287 MachineSDNode *NewNode =
17288 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17289
17290 if (HasChain) {
17291 // Update chain.
17292 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17293 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17294 }
17295
17296 if (NewChannels == 1) {
17297 assert(Node->hasNUsesOfValue(1, 0));
17298 SDNode *Copy =
17299 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17300 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17301 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17302 return nullptr;
17303 }
17304
17305 // Update the users of the node with the new indices
17306 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17307 SDNode *User = Users[i];
17308 if (!User) {
17309 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17310 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17311 if (i || !NoChannels)
17312 continue;
17313 } else {
17314 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17315 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17316 if (NewUser != User) {
17317 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17318 DAG.RemoveDeadNode(User);
17319 }
17320 }
17321
17322 switch (Idx) {
17323 default:
17324 break;
17325 case AMDGPU::sub0:
17326 Idx = AMDGPU::sub1;
17327 break;
17328 case AMDGPU::sub1:
17329 Idx = AMDGPU::sub2;
17330 break;
17331 case AMDGPU::sub2:
17332 Idx = AMDGPU::sub3;
17333 break;
17334 case AMDGPU::sub3:
17335 Idx = AMDGPU::sub4;
17336 break;
17337 }
17338 }
17339
17340 DAG.RemoveDeadNode(Node);
17341 return nullptr;
17342}
17343
17344 static bool isFrameIndexOp(SDValue Op) {
17345 if (Op.getOpcode() == ISD::AssertZext)
17346 Op = Op.getOperand(0);
17347
17348 return isa<FrameIndexSDNode>(Op);
17349}
17350
17351/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17352/// with frame index operands.
17353 /// LLVM assumes that inputs to these instructions are registers.
17354SDNode *
17355 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17356 SelectionDAG &DAG) const {
17357 if (Node->getOpcode() == ISD::CopyToReg) {
17358 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17359 SDValue SrcVal = Node->getOperand(2);
17360
17361 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17362 // to try understanding copies to physical registers.
17363 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17364 SDLoc SL(Node);
17365 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17366 SDValue VReg = DAG.getRegister(
17367 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17368
17369 SDNode *Glued = Node->getGluedNode();
17370 SDValue ToVReg = DAG.getCopyToReg(
17371 Node->getOperand(0), SL, VReg, SrcVal,
17372 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17373 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17374 VReg, ToVReg.getValue(1));
17375 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17376 DAG.RemoveDeadNode(Node);
17377 return ToResultReg.getNode();
17378 }
17379 }
17380
17382 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17383 if (!isFrameIndexOp(Node->getOperand(i))) {
17384 Ops.push_back(Node->getOperand(i));
17385 continue;
17386 }
17387
17388 SDLoc DL(Node);
17389 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17390 Node->getOperand(i).getValueType(),
17391 Node->getOperand(i)),
17392 0));
17393 }
17394
17395 return DAG.UpdateNodeOperands(Node, Ops);
17396}
17397
17398/// Fold the instructions after selecting them.
17399/// Returns null if users were already updated.
17400 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17401 SelectionDAG &DAG) const {
17402 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17403 unsigned Opcode = Node->getMachineOpcode();
17404
17405 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17406 !TII->isGather4(Opcode) &&
17407 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17408 return adjustWritemask(Node, DAG);
17409 }
17410
17411 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17412 legalizeTargetIndependentNode(Node, DAG);
17413 return Node;
17414 }
17415
17416 switch (Opcode) {
17417 case AMDGPU::V_DIV_SCALE_F32_e64:
17418 case AMDGPU::V_DIV_SCALE_F64_e64: {
17419 // Satisfy the operand register constraint when one of the inputs is
17420 // undefined. Ordinarily each undef value will have its own implicit_def of
17421 // a vreg, so force these to use a single register.
17422 SDValue Src0 = Node->getOperand(1);
17423 SDValue Src1 = Node->getOperand(3);
17424 SDValue Src2 = Node->getOperand(5);
17425
17426 if ((Src0.isMachineOpcode() &&
17427 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17428 (Src0 == Src1 || Src0 == Src2))
17429 break;
17430
17431 MVT VT = Src0.getValueType().getSimpleVT();
17432 const TargetRegisterClass *RC =
17433 getRegClassFor(VT, Src0.getNode()->isDivergent());
17434
17436 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17437
17438 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17439 Src0, SDValue());
17440
17441 // src0 must be the same register as src1 or src2, even if the value is
17442 // undefined, so make sure we don't violate this constraint.
17443 if (Src0.isMachineOpcode() &&
17444 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17445 if (Src1.isMachineOpcode() &&
17446 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17447 Src0 = Src1;
17448 else if (Src2.isMachineOpcode() &&
17449 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17450 Src0 = Src2;
17451 else {
17452 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17453 Src0 = UndefReg;
17454 Src1 = UndefReg;
17455 }
17456 } else
17457 break;
17458
17460 Ops[1] = Src0;
17461 Ops[3] = Src1;
17462 Ops[5] = Src2;
17463 Ops.push_back(ImpDef.getValue(1));
17464 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17465 }
17466 default:
17467 break;
17468 }
17469
17470 return Node;
17471}
17472
17473// Any MIMG instructions that use tfe or lwe require an initialization of the
17474// result register that will be written in the case of a memory access failure.
17475// The required code is also added to tie this init code to the result of the
17476// img instruction.
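// Per dword that needs initialization, the code below emits (sketch):
//   %zero  = V_MOV_B32 0
//   %nextN = INSERT_SUBREG %prevN, %zero, subN
// and finally ties the last register to the instruction's vdata result.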
17477 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17478 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17479 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17480 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17481 MachineBasicBlock &MBB = *MI.getParent();
17482
17483 int DstIdx =
17484 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17485 unsigned InitIdx = 0;
17486
17487 if (TII->isImage(MI)) {
17488 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17489 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17490 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17491
17492 if (!TFE && !LWE) // intersect_ray
17493 return;
17494
17495 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17496 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17497 unsigned D16Val = D16 ? D16->getImm() : 0;
17498
17499 if (!TFEVal && !LWEVal)
17500 return;
17501
17502 // At least one of TFE or LWE is non-zero.
17503 // We have to insert a suitable initialization of the result value and
17504 // tie this to the dest of the image instruction.
17505
17506 // Calculate which dword we have to initialize to 0.
17507 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17508
17509 // check that dmask operand is found.
17510 assert(MO_Dmask && "Expected dmask operand in instruction");
17511
17512 unsigned dmask = MO_Dmask->getImm();
17513 // Determine the number of active lanes taking into account the
17514 // Gather4 special case
17515 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17516
17517 bool Packed = !Subtarget->hasUnpackedD16VMem();
17518
17519 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17520
17521 // Abandon attempt if the dst size isn't large enough
17522 // - this is in fact an error but this is picked up elsewhere and
17523 // reported correctly.
17524 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17525
17526 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17527 if (DstSize < InitIdx)
17528 return;
17529 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17530 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17531 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17532 } else {
17533 return;
17534 }
17535
17536 const DebugLoc &DL = MI.getDebugLoc();
17537
17538 // Create a register for the initialization value.
17539 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17540 unsigned NewDst = 0; // Final initialized value will be in here
17541
17542 // If PRTStrictNull feature is enabled (the default) then initialize
17543 // all the result registers to 0, otherwise just the error indication
17544 // register (VGPRn+1)
17545 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17546 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17547
17548 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17549 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17550 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17551 // Initialize dword
17552 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17553 // clang-format off
17554 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17555 .addImm(0);
17556 // clang-format on
17557 // Insert into the super-reg
17558 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17559 .addReg(PrevDst)
17560 .addReg(SubReg)
17561 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17562
17563 PrevDst = NewDst;
17564 }
17565
17566 // Add as an implicit operand
17567 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17568
17569 // Tie the just added implicit operand to the dst
17570 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17571}
17572
17573/// Assign the register class depending on the number of
17574/// bits set in the writemask
17575 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17576 SDNode *Node) const {
17577 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17578
17579 MachineFunction *MF = MI.getMF();
17580 MachineRegisterInfo &MRI = MF->getRegInfo();
17581
17582 if (TII->isVOP3(MI.getOpcode())) {
17583 // Make sure constant bus requirements are respected.
17584 TII->legalizeOperandsVOP3(MRI, MI);
17585
17586 if (TII->isMAI(MI)) {
17587 // The ordinary src0, src1, src2 were legalized above.
17588 //
17589 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17590 // as a separate instruction.
17591 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17592 AMDGPU::OpName::scale_src0);
17593 if (Src0Idx != -1) {
17594 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17595 AMDGPU::OpName::scale_src1);
17596 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17597 TII->usesConstantBus(MRI, MI, Src1Idx))
17598 TII->legalizeOpWithMove(MI, Src1Idx);
17599 }
17600 }
17601
17602 return;
17603 }
17604
17605 if (TII->isImage(MI))
17606 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17607}
17608
17609 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17610 uint64_t Val) {
17611 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17612 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17613}
17614
17615 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17616 const SDLoc &DL,
17617 SDValue Ptr) const {
17618 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17619
17620 // Build the half of the subregister with the constants before building the
17621 // full 128-bit register. If we are building multiple resource descriptors,
17622 // this will allow CSEing of the 2-component register.
17623 const SDValue Ops0[] = {
17624 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17625 buildSMovImm32(DAG, DL, 0),
17626 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17627 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17628 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17629
17630 SDValue SubRegHi = SDValue(
17631 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17632
17633 // Combine the constants and the pointer.
17634 const SDValue Ops1[] = {
17635 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17636 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17637 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17638
17639 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17640}
17641
17642/// Return a resource descriptor with the 'Add TID' bit enabled
17643/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17644/// of the resource descriptor) to create an offset, which is added to
17645/// the resource pointer.
17646 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17647 SDValue Ptr, uint32_t RsrcDword1,
17648 uint64_t RsrcDword2And3) const {
17649 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17650 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17651 if (RsrcDword1) {
17652 PtrHi =
17653 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17654 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17655 0);
17656 }
17657
17658 SDValue DataLo =
17659 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17660 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17661
17662 const SDValue Ops[] = {
17663 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17664 PtrLo,
17665 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17666 PtrHi,
17667 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17668 DataLo,
17669 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17670 DataHi,
17671 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17672
17673 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17674}
17675
17676//===----------------------------------------------------------------------===//
17677// SI Inline Assembly Support
17678//===----------------------------------------------------------------------===//
17679
17680std::pair<unsigned, const TargetRegisterClass *>
17681 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17682 StringRef Constraint,
17683 MVT VT) const {
17684 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17685
17686 const TargetRegisterClass *RC = nullptr;
17687 if (Constraint.size() == 1) {
17688 // Check if we cannot determine the bit size of the given value type. This
17689 // can happen, for example, when we have an empty struct
17690 // (size 0): `call void asm "", "v"({} poison)`.
17691 if (VT == MVT::Other)
17692 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17693 const unsigned BitWidth = VT.getSizeInBits();
17694 switch (Constraint[0]) {
17695 default:
17696 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17697 case 's':
17698 case 'r':
17699 switch (BitWidth) {
17700 case 16:
17701 RC = &AMDGPU::SReg_32RegClass;
17702 break;
17703 case 64:
17704 RC = &AMDGPU::SGPR_64RegClass;
17705 break;
17706 default:
17707 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17708 if (!RC)
17709 return std::pair(0U, nullptr);
17710 break;
17711 }
17712 break;
17713 case 'v':
17714 switch (BitWidth) {
17715 case 1:
17716 return std::pair(0U, nullptr);
17717 case 16:
17718 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17719 : &AMDGPU::VGPR_32_Lo256RegClass;
17720 break;
17721 default:
17722 RC = Subtarget->has1024AddressableVGPRs()
17723 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17724 : TRI->getVGPRClassForBitWidth(BitWidth);
17725 if (!RC)
17726 return std::pair(0U, nullptr);
17727 break;
17728 }
17729 break;
17730 case 'a':
17731 if (!Subtarget->hasMAIInsts())
17732 break;
17733 switch (BitWidth) {
17734 case 1:
17735 return std::pair(0U, nullptr);
17736 case 16:
17737 RC = &AMDGPU::AGPR_32RegClass;
17738 break;
17739 default:
17740 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17741 if (!RC)
17742 return std::pair(0U, nullptr);
17743 break;
17744 }
17745 break;
17746 }
17747 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17748 const unsigned BitWidth = VT.getSizeInBits();
17749 switch (BitWidth) {
17750 case 16:
17751 RC = &AMDGPU::AV_32RegClass;
17752 break;
17753 default:
17754 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17755 if (!RC)
17756 return std::pair(0U, nullptr);
17757 break;
17758 }
17759 }
17760
17761 // We actually support i128, i16 and f16 as inline parameters
17762 // even if they are not reported as legal
17763 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17764 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17765 return std::pair(0U, RC);
17766
17767 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17768 if (Kind != '\0') {
17769 if (Kind == 'v') {
17770 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17771 } else if (Kind == 's') {
17772 RC = &AMDGPU::SGPR_32RegClass;
17773 } else if (Kind == 'a') {
17774 RC = &AMDGPU::AGPR_32RegClass;
17775 }
17776
17777 if (RC) {
17778 if (NumRegs > 1) {
17779 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17780 return std::pair(0U, nullptr);
17781
17782 uint32_t Width = NumRegs * 32;
17783 // Prohibit constraints for register ranges with a width that does not
17784 // match the required type.
17785 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17786 return std::pair(0U, nullptr);
17787
17788 MCRegister Reg = RC->getRegister(Idx);
17789       if (SIRegisterInfo::isVGPRClass(RC))
17790         RC = TRI->getVGPRClassForBitWidth(Width);
17791 else if (SIRegisterInfo::isSGPRClass(RC))
17792 RC = TRI->getSGPRClassForBitWidth(Width);
17793 else if (SIRegisterInfo::isAGPRClass(RC))
17794 RC = TRI->getAGPRClassForBitWidth(Width);
17795 if (RC) {
17796 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17797 if (!Reg) {
17798 // The register class does not contain the requested register,
17799 // e.g., because it is an SGPR pair that would violate alignment
17800 // requirements.
17801 return std::pair(0U, nullptr);
17802 }
17803 return std::pair(Reg, RC);
17804 }
17805 }
17806
17807 // Check for lossy scalar/vector conversions.
17808 if (VT.isVector() && VT.getSizeInBits() != 32)
17809 return std::pair(0U, nullptr);
17810 if (Idx < RC->getNumRegs())
17811 return std::pair(RC->getRegister(Idx), RC);
17812 return std::pair(0U, nullptr);
17813 }
17814 }
17815
17816 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17817 if (Ret.first)
17818 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17819
17820 return Ret;
17821}
17822
17823static bool isImmConstraint(StringRef Constraint) {
17824 if (Constraint.size() == 1) {
17825 switch (Constraint[0]) {
17826 default:
17827 break;
17828 case 'I':
17829 case 'J':
17830 case 'A':
17831 case 'B':
17832 case 'C':
17833 return true;
17834 }
17835 } else if (Constraint == "DA" || Constraint == "DB") {
17836 return true;
17837 }
17838 return false;
17839}
17840
17841 SITargetLowering::ConstraintType
17842 SITargetLowering::getConstraintType(StringRef Constraint) const {
17843   if (Constraint.size() == 1) {
17844 switch (Constraint[0]) {
17845 default:
17846 break;
17847 case 's':
17848 case 'v':
17849 case 'a':
17850 return C_RegisterClass;
17851 }
17852 } else if (Constraint.size() == 2) {
17853 if (Constraint == "VA")
17854 return C_RegisterClass;
17855 }
17856 if (isImmConstraint(Constraint)) {
17857 return C_Other;
17858 }
17859 return TargetLowering::getConstraintType(Constraint);
17860}
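// Hypothetical IR examples (for illustration only, not taken from this file)
// of how the constraints handled above are typically written:
//   %v = call i32 asm "v_mov_b32 $0, $1", "=v,s"(i32 %x)   ; VGPR def, SGPR use
//   %s = call i32 asm "s_mov_b32 $0, $1", "=s,I"(i32 7)    ; 'I' inline immediate
//   %p = call i64 asm sideeffect "", "={v[0:1]}"()         ; physical VGPR pair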
17861
17862static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17863   if (Size < 64) {
17864     Val = Val & maskTrailingOnes<uint64_t>(Size);
17865 }
17866 return Val;
17867}
17868
17869 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17870                                               StringRef Constraint,
17871 std::vector<SDValue> &Ops,
17872 SelectionDAG &DAG) const {
17873 if (isImmConstraint(Constraint)) {
17874 uint64_t Val;
17875 if (getAsmOperandConstVal(Op, Val) &&
17876 checkAsmConstraintVal(Op, Constraint, Val)) {
17877 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17878 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17879 }
17880 } else {
17881     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17882   }
17883}
17884
17885 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17886   unsigned Size = Op.getScalarValueSizeInBits();
17887 if (Size > 64)
17888 return false;
17889
17890 if (Size == 16 && !Subtarget->has16BitInsts())
17891 return false;
17892
17893   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17894     Val = C->getSExtValue();
17895 return true;
17896 }
17897   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17898     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17899 return true;
17900 }
17901   if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17902     if (Size != 16 || Op.getNumOperands() != 2)
17903 return false;
17904 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17905 return false;
17906 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17907 Val = C->getSExtValue();
17908 return true;
17909 }
17910 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17911 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17912 return true;
17913 }
17914 }
17915
17916 return false;
17917}
17918
17919 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17920                                               uint64_t Val) const {
17921 if (Constraint.size() == 1) {
17922 switch (Constraint[0]) {
17923 case 'I':
17924       return AMDGPU::isInlinableIntLiteral(Val);
17925     case 'J':
17926 return isInt<16>(Val);
17927 case 'A':
17928 return checkAsmConstraintValA(Op, Val);
17929 case 'B':
17930 return isInt<32>(Val);
17931 case 'C':
17932 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17933              AMDGPU::isInlinableIntLiteral(Val);
17934     default:
17935 break;
17936 }
17937 } else if (Constraint.size() == 2) {
17938 if (Constraint == "DA") {
17939 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17940 int64_t LoBits = static_cast<int32_t>(Val);
17941 return checkAsmConstraintValA(Op, HiBits, 32) &&
17942 checkAsmConstraintValA(Op, LoBits, 32);
17943 }
17944 if (Constraint == "DB") {
17945 return true;
17946 }
17947 }
17948 llvm_unreachable("Invalid asm constraint");
17949}
17950
17951 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17952                                               unsigned MaxSize) const {
17953 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17954 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17955 if (Size == 16) {
17956 MVT VT = Op.getSimpleValueType();
17957 switch (VT.SimpleTy) {
17958 default:
17959 return false;
17960 case MVT::i16:
17961 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17962 case MVT::f16:
17963 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17964 case MVT::bf16:
17965 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17966 case MVT::v2i16:
17967 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17968 case MVT::v2f16:
17969 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17970 case MVT::v2bf16:
17971 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17972 }
17973 }
17974 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17975 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17976 return true;
17977 return false;
17978}
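// For reference (this summarizes AMDGPUBaseInfo rather than code in this
// file): the inlinable literals are roughly the integers -16..64 and the
// floating-point values +/-0.5, +/-1.0, +/-2.0, +/-4.0, plus 1/(2*pi) when the
// subtarget has the inv-2pi inline immediate; anything else needs a 32-bit
// literal encoding.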
17979
17980static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17981 switch (UnalignedClassID) {
17982 case AMDGPU::VReg_64RegClassID:
17983 return AMDGPU::VReg_64_Align2RegClassID;
17984 case AMDGPU::VReg_96RegClassID:
17985 return AMDGPU::VReg_96_Align2RegClassID;
17986 case AMDGPU::VReg_128RegClassID:
17987 return AMDGPU::VReg_128_Align2RegClassID;
17988 case AMDGPU::VReg_160RegClassID:
17989 return AMDGPU::VReg_160_Align2RegClassID;
17990 case AMDGPU::VReg_192RegClassID:
17991 return AMDGPU::VReg_192_Align2RegClassID;
17992 case AMDGPU::VReg_224RegClassID:
17993 return AMDGPU::VReg_224_Align2RegClassID;
17994 case AMDGPU::VReg_256RegClassID:
17995 return AMDGPU::VReg_256_Align2RegClassID;
17996 case AMDGPU::VReg_288RegClassID:
17997 return AMDGPU::VReg_288_Align2RegClassID;
17998 case AMDGPU::VReg_320RegClassID:
17999 return AMDGPU::VReg_320_Align2RegClassID;
18000 case AMDGPU::VReg_352RegClassID:
18001 return AMDGPU::VReg_352_Align2RegClassID;
18002 case AMDGPU::VReg_384RegClassID:
18003 return AMDGPU::VReg_384_Align2RegClassID;
18004 case AMDGPU::VReg_512RegClassID:
18005 return AMDGPU::VReg_512_Align2RegClassID;
18006 case AMDGPU::VReg_1024RegClassID:
18007 return AMDGPU::VReg_1024_Align2RegClassID;
18008 case AMDGPU::AReg_64RegClassID:
18009 return AMDGPU::AReg_64_Align2RegClassID;
18010 case AMDGPU::AReg_96RegClassID:
18011 return AMDGPU::AReg_96_Align2RegClassID;
18012 case AMDGPU::AReg_128RegClassID:
18013 return AMDGPU::AReg_128_Align2RegClassID;
18014 case AMDGPU::AReg_160RegClassID:
18015 return AMDGPU::AReg_160_Align2RegClassID;
18016 case AMDGPU::AReg_192RegClassID:
18017 return AMDGPU::AReg_192_Align2RegClassID;
18018 case AMDGPU::AReg_256RegClassID:
18019 return AMDGPU::AReg_256_Align2RegClassID;
18020 case AMDGPU::AReg_512RegClassID:
18021 return AMDGPU::AReg_512_Align2RegClassID;
18022 case AMDGPU::AReg_1024RegClassID:
18023 return AMDGPU::AReg_1024_Align2RegClassID;
18024 default:
18025 return -1;
18026 }
18027}
18028
18029// Figure out which registers should be reserved for stack access. Only after
18030// the function is legalized do we know all of the non-spill stack objects or if
18031// calls are present.
18032 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
18033   MachineRegisterInfo &MRI = MF.getRegInfo();
18034   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18035   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18036 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18037 const SIInstrInfo *TII = ST.getInstrInfo();
18038
18039 if (Info->isEntryFunction()) {
18040 // Callable functions have fixed registers used for stack access.
18041     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
18042   }
18043
18044 // TODO: Move this logic to getReservedRegs()
18045 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18046 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18047 Register SReg = ST.isWave32()
18048 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18049 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18050 &AMDGPU::SGPR_64RegClass);
18051 Info->setSGPRForEXECCopy(SReg);
18052
18053 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18054 Info->getStackPtrOffsetReg()));
18055 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18056 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18057
18058 // We need to worry about replacing the default register with itself in case
18059 // of MIR testcases missing the MFI.
18060 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18061 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18062
18063 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18064 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18065
18066 Info->limitOccupancy(MF);
18067
18068 if (ST.isWave32() && !MF.empty()) {
18069 for (auto &MBB : MF) {
18070 for (auto &MI : MBB) {
18071 TII->fixImplicitOperands(MI);
18072 }
18073 }
18074 }
18075
18076 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18077 // classes if required. Ideally the register class constraints would differ
18078 // per-subtarget, but there's no easy way to achieve that right now. This is
18079 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18080 // from using them as the register class for legal types.
18081 if (ST.needsAlignedVGPRs()) {
18082 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18083 const Register Reg = Register::index2VirtReg(I);
18084 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18085 if (!RC)
18086 continue;
18087 int NewClassID = getAlignedAGPRClassID(RC->getID());
18088 if (NewClassID != -1)
18089 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18090 }
18091 }
18092
18093   TargetLoweringBase::finalizeLowering(MF);
18094 }
18095
18096 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18097                                                    KnownBits &Known,
18098 const APInt &DemandedElts,
18099 const SelectionDAG &DAG,
18100 unsigned Depth) const {
18101 Known.resetAll();
18102 unsigned Opc = Op.getOpcode();
18103 switch (Opc) {
18104   case ISD::INTRINSIC_WO_CHAIN: {
18105     unsigned IID = Op.getConstantOperandVal(0);
18106 switch (IID) {
18107 case Intrinsic::amdgcn_mbcnt_lo:
18108 case Intrinsic::amdgcn_mbcnt_hi: {
18109 const GCNSubtarget &ST =
18110           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
18111       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18112 // most 31 + src1.
18113 Known.Zero.setBitsFrom(
18114 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18115 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18116 Known = KnownBits::add(Known, Known2);
18117 return;
18118 }
18119 }
18120 break;
18121 }
18122 }
18123   AMDGPUTargetLowering::computeKnownBitsForTargetNode(
18124       Op, Known, DemandedElts, DAG, Depth);
18125}
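// For example, in wave32 a call such as
//   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
// can only produce values in [0, 31], so bits 5 and above of the result are
// known to be zero; a nonzero src1 is folded in via KnownBits::add above.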
18126
18127 void SITargetLowering::computeKnownBitsForFrameIndex(
18128     const int FI, KnownBits &Known, const MachineFunction &MF) const {
18129   TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
18130
18131 // Set the high bits to zero based on the maximum allowed scratch size per
18132 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18133 // calculation won't overflow, so assume the sign bit is never set.
18134 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18135}
18136
18137 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18138     GISelValueTracking &VT, KnownBits &Known,
18139 unsigned Dim) {
18140 unsigned MaxValue =
18141 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18142 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18143}
18144
18145 static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18146                              KnownBits &Known, const APInt &DemandedElts,
18147 unsigned BFEWidth, bool SExt, unsigned Depth) {
18148   const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18149   const MachineOperand &Src1 = MI.getOperand(2);
18150
18151 unsigned Src1Cst = 0;
18152 if (Src1.isImm()) {
18153 Src1Cst = Src1.getImm();
18154 } else if (Src1.isReg()) {
18155 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18156 if (!Cst)
18157 return;
18158 Src1Cst = Cst->Value.getZExtValue();
18159 } else {
18160 return;
18161 }
18162
18163 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18164 // Width is always [22:16].
18165 const unsigned Offset =
18166 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18167 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18168
18169 if (Width >= BFEWidth) // Ill-formed.
18170 return;
18171
18172 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18173 Depth + 1);
18174
18175 Known = Known.extractBits(Width, Offset);
18176
18177 if (SExt)
18178 Known = Known.sext(BFEWidth);
18179 else
18180 Known = Known.zext(BFEWidth);
18181}
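// Worked example: for S_BFE_U32 with a constant src1 of 0x00080008, Offset is
// 8 (bits [4:0]) and Width is 8 (bits [22:16]), so the known bits of src0's
// bits [15:8] are extracted and zero-extended back to 32 bits.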
18182
18183 void SITargetLowering::computeKnownBitsForTargetInstr(
18184     GISelValueTracking &VT, Register R, KnownBits &Known,
18185 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18186 unsigned Depth) const {
18187 Known.resetAll();
18188 const MachineInstr *MI = MRI.getVRegDef(R);
18189 switch (MI->getOpcode()) {
18190 case AMDGPU::S_BFE_I32:
18191 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18192 /*SExt=*/true, Depth);
18193 case AMDGPU::S_BFE_U32:
18194 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18195 /*SExt=*/false, Depth);
18196 case AMDGPU::S_BFE_I64:
18197 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18198 /*SExt=*/true, Depth);
18199 case AMDGPU::S_BFE_U64:
18200 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18201 /*SExt=*/false, Depth);
18202 case AMDGPU::G_INTRINSIC:
18203 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18204 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18205 switch (IID) {
18206 case Intrinsic::amdgcn_workitem_id_x:
18207 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18208 break;
18209 case Intrinsic::amdgcn_workitem_id_y:
18210 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18211 break;
18212 case Intrinsic::amdgcn_workitem_id_z:
18213 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18214 break;
18215 case Intrinsic::amdgcn_mbcnt_lo:
18216 case Intrinsic::amdgcn_mbcnt_hi: {
18217 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18218 // most 31 + src1.
18219 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18220 ? getSubtarget()->getWavefrontSizeLog2()
18221 : 5);
18222 KnownBits Known2;
18223 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18224 Depth + 1);
18225 Known = KnownBits::add(Known, Known2);
18226 break;
18227 }
18228 case Intrinsic::amdgcn_groupstaticsize: {
18229 // We can report everything over the maximum size as 0. We can't report
18230 // based on the actual size because we don't know if it's accurate or not
18231 // at any given point.
18232 Known.Zero.setHighBits(
18233 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18234 break;
18235 }
18236 }
18237 break;
18238 }
18239 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18240 Known.Zero.setHighBits(24);
18241 break;
18242 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18243 Known.Zero.setHighBits(16);
18244 break;
18245 case AMDGPU::G_AMDGPU_SMED3:
18246 case AMDGPU::G_AMDGPU_UMED3: {
18247 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18248
18249 KnownBits Known2;
18250 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18251 if (Known2.isUnknown())
18252 break;
18253
18254 KnownBits Known1;
18255 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18256 if (Known1.isUnknown())
18257 break;
18258
18259 KnownBits Known0;
18260 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18261 if (Known0.isUnknown())
18262 break;
18263
18264 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18265 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18266 Known.One = Known0.One & Known1.One & Known2.One;
18267 break;
18268 }
18269 }
18270}
18271
18272 Align SITargetLowering::computeKnownAlignForTargetInstr(
18273     GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18274                                         unsigned Depth) const {
18275 const MachineInstr *MI = MRI.getVRegDef(R);
18276 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18277 // FIXME: Can this move to generic code? What about the case where the call
18278 // site specifies a lower alignment?
18279 Intrinsic::ID IID = GI->getIntrinsicID();
18280     LLVMContext &Ctx = MI->getMF()->getFunction().getContext();
18281     AttributeList Attrs =
18282 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18283 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18284 return *RetAlign;
18285 }
18286 return Align(1);
18287}
18288
18289 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18290   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18291   const Align CacheLineAlign = Align(64);
18292
18293 // Pre-GFX10 target did not benefit from loop alignment
18294 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18295 getSubtarget()->hasInstFwdPrefetchBug())
18296 return PrefAlign;
18297
18298 // On GFX10 I$ is 4 x 64 bytes cache lines.
18299 // By default prefetcher keeps one cache line behind and reads two ahead.
18300 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18301 // behind and one ahead.
18302   // Therefore we can benefit from aligning loop headers if the loop fits in
18303   // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
18304   // cache lines and does not need an alignment. Otherwise, if the loop is at
18305   // most 128 bytes we do not need to modify the prefetch settings, and if it
18306   // is at most 192 bytes we need two lines behind.
18307
18308   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18309   const MachineBasicBlock *Header = ML->getHeader();
18310 if (Header->getAlignment() != PrefAlign)
18311 return Header->getAlignment(); // Already processed.
18312
18313 unsigned LoopSize = 0;
18314 for (const MachineBasicBlock *MBB : ML->blocks()) {
18315     // If an inner loop block is aligned, assume on average half of the
18316     // alignment size is added as nops.
18317 if (MBB != Header)
18318 LoopSize += MBB->getAlignment().value() / 2;
18319
18320 for (const MachineInstr &MI : *MBB) {
18321 LoopSize += TII->getInstSizeInBytes(MI);
18322 if (LoopSize > 192)
18323 return PrefAlign;
18324 }
18325 }
18326
18327 if (LoopSize <= 64)
18328 return PrefAlign;
18329
18330 if (LoopSize <= 128)
18331 return CacheLineAlign;
18332
18333 // If any of parent loops is surrounded by prefetch instructions do not
18334 // insert new for inner loop, which would reset parent's settings.
18335 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18336 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18337 auto I = Exit->getFirstNonDebugInstr();
18338 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18339 return CacheLineAlign;
18340 }
18341 }
18342
18343 MachineBasicBlock *Pre = ML->getLoopPreheader();
18344 MachineBasicBlock *Exit = ML->getExitBlock();
18345
18346 if (Pre && Exit) {
18347 auto PreTerm = Pre->getFirstTerminator();
18348 if (PreTerm == Pre->begin() ||
18349 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18350 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18351 .addImm(1); // prefetch 2 lines behind PC
18352
18353 auto ExitHead = Exit->getFirstNonDebugInstr();
18354 if (ExitHead == Exit->end() ||
18355 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18356 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18357 .addImm(2); // prefetch 1 line behind PC
18358 }
18359
18360 return CacheLineAlign;
18361}
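// In summary, as implemented above: loops larger than 192 bytes or no larger
// than 64 bytes keep the default alignment; loops of 65..128 bytes get 64-byte
// alignment; loops of 129..192 bytes additionally get S_INST_PREFETCH mode
// switches in the preheader and exit block (unless a parent loop has already
// placed them).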
18362
18363[[maybe_unused]]
18364static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18365 assert(N->getOpcode() == ISD::CopyFromReg);
18366 do {
18367 // Follow the chain until we find an INLINEASM node.
18368 N = N->getOperand(0).getNode();
18369 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18370 return true;
18371 } while (N->getOpcode() == ISD::CopyFromReg);
18372 return false;
18373}
18374
18375 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18376                                                   FunctionLoweringInfo *FLI,
18377                                                   UniformityInfo *UA) const {
18378 switch (N->getOpcode()) {
18379 case ISD::CopyFromReg: {
18380 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18381 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18382 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18383 Register Reg = R->getReg();
18384
18385 // FIXME: Why does this need to consider isLiveIn?
18386 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18387 return !TRI->isSGPRReg(MRI, Reg);
18388
18389 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18390 return UA->isDivergent(V);
18391
18392     assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18393     return !TRI->isSGPRReg(MRI, Reg);
18394 }
18395 case ISD::LOAD: {
18396 const LoadSDNode *L = cast<LoadSDNode>(N);
18397 unsigned AS = L->getAddressSpace();
18398 // A flat load may access private memory.
18399     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18400   }
18401 case ISD::CALLSEQ_END:
18402 return true;
18403   case ISD::INTRINSIC_WO_CHAIN:
18404     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18405   case ISD::INTRINSIC_W_CHAIN:
18406     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18407 case AMDGPUISD::ATOMIC_CMP_SWAP:
18408 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18409 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18410 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18411 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18412 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18413 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18414 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18415 case AMDGPUISD::BUFFER_ATOMIC_AND:
18416 case AMDGPUISD::BUFFER_ATOMIC_OR:
18417 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18418 case AMDGPUISD::BUFFER_ATOMIC_INC:
18419 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18420 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18421 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18422 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18423 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18424 // Target-specific read-modify-write atomics are sources of divergence.
18425 return true;
18426 default:
18427 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18428 // Generic read-modify-write atomics are sources of divergence.
18429 return A->readMem() && A->writeMem();
18430 }
18431 return false;
18432 }
18433}
18434
18435 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18436                                                EVT VT) const {
18437 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18438   case MVT::f32:
18439     return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18440   case MVT::f64:
18441   case MVT::f16:
18442     return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18443   default:
18444 return false;
18445 }
18446}
18447
18448 bool SITargetLowering::denormalsEnabledForType(
18449     LLT Ty, const MachineFunction &MF) const {
18450 switch (Ty.getScalarSizeInBits()) {
18451 case 32:
18452 return !denormalModeIsFlushAllF32(MF);
18453 case 64:
18454 case 16:
18455 return !denormalModeIsFlushAllF64F16(MF);
18456 default:
18457 return false;
18458 }
18459}
18460
18461 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18462                                             const APInt &DemandedElts,
18463 const SelectionDAG &DAG,
18464 bool SNaN,
18465 unsigned Depth) const {
18466 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18467     const MachineFunction &MF = DAG.getMachineFunction();
18468     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18469
18470 if (Info->getMode().DX10Clamp)
18471 return true; // Clamped to 0.
18472 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18473 }
18474
18475   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18476                                                             DAG, SNaN, Depth);
18477}
18478
18479// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18480// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18482 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18483 return true;
18484
18485 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18486 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18487 if (DenormMode == DenormalMode::getPreserveSign())
18488 return true;
18489
18490 // TODO: Remove this.
18491 return RMW->getFunction()
18492 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18493 .getValueAsBool();
18494}
18495
18496 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18497   LLVMContext &Ctx = RMW->getContext();
18498 StringRef MemScope =
18499 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18500
18501 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18502 << "Hardware instruction generated for atomic "
18503 << RMW->getOperationName(RMW->getOperation())
18504 << " operation at memory scope " << MemScope;
18505}
18506
18507static bool isV2F16OrV2BF16(Type *Ty) {
18508 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18509 Type *EltTy = VT->getElementType();
18510 return VT->getNumElements() == 2 &&
18511 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18512 }
18513
18514 return false;
18515}
18516
18517static bool isV2F16(Type *Ty) {
18518   auto *VT = dyn_cast<FixedVectorType>(Ty);
18519   return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18520}
18521
18522static bool isV2BF16(Type *Ty) {
18523   auto *VT = dyn_cast<FixedVectorType>(Ty);
18524   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18525}
18526
18527/// \return true if atomicrmw integer ops work for the type.
18528static bool isAtomicRMWLegalIntTy(Type *Ty) {
18529 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18530 unsigned BW = IT->getBitWidth();
18531 return BW == 32 || BW == 64;
18532 }
18533
18534 return false;
18535}
18536
18537/// \return true if this atomicrmw xchg type can be selected.
18538static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18539 Type *Ty = RMW->getType();
18540 if (isAtomicRMWLegalIntTy(Ty))
18541 return true;
18542
18543 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18544 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18545 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18546 return BW == 32 || BW == 64;
18547 }
18548
18549 if (Ty->isFloatTy() || Ty->isDoubleTy())
18550 return true;
18551
18552   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18553     return VT->getNumElements() == 2 &&
18554 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18555 }
18556
18557 return false;
18558}
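// For example (illustrative): per the checks above, xchg on i32/i64, on 32- or
// 64-bit pointers, on float/double, and on two-element vectors of 16-bit
// elements such as <2 x half> can all be selected directly.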
18559
18560/// \returns true if it's valid to emit a native instruction for \p RMW, based
18561/// on the properties of the target memory.
18562static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18563 const AtomicRMWInst *RMW,
18564 bool HasSystemScope) {
18565 // The remote/fine-grained access logic is different from the integer
18566 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18567 // fine-grained access does not work, even for a device local allocation.
18568 //
18569 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18570 // allocations work.
18571 if (HasSystemScope) {
18572     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18573         RMW->hasMetadata("amdgpu.no.remote.memory"))
18574 return true;
18575 if (Subtarget.hasEmulatedSystemScopeAtomics())
18576 return true;
18577   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18578       return true;
18579
18580 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18581}
18582
18583 /// \return Action to perform on AtomicRMWInsts for integer operations.
18584 static TargetLowering::AtomicExpansionKind
18585 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18586   return isAtomicRMWLegalIntTy(RMW->getType())
18587              ? TargetLowering::AtomicExpansionKind::None
18588              : TargetLowering::AtomicExpansionKind::CmpXChg;
18589 }
18590
18591/// Return if a flat address space atomicrmw can access private memory.
18592 static bool flatInstrMayAccessPrivate(const Instruction *I) {
18593   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18594 return !MD ||
18596}
18597
18605
18606 TargetLowering::AtomicExpansionKind
18607 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18608   unsigned AS = RMW->getPointerAddressSpace();
18609 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18611
18612 // 64-bit flat atomics that dynamically reside in private memory will silently
18613 // be dropped.
18614 //
18615 // Note that we will emit a new copy of the original atomic in the expansion,
18616 // which will be incrementally relegalized.
18617 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18618 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18619       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18620       flatInstrMayAccessPrivate(RMW))
18621     return AtomicExpansionKind::CustomExpand;
18622
18623 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18624     OptimizationRemarkEmitter ORE(RMW->getFunction());
18625     ORE.emit([=]() {
18626 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18627 });
18628 return Kind;
18629 };
18630
18631 auto SSID = RMW->getSyncScopeID();
18632 bool HasSystemScope =
18633 SSID == SyncScope::System ||
18634 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18635
18636 auto Op = RMW->getOperation();
18637 switch (Op) {
18638   case AtomicRMWInst::Xchg:
18639     // PCIe supports add and xchg for system atomics.
18640     return isAtomicRMWLegalXChgTy(RMW)
18641                ? AtomicExpansionKind::None
18642                : AtomicExpansionKind::CmpXChg;
18643 case AtomicRMWInst::Add:
18644 // PCIe supports add and xchg for system atomics.
18645     return atomicSupportedIfLegalIntType(RMW);
18646   case AtomicRMWInst::Sub:
18647 case AtomicRMWInst::And:
18648 case AtomicRMWInst::Or:
18649 case AtomicRMWInst::Xor:
18650 case AtomicRMWInst::Max:
18651 case AtomicRMWInst::Min:
18652   case AtomicRMWInst::UMax:
18653   case AtomicRMWInst::UMin:
18654   case AtomicRMWInst::UIncWrap:
18655   case AtomicRMWInst::UDecWrap:
18656   case AtomicRMWInst::USubCond:
18657   case AtomicRMWInst::USubSat: {
18658     if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18659       return AtomicExpansionKind::CmpXChg;
18660     if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18661       return AtomicExpansionKind::CmpXChg;
18662     if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
18663       auto *IT = dyn_cast<IntegerType>(RMW->getType());
18664       if (!IT || IT->getBitWidth() != 32)
18665         return AtomicExpansionKind::CmpXChg;
18666 }
18667
18670 if (Subtarget->hasEmulatedSystemScopeAtomics())
18672
18673 // On most subtargets, for atomicrmw operations other than add/xchg,
18674 // whether or not the instructions will behave correctly depends on where
18675 // the address physically resides and what interconnect is used in the
18676     // system configuration. On some targets the instruction will nop,
18677 // and in others synchronization will only occur at degraded device scope.
18678 //
18679 // If the allocation is known local to the device, the instructions should
18680 // work correctly.
18681     if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18682       return atomicSupportedIfLegalIntType(RMW);
18683
18684 // If fine-grained remote memory works at device scope, we don't need to
18685 // do anything.
18686 if (!HasSystemScope &&
18687         Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18688       return atomicSupportedIfLegalIntType(RMW);
18689
18690 // If we are targeting a remote allocated address, it depends what kind of
18691 // allocation the address belongs to.
18692 //
18693 // If the allocation is fine-grained (in host memory, or in PCIe peer
18694 // device memory), the operation will fail depending on the target.
18695 //
18696 // Note fine-grained host memory access does work on APUs or if XGMI is
18697 // used, but we do not know if we are targeting an APU or the system
18698 // configuration from the ISA version/target-cpu.
18699     if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18700       return atomicSupportedIfLegalIntType(RMW);
18701
18702     if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18703         Op == AtomicRMWInst::Xor) {
18704       // Atomic sub/or/xor do not work over PCI express, but atomic add
18705 // does. InstCombine transforms these with 0 to or, so undo that.
18706 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18707           ConstVal && ConstVal->isNullValue())
18708         return AtomicExpansionKind::Expand;
18709 }
18710
18711 // If the allocation could be in remote, fine-grained memory, the rmw
18712 // instructions may fail. cmpxchg should work, so emit that. On some
18713 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18714 // even work, so you're out of luck anyway.
18715
18716 // In summary:
18717 //
18718 // Cases that may fail:
18719 // - fine-grained pinned host memory
18720 // - fine-grained migratable host memory
18721 // - fine-grained PCIe peer device
18722 //
18723 // Cases that should work, but may be treated overly conservatively.
18724 // - fine-grained host memory on an APU
18725 // - fine-grained XGMI peer device
18726       return AtomicExpansionKind::CmpXChg;
18727     }
18728
18729     return atomicSupportedIfLegalIntType(RMW);
18730   }
18731 case AtomicRMWInst::FAdd: {
18732 Type *Ty = RMW->getType();
18733
18734 // TODO: Handle REGION_ADDRESS
18735 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18736 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18737 // is fixed to round-to-nearest-even.
18738 //
18739 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18740 // round-to-nearest-even.
18741 //
18742 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18743 // suggests it is OK if the floating-point mode may not match the calling
18744 // thread.
18745 if (Ty->isFloatTy()) {
18746       return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18747                                                : AtomicExpansionKind::CmpXChg;
18748 }
18749
18750 if (Ty->isDoubleTy()) {
18751 // Ignores denormal mode, but we don't consider flushing mandatory.
18752       return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18753                                                : AtomicExpansionKind::CmpXChg;
18754 }
18755
18756       if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18757         return AtomicExpansionKind::None;
18758
18759       return AtomicExpansionKind::CmpXChg;
18760     }
18761
18762 // LDS atomics respect the denormal mode from the mode register.
18763 //
18764 // Traditionally f32 global/buffer memory atomics would unconditionally
18765 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18766 // flush.
18767 //
18768 // On targets with flat atomic fadd, denormals would flush depending on
18769 // whether the target address resides in LDS or global memory. We consider
18770 // this flat-maybe-flush as will-flush.
18771 if (Ty->isFloatTy() &&
18772 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18775
18776 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18777 // safe. The message phrasing also should be better.
18778 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18779 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18780 // gfx942, gfx12
18781 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18782 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18783 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18784 // gfx90a, gfx942, gfx12
18785 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18786 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18787
18788 // gfx942, gfx12
18789 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18790 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18791 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18792 // gfx90a, gfx942, gfx12
18793 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18794 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18795
18796 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18797 // buffer. gfx12 does have the buffer version.
18798 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18799 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18800 }
18801
18802 // global and flat atomic fadd f64: gfx90a, gfx942.
18803 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18804 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18805
18806 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18807 if (Ty->isFloatTy()) {
18808 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18809 // gfx11+.
18810 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18811 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18812 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18813 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18814 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18815 } else {
18816 // gfx908
18817 if (RMW->use_empty() &&
18818 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18819 isV2F16(Ty))
18820 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18821 }
18822 }
18823
18824 // flat atomic fadd f32: gfx942, gfx11+.
18825 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18826 if (Subtarget->hasFlatAtomicFaddF32Inst())
18827 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18828
18829 // If it is in flat address space, and the type is float, we will try to
18830 // expand it, if the target supports global and lds atomic fadd. The
18831 // reason we need that is, in the expansion, we emit the check of
18832 // address space. If it is in global address space, we emit the global
18833 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18834 // fadd.
18835 if (Subtarget->hasLDSFPAtomicAddF32()) {
18836 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18838 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18840 }
18841 }
18842 }
18843
18844     return AtomicExpansionKind::CmpXChg;
18845   }
18846   case AtomicRMWInst::FMin:
18847 case AtomicRMWInst::FMax: {
18848 Type *Ty = RMW->getType();
18849
18850 // LDS float and double fmin/fmax were always supported.
18851 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18852       return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18853                                                  : AtomicExpansionKind::CmpXChg;
18854 }
18855
18856 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18857 // For flat and global cases:
18858 // float, double in gfx7. Manual claims denormal support.
18859 // Removed in gfx8.
18860 // float, double restored in gfx10.
18861 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18862 //
18863 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18864 // no f32.
18865 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18866 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18867 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18868 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18869 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18870     } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18871                AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18872 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18873 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18874 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18875 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18876 }
18877 }
18878
18879     return AtomicExpansionKind::CmpXChg;
18880   }
18881   case AtomicRMWInst::Nand:
18882   case AtomicRMWInst::FSub:
18883   default:
18884     return AtomicExpansionKind::CmpXChg;
18885   }
18886
18887 llvm_unreachable("covered atomicrmw op switch");
18888}
18889
18896
18903
18906 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18907 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18909
18910   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18911     return AtomicExpansionKind::None;
18912
18913 const DataLayout &DL = CmpX->getDataLayout();
18914
18915 Type *ValTy = CmpX->getNewValOperand()->getType();
18916
18917 // If a 64-bit flat atomic may alias private, we need to avoid using the
18918 // atomic in the private case.
18919 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18921}
18922
18923const TargetRegisterClass *
18924SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18925   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18926   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18927 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18928 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18929 : &AMDGPU::SReg_32RegClass;
18930 if (!TRI->isSGPRClass(RC) && !isDivergent)
18931 return TRI->getEquivalentSGPRClass(RC);
18932 if (TRI->isSGPRClass(RC) && isDivergent) {
18933 if (Subtarget->hasGFX90AInsts())
18934 return TRI->getEquivalentAVClass(RC);
18935 return TRI->getEquivalentVGPRClass(RC);
18936 }
18937
18938 return RC;
18939}
18940
18941// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18942// uniform values (as produced by the mask results of control flow intrinsics)
18943// used outside of divergent blocks. The phi users need to also be treated as
18944// always uniform.
18945//
18946// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18947static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18948 unsigned WaveSize) {
18949 // FIXME: We assume we never cast the mask results of a control flow
18950 // intrinsic.
18951 // Early exit if the type won't be consistent as a compile time hack.
18952 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18953 if (!IT || IT->getBitWidth() != WaveSize)
18954 return false;
18955
18956 if (!isa<Instruction>(V))
18957 return false;
18958 if (!Visited.insert(V).second)
18959 return false;
18960 bool Result = false;
18961 for (const auto *U : V->users()) {
18962     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18963       if (V == U->getOperand(1)) {
18964 switch (Intrinsic->getIntrinsicID()) {
18965 default:
18966 Result = false;
18967 break;
18968 case Intrinsic::amdgcn_if_break:
18969 case Intrinsic::amdgcn_if:
18970 case Intrinsic::amdgcn_else:
18971 Result = true;
18972 break;
18973 }
18974 }
18975 if (V == U->getOperand(0)) {
18976 switch (Intrinsic->getIntrinsicID()) {
18977 default:
18978 Result = false;
18979 break;
18980 case Intrinsic::amdgcn_end_cf:
18981 case Intrinsic::amdgcn_loop:
18982 Result = true;
18983 break;
18984 }
18985 }
18986 } else {
18987 Result = hasCFUser(U, Visited, WaveSize);
18988 }
18989 if (Result)
18990 break;
18991 }
18992 return Result;
18993}
18994
18995 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18996                                                   const Value *V) const {
18997 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18998 if (CI->isInlineAsm()) {
18999 // FIXME: This cannot give a correct answer. This should only trigger in
19000 // the case where inline asm returns mixed SGPR and VGPR results, used
19001 // outside the defining block. We don't have a specific result to
19002 // consider, so this assumes if any value is SGPR, the overall register
19003 // also needs to be SGPR.
19004 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19005       TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19006           MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19007 for (auto &TC : TargetConstraints) {
19008 if (TC.Type == InlineAsm::isOutput) {
19009           ComputeConstraintToUse(TC, SDValue());
19010           const TargetRegisterClass *RC =
19011 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19012 TC.ConstraintVT)
19013 .second;
19014 if (RC && SIRI->isSGPRClass(RC))
19015 return true;
19016 }
19017 }
19018 }
19019 }
19020   SmallPtrSet<const Value *, 16> Visited;
19021   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19022}
19023
19024 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19025   for (SDUse &Use : N->uses()) {
19026     if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
19027 if (getBasePtrIndex(M) == Use.getOperandNo())
19028 return true;
19029 }
19030 }
19031 return false;
19032}
19033
19034 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19035                                            SDValue N1) const {
19036 if (!N0.hasOneUse())
19037 return false;
19038 // Take care of the opportunity to keep N0 uniform
19039 if (N0->isDivergent() || !N1->isDivergent())
19040 return true;
19041 // Check if we have a good chance to form the memory access pattern with the
19042 // base and offset
19043   return (DAG.isBaseWithConstantOffset(N0) &&
19044           hasMemSDNodeUser(*N0->user_begin()));
19045}
19046
19047 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19048     Register N0, Register N1) const {
19049 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19050}
19051
19052 MachineMemOperand::Flags
19053 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19054   // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19055   MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19056 if (I.getMetadata("amdgpu.noclobber"))
19057 Flags |= MONoClobber;
19058 if (I.getMetadata("amdgpu.last.use"))
19059 Flags |= MOLastUse;
19060 return Flags;
19061}
19062
19063 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
19064                                                        Instruction *AI) const {
19065 // Given: atomicrmw fadd ptr %addr, float %val ordering
19066 //
19067 // With this expansion we produce the following code:
19068 // [...]
19069 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19070 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19071 //
19072 // atomicrmw.shared:
19073 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19074 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19075 // float %val ordering
19076 // br label %atomicrmw.phi
19077 //
19078 // atomicrmw.check.private:
19079 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19080 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19081 //
19082 // atomicrmw.private:
19083 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19084 // %loaded.private = load float, ptr addrspace(5) %cast.private
19085 // %val.new = fadd float %loaded.private, %val
19086 // store float %val.new, ptr addrspace(5) %cast.private
19087 // br label %atomicrmw.phi
19088 //
19089 // atomicrmw.global:
19090 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19091 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19092 // float %val ordering
19093 // br label %atomicrmw.phi
19094 //
19095 // atomicrmw.phi:
19096 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19097 // [ %loaded.private, %atomicrmw.private ],
19098 // [ %loaded.global, %atomicrmw.global ]
19099 // br label %atomicrmw.end
19100 //
19101 // atomicrmw.end:
19102 // [...]
19103 //
19104 //
19105 // For 64-bit atomics which may reside in private memory, we perform a simpler
19106 // version that only inserts the private check, and uses the flat operation.
19107
19108 IRBuilder<> Builder(AI);
19109 LLVMContext &Ctx = Builder.getContext();
19110
19111 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19112 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19113                                 : AtomicCmpXchgInst::getPointerOperandIndex();
19114   Value *Addr = AI->getOperand(PtrOpIdx);
19115
19116 /// TODO: Only need to check private, then emit flat-known-not private (no
19117   /// need for shared block, or cast to global).
19118   auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
19119
19120 Align Alignment;
19121 if (RMW)
19122 Alignment = RMW->getAlign();
19123 else if (CX)
19124 Alignment = CX->getAlign();
19125 else
19126 llvm_unreachable("unhandled atomic operation");
19127
19128 // FullFlatEmulation is true if we need to issue the private, shared, and
19129 // global cases.
19130 //
19131 // If this is false, we are only dealing with the flat-targeting-private case,
19132 // where we only insert a check for private and still use the flat instruction
19133 // for global and shared.
19134
19135 bool FullFlatEmulation =
19136 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19137 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19138 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19139 RMW->getType()->isDoubleTy()));
19140
19141 // If the return value isn't used, do not introduce a false use in the phi.
19142 bool ReturnValueIsUsed = !AI->use_empty();
19143
19144 BasicBlock *BB = Builder.GetInsertBlock();
19145 Function *F = BB->getParent();
19146 BasicBlock *ExitBB =
19147 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19148 BasicBlock *SharedBB = nullptr;
19149
19150 BasicBlock *CheckPrivateBB = BB;
19151 if (FullFlatEmulation) {
19152 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19153 CheckPrivateBB =
19154 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19155 }
19156
19157 BasicBlock *PrivateBB =
19158 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19159 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19160 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19161
19162 std::prev(BB->end())->eraseFromParent();
19163 Builder.SetInsertPoint(BB);
19164
19165 Value *LoadedShared = nullptr;
19166 if (FullFlatEmulation) {
19167 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19168 {Addr}, nullptr, "is.shared");
19169 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19170 Builder.SetInsertPoint(SharedBB);
19171     Value *CastToLocal = Builder.CreateAddrSpaceCast(
19172         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19173
19174 Instruction *Clone = AI->clone();
19175 Clone->insertInto(SharedBB, SharedBB->end());
19176 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19177 LoadedShared = Clone;
19178
19179 Builder.CreateBr(PhiBB);
19180 Builder.SetInsertPoint(CheckPrivateBB);
19181 }
19182
19183 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19184 {Addr}, nullptr, "is.private");
19185 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19186
19187 Builder.SetInsertPoint(PrivateBB);
19188
19189   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19190       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19191
19192 Value *LoadedPrivate;
19193 if (RMW) {
19194 LoadedPrivate = Builder.CreateAlignedLoad(
19195 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19196
19197 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19198 LoadedPrivate, RMW->getValOperand());
19199
19200 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19201 } else {
19202 auto [ResultLoad, Equal] =
19203 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19204 CX->getNewValOperand(), CX->getAlign());
19205
19206 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19207 ResultLoad, 0);
19208 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19209 }
19210
19211 Builder.CreateBr(PhiBB);
19212
19213 Builder.SetInsertPoint(GlobalBB);
19214
19215 // Continue using a flat instruction if we only emitted the check for private.
19216 Instruction *LoadedGlobal = AI;
19217 if (FullFlatEmulation) {
19218     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19219         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19220 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19221 }
19222
19223 AI->removeFromParent();
19224 AI->insertInto(GlobalBB, GlobalBB->end());
19225
19226 // The new atomicrmw may go through another round of legalization later.
19227 if (!FullFlatEmulation) {
19228 // We inserted the runtime check already, make sure we do not try to
19229 // re-expand this.
19230 // TODO: Should union with any existing metadata.
19231 MDBuilder MDB(F->getContext());
19232     MDNode *RangeNotPrivate =
19233         MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19234                         APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19235 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19236 RangeNotPrivate);
19237 }
19238
19239 Builder.CreateBr(PhiBB);
19240
19241 Builder.SetInsertPoint(PhiBB);
19242
19243 if (ReturnValueIsUsed) {
19244 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19245 AI->replaceAllUsesWith(Loaded);
19246 if (FullFlatEmulation)
19247 Loaded->addIncoming(LoadedShared, SharedBB);
19248 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19249 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19250 Loaded->takeName(AI);
19251 }
19252
19253 Builder.CreateBr(ExitBB);
19254}
19255
19256 static void convertScratchAtomicToFlatAtomic(Instruction *I,
19257                                              unsigned PtrOpIdx) {
19258 Value *PtrOp = I->getOperand(PtrOpIdx);
19261
19262 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19263 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19264 I->getIterator());
19265 I->setOperand(PtrOpIdx, ASCast);
19266}
19267
19270
19273
19276 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19277 ConstVal && ConstVal->isNullValue()) {
19278       // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19279       AI->setOperation(AtomicRMWInst::Add);
19280
19281 // We may still need the private-alias-flat handling below.
19282
19283 // TODO: Skip this for cases where we cannot access remote memory.
19284 }
19285 }
19286
19287 // The non-flat expansions should only perform the de-canonicalization of
19288 // identity values.
19290 return;
19291
19292   emitExpandAtomicAddrSpacePredicate(AI);
19293 }
19294
19301
19305
19306   llvm_unreachable(
19307          "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19308}
19309
19311 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19312 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19313
19314   llvm_unreachable(
19315          "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19316}
19317
19318LoadInst *
19319 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19320   IRBuilder<> Builder(AI);
19321 auto Order = AI->getOrdering();
19322
19323   // The optimization removes the store aspect of the atomicrmw. Therefore, the
19324   // cache must be flushed if the atomic ordering had release semantics. That is
19325   // not necessarily a fence; a release fence just happens to perform that
19326   // flush. Avoid replacing an atomicrmw that has release semantics.
19327 if (isReleaseOrStronger(Order))
19328 return nullptr;
19329
19330 LoadInst *LI = Builder.CreateAlignedLoad(
19331 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19332 LI->setAtomic(Order, AI->getSyncScopeID());
19333 LI->copyMetadata(*AI);
19334 LI->takeName(AI);
19335 AI->replaceAllUsesWith(LI);
19336 AI->eraseFromParent();
19337 return LI;
19338}
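// For example (illustrative, assuming a 4-byte aligned operand): an idempotent
// RMW such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// can be rewritten by the hook above into
//   %old = load atomic i32, ptr %p acquire, align 4
// Release-or-stronger orderings are left alone, since the code above bails out
// on them.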
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1102
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6053
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1479
bool isNegative() const
Definition APFloat.h:1431
bool isNormal() const
Definition APFloat.h:1435
APInt bitcastToAPInt() const
Definition APFloat.h:1335
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1061
bool isInfinity() const
Definition APFloat.h:1428
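The APFloat factories listed above (getInf, getZero, getQNaN, getLargest) together with convert and bitcastToAPInt are the usual way FP constants are built and re-encoded during lowering. A minimal illustrative sketch, not code from this file; the helper names are made up:

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

// Bit pattern of +infinity in IEEE half precision (0x7C00).
APInt f16InfBits() {
  APFloat Inf = APFloat::getInf(APFloat::IEEEhalf(), /*Negative=*/false);
  return Inf.bitcastToAPInt();
}

// Widen an f16 value to f32; every f16 value is exactly representable in
// f32, so LosesInfo stays false.
APFloat widenHalfToFloat(APFloat Val) {
  bool LosesInfo = false;
  APFloat::opStatus St =
      Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  (void)St;
  return Val;
}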
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
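A small sketch of the APInt bit-block helpers listed above; the widths and masks are illustrative values only, not code from this file:

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintMaskExamples() {
  // Top two bits of a 32-bit value: 0xC0000000.
  APInt HighTwo = APInt::getHighBitsSet(32, 2);

  // Bits [8, 16): 0x0000FF00, which has 8 trailing zero bits.
  APInt Window = APInt::getBitsSet(32, 8, 16);
  unsigned TZ = Window.countr_zero(); // 8

  APInt K(32, 0);
  K.setBitsFrom(16); // set bits [16, 32)
  K.setHighBits(4);  // set the top 4 bits (already set here)
  (void)HighTwo; (void)TZ; (void)K;
}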
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory location accessed by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory location accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v); usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment by one, up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement by one, until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
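The BinOp list above gives the memory semantics of each atomicrmw flavor (for example, FAdd is "*p = old + v"). A hedged sketch of emitting one through IRBuilder; the helper name buildAtomicFAdd and the monotonic ordering are illustrative choices, not this file's lowering:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Emits "atomicrmw fadd ptr %Ptr, <ty> %Val monotonic" and returns the old value.
Value *buildAtomicFAdd(IRBuilder<> &B, Value *Ptr, Value *Val) {
  return B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
                           AtomicOrdering::Monotonic, SyncScope::System);
}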
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents the known origin of an individual byte in a combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
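A sketch of how the CCState allocation API above fits together when assigning a single argument: take the first free register from a candidate list, otherwise reserve a stack slot. Illustrative only; the candidate set and the 4-byte slot size are assumptions, not this file's calling-convention logic:

#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Returns -1 if a register was assigned, otherwise the byte offset of the
// allocated stack slot.
int64_t allocateRegOrStack(CCState &CCInfo, ArrayRef<MCPhysReg> Candidates,
                           MCRegister &RegOut) {
  unsigned Idx = CCInfo.getFirstUnallocated(Candidates);
  if (Idx != Candidates.size()) {
    RegOut = CCInfo.AllocateReg(Candidates[Idx]); // marks the reg (and aliases) used
    return -1;
  }
  RegOut = MCRegister();
  return CCInfo.AllocateStack(/*Size=*/4, Align(4));
}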
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
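A tiny sketch of the DataLayout queries above: getTypeAllocSize already includes tail padding, so it is the per-element stride of an array. The helpers are illustrative, not code from this file:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

uint64_t arrayBytes(const DataLayout &DL, Type *EltTy, uint64_t NumElts) {
  // Number of bytes NumElts consecutive elements occupy in memory.
  return NumElts * DL.getTypeAllocSize(EltTy).getFixedValue();
}

Align eltAlign(const DataLayout &DL, Type *EltTy) {
  return DL.getABITypeAlign(EltTy); // minimum ABI-required alignment
}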
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
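A sketch of the low-level type (LLT) constructors and queries listed above; the address space and bit widths are arbitrary illustrative choices:

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

void lltExamples() {
  LLT S32 = LLT::scalar(32);            // plain 32-bit scalar
  LLT P1 = LLT::pointer(1, 64);         // 64-bit pointer in addrspace(1)
  LLT V2S32 = LLT::fixed_vector(2, 32); // <2 x s32>
  (void)S32.getSizeInBits();            // 32
  (void)P1.getSizeInBits();             // 64
  (void)V2S32.changeElementSize(16);    // <2 x s16>
}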
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
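MDBuilder::createRange above builds the [Lo, Hi) metadata that bounded loads (hardware-ID reads, for instance) can carry. A hedged sketch; the 32-bit width, the zero lower bound, and the helper name are illustrative assumptions:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

void annotateRange(LoadInst &LI, uint64_t UpperExclusive) {
  MDBuilder MDB(LI.getContext());
  // Result of the load is known to be in [0, UpperExclusive).
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, UpperExclusive));
  LI.setMetadata(LLVMContext::MD_range, Range);
}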
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
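A quick sketch of the MVT queries listed above, using v4f32 as an illustrative type (not code from this file):

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

void mvtExamples() {
  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);
  (void)V4F32.isVector();             // true
  (void)V4F32.getVectorNumElements(); // 4
  (void)V4F32.getScalarType();        // MVT::f32
  (void)V4F32.getSizeInBits();        // 128 bits
  (void)V4F32.getStoreSize();         // 16 bytes
  (void)MVT::getIntegerVT(16);        // MVT::i16
}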
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
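The MachineInstrBuilder helpers above are normally chained off BuildMI. A generic, hedged sketch; the instruction description and operand layout are placeholders rather than anything specific to this file:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

void buildTwoOperandInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                          const DebugLoc &DL, const MCInstrDesc &Desc,
                          Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, I, DL, Desc, Dst) // Dst is added as a def operand
      .addReg(Src)               // use of Src
      .addImm(Imm);              // immediate operand
}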
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
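The MachineMemOperand flags above combine as a bitmask and feed MachineFunction::getMachineMemOperand (listed earlier). A hedged sketch with a made-up address space and a 32-bit access size:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF, unsigned AddrSpace) {
  // A dereferenceable, invariant 4-byte load in the given address space.
  auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(MachinePointerInfo(AddrSpace), Flags,
                                 LLT::scalar(32), Align(4));
}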
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a zero-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
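For illustration only, a minimal sketch of how getMergeValues is typically used; the helper name is hypothetical, and a SelectionDAG &DAG plus SDLoc DL are assumed to be in scope, as inside a lowering routine.
// Pack a computed value and its output chain into one node with two results.
static SDValue packResultAndChain(SelectionDAG &DAG, const SDLoc &DL,
                                  SDValue Result, SDValue Chain) {
  return DAG.getMergeValues({Result, Chain}, DL);
}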
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
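Illustrative sketch (hypothetical helper, assumed SelectionDAG/SDLoc context): building an i1 "is zero" test with getSetCC instead of assembling the SETCC node by hand.
// Compare X against zero of its own type and return the i1 result.
static SDValue buildIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  SDValue Zero = DAG.getConstant(0, DL, X.getValueType());
  return DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETEQ);
}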
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
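A minimal, hypothetical sketch of the simplest getLoad overload, assuming the usual lowering context (SelectionDAG &DAG, SDLoc DL, an input chain, and a pointer).
// Plain i32 load; result 0 is the loaded value, result 1 is the output chain.
static SDValue buildWordLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                             SDValue Ptr) {
  return DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
}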
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
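For illustration, a small hypothetical sketch assembling a fixed two-element vector with getBuildVector (assumed SelectionDAG/SDLoc context).
// Build the constant vector <1, 2> of type v2i32.
static SDValue buildPairConstant(SelectionDAG &DAG, const SDLoc &DL) {
  SDValue Lo = DAG.getConstant(1, DL, MVT::i32);
  SDValue Hi = DAG.getConstant(2, DL, MVT::i32);
  return DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
}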
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
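Hypothetical sketch of getMemBasePlusOffset, assuming a base-pointer SDValue in the usual lowering context.
// Address of the second 8-byte half of a 16-byte object: BasePtr + 8.
static SDValue buildHighHalfAddr(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue BasePtr) {
  return DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(8), DL);
}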
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
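Illustrative only: a hypothetical helper emitting a store with an explicit alignment via getStore (assumed SelectionDAG/SDLoc context).
// Store Val to Ptr with 4-byte alignment; the returned value is the new chain.
static SDValue buildAlignedStore(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue Chain, SDValue Val, SDValue Ptr) {
  return DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo(), Align(4));
}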
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
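For illustration, a hypothetical use of getSelectCC in the assumed lowering context.
// Signed minimum of A and B expressed as a single SELECT_CC node.
static SDValue buildSMin(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                         SDValue B) {
  return DAG.getSelectCC(DL, A, B, A, B, ISD::SETLT);
}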
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
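A one-line hypothetical sketch of getSplatBuildVector in the usual lowering context.
// Broadcast Scalar into every lane of a v4f32 BUILD_VECTOR.
static SDValue buildSplat4xF32(SelectionDAG &DAG, const SDLoc &DL,
                               SDValue Scalar) {
  return DAG.getSplatBuildVector(MVT::v4f32, DL, Scalar);
}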
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
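Hypothetical sketch: getZExtOrTrunc picks the right conversion automatically, which keeps call sites free of explicit width checks (assumed SelectionDAG/SDLoc context).
// Zero-extends values narrower than i32, truncates wider ones, and is a
// no-op when V is already i32.
static SDValue toI32(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  return DAG.getZExtOrTrunc(V, DL, MVT::i32);
}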
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
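Illustrative sketch (hypothetical helper): querying value tracking before, for example, narrowing an operation.
// True if the low 16 bits of a 32-bit Op are provably zero.
static bool low16BitsAreZero(const SelectionDAG &DAG, SDValue Op) {
  return DAG.MaskedValueIsZero(Op, APInt::getLowBitsSet(32, 16));
}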
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
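A hypothetical sketch of SplitScalar in the assumed lowering context.
// Split a 64-bit scalar into its low and high i32 halves via EXTRACT_ELEMENT.
static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &DL,
                                            SDValue V64) {
  return DAG.SplitScalar(V64, DL, MVT::i32, MVT::i32);
}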
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
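For illustration, the kind of calls that appear in a TargetLowering constructor; the specific operation/type/action pairings below are hypothetical examples, not the choices made by this file.
// Inside a TargetLowering subclass constructor:
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);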
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, only the DemandedBits bits of its result are assumed to be used downstream.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:423
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:991
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:981
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:963
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:966
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:985
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
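Hypothetical sketch: probing for an existing intrinsic declaration without creating one as a side effect (the chosen intrinsic is only an example).
// True if the module already declares llvm.amdgcn.workitem.id.x.
static bool declaresWorkitemIdX(const Module &M) {
  return Intrinsic::getDeclarationIfExists(&M,
                                           Intrinsic::amdgcn_workitem_id_x) !=
         nullptr;
}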
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2157
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
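Worked example (illustrative values only) for the arithmetic helpers listed above.
static void alignmentExample() {
  unsigned Groups = llvm::divideCeil(70u, 32u);         // 70 items in 32-wide groups -> 3
  uint64_t Padded = llvm::alignTo(70, llvm::Align(32)); // 70 bytes rounded up -> 96
  (void)Groups;
  (void)Padded;
}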
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
constexpr bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1748
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:434
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1779
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1918
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
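Hypothetical sketch of the EVT factory functions above, assuming an LLVMContext &Ctx is available.
static void makeExtendedTypes(LLVMContext &Ctx) {
  EVT I48 = EVT::getIntegerVT(Ctx, 48);      // extended (non-simple) integer type
  EVT V4I48 = EVT::getVectorVT(Ctx, I48, 4); // <4 x i48>
  (void)I48;
  (void)V4I48;
}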
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
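Illustrative sketch combining the KnownBits operations listed above; the helper is hypothetical.
// Propagate known bits through an addition, then ask how many leading zeros
// are still guaranteed.
static unsigned knownLeadingZerosOfSum(const KnownBits &L, const KnownBits &R) {
  KnownBits Sum = KnownBits::add(L, R);
  return Sum.countMinLeadingZeros();
}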
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs