LLVM 23.0.0git
SIISelLowering.cpp
Go to the documentation of this file.
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
72 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
73}
74
77 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
78}
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, not operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
206
207 // We need to custom lower vector stores from local memory
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
232 ISD::SETCC}) {
233 setOperationAction(Opc, MVT::bf16, Promote);
234 }
235
237
239 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
240
244
245 // We only need to custom lower because we can't specify an action for bf16
246 // sources.
249 }
250
251 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
252 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
253 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
254 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
255 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
256 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
257 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
258 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
261 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
262 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
263 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
264 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
265 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
266 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
267
268 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
269 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
270 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
271 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
272 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
274 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
275
276 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
277 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
278
282 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
283
284 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
285
287 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
288
290 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
291 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
292
294 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
295 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
296 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
297 Expand);
299 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
300 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
301 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
302 Expand);
303
305 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
306 MVT::v3i16, MVT::v4i16, MVT::Other},
307 Custom);
308
311 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
312
314
316
318 Expand);
319
320#if 0
322#endif
323
324 // We only support LOAD/STORE and vector manipulation ops for vectors
325 // with > 4 elements.
326 for (MVT VT :
327 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
328 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
329 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
330 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
331 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
332 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
333 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
334 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
335 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
336 switch (Op) {
337 case ISD::LOAD:
338 case ISD::STORE:
340 case ISD::BITCAST:
341 case ISD::UNDEF:
345 case ISD::IS_FPCLASS:
346 break;
351 break;
352 default:
354 break;
355 }
356 }
357 }
358
360
361 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
362 // is expanded to avoid having two separate loops in case the index is a VGPR.
363
364 // Most operations are naturally 32-bit vector operations. We only support
365 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
366 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
378 }
379
380 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
392 }
393
394 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
406 }
407
408 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
420 }
421
422 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
424 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
425
427 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
428
430 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
431
433 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
434 }
435
437 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
438 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
439 Custom);
440
441 if (Subtarget->hasPkMovB32()) {
442 // TODO: 16-bit element vectors should be legal with even aligned elements.
443 // TODO: Can be legal with wider source types than the result with
444 // subregister extracts.
445 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
446 }
447
449 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
450 // instead lower to cndmask in SITargetLowering::LowerSELECT().
452 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
453 // alignbit.
454 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
455
456 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
457 Custom);
458
459 // Avoid stack access for these.
460 // TODO: Generalize to more vector types.
462 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
463 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
464 Custom);
465
466 // Deal with vec3 vector operations when widened to vec4.
468 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
469
470 // Deal with vec5/6/7 vector operations when widened to vec8.
472 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
473 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
474 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
475 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
476 Custom);
477
478 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
479 // and output demarshalling
480 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
481
482 // We can't return success/failure, only the old value,
483 // let LLVM add the comparison
485 Expand);
486
487 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
488
489 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
490
491 // FIXME: This should be narrowed to i32, but that only happens if i64 is
492 // illegal.
493 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
494 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
495
496 // On SI this is s_memtime and s_memrealtime on VI.
498
499 if (Subtarget->hasSMemRealTime() ||
500 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
503
504 if (Subtarget->has16BitInsts()) {
507 } else {
509 }
510
511 if (Subtarget->hasMadMacF32Insts())
513
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
526
527 // Clamp modifier on add/sub
528 if (Subtarget->hasIntClamp())
530
531 if (Subtarget->hasAddNoCarry())
532 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
533 Legal);
534
537 {MVT::f32, MVT::f64}, Custom);
538
539 // These are really only legal for ieee_mode functions. We should be avoiding
540 // them for functions that don't have ieee_mode enabled, so just say they are
541 // legal.
543 {MVT::f32, MVT::f64}, Legal);
544
545 if (Subtarget->haveRoundOpsF64())
547 Legal);
548 else
550 MVT::f64, Custom);
551
553 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
554 Legal);
555 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
556
559
560 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
561 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562
563 // Custom lower these because we can't specify a rule based on an illegal
564 // source bf16.
567
568 if (Subtarget->has16BitInsts()) {
571 MVT::i16, Legal);
572
573 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
574
576 MVT::i16, Expand);
577
581 ISD::CTPOP},
582 MVT::i16, Promote);
583
585
586 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
587
589 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
591 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
592
596
598
599 // F16 - Constant Actions.
602
603 // F16 - Load/Store Actions.
605 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
607 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
608
609 // BF16 - Load/Store Actions.
611 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
613 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
614
615 // F16 - VOP1 Actions.
618 MVT::f16, Custom);
619
620 // BF16 - VOP1 Actions.
621 if (Subtarget->hasBF16TransInsts())
623
626
627 // F16 - VOP2 Actions.
628 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
629 Expand);
633
634 // F16 - VOP3 Actions.
636 if (STI.hasMadF16())
638
639 for (MVT VT :
640 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
641 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
642 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
643 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
644 switch (Op) {
645 case ISD::LOAD:
646 case ISD::STORE:
648 case ISD::BITCAST:
649 case ISD::UNDEF:
654 case ISD::IS_FPCLASS:
655 break;
659 break;
660 default:
662 break;
663 }
664 }
665 }
666
667 // v_perm_b32 can handle either of these.
668 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
670
671 // XXX - Do these do anything? Vector constants turn into build_vector.
672 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
673
674 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
675 Legal);
676
678 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
680 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
681
683 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
685 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
686
687 setOperationAction(ISD::AND, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::OR, MVT::v2i16, Promote);
690 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
691 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
693
695 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
697 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
698 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
699 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
700
702 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
704 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
706 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
707
709 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
711 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
712 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
714
716 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
718 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
719
721 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
723 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
725 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
726
727 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
729 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
731 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
732 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
733
735 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
737 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
738 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
739 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
740
741 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
743 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
744 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
745 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
747
749 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
751 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
752 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
753 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
754
756 MVT::v2i32, Expand);
758
760 MVT::v4i32, Expand);
761
763 MVT::v8i32, Expand);
764
765 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
766 Subtarget->hasVOP3PInsts() ? Legal : Custom);
767
768 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
769 // This isn't really legal, but this avoids the legalizer unrolling it (and
770 // allows matching fneg (fabs x) patterns)
771 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
772
773 // Can do this in one BFI plus a constant materialize.
775 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
776 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
777 MVT::v32f16, MVT::v32bf16},
778 Custom);
779
782 MVT::f16, Custom);
784
787 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
788 Custom);
789
791 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
792 Expand);
793
794 for (MVT Vec16 :
795 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
796 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
799 Vec16, Custom);
801 }
802 }
803
804 if (Subtarget->hasVOP3PInsts()) {
808 MVT::v2i16, Legal);
809
812 MVT::v2f16, Legal);
813
815 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
816
818 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
819 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
820 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
821 Custom);
822
823 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
824 // Split vector operations.
829 VT, Custom);
830
831 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
832 // Split vector operations.
834 VT, Custom);
835
838 {MVT::v2f16, MVT::v4f16}, Custom);
839
840 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
841 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
842 Custom);
843
844 if (Subtarget->hasBF16PackedInsts()) {
845 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
846 // Split vector operations.
848 VT, Custom);
849 }
850
851 if (Subtarget->hasPackedFP32Ops()) {
853 MVT::v2f32, Legal);
855 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
856 Custom);
857 }
858 }
859
861
862 if (Subtarget->has16BitInsts()) {
864 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
866 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
867 } else {
868 // Legalization hack.
869 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
870
872 }
873
875 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
876 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
878 MVT::v32f16, MVT::v32bf16},
879 Custom);
880
882
883 if (Subtarget->hasVectorMulU64())
885 else if (Subtarget->hasScalarSMulU64())
887
888 if (Subtarget->hasMad64_32())
890
891 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
893
894 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
896 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
897 } else {
898 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
899 if (Subtarget->hasMinimum3Maximum3F32())
901
902 if (Subtarget->hasMinimum3Maximum3PKF16()) {
904
905 // If only the vector form is available, we need to widen to a vector.
906 if (!Subtarget->hasMinimum3Maximum3F16())
908 }
909 }
910
911 if (Subtarget->hasVOP3PInsts()) {
912 // We want to break these into v2f16 pieces, not scalarize.
914 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
915 Custom);
916 }
917
918 if (Subtarget->hasIntMinMax64())
920 Legal);
921
923 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
924 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
925 MVT::i8},
926 Custom);
927
929 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
930 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
931 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
932 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
933 Custom);
934
936 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
937 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
938 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
939 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
940 Custom);
941
947
948 // TODO: Could move this to custom lowering, could benefit from combines on
949 // extract of relevant bits.
951
953
954 if (Subtarget->hasBF16ConversionInsts()) {
955 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
957 }
958
959 if (Subtarget->hasBF16PackedInsts()) {
962 MVT::v2bf16, Legal);
963 }
964
965 if (Subtarget->hasBF16TransInsts()) {
967 }
968
969 if (Subtarget->hasCvtPkF16F32Inst()) {
971 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
972 Custom);
973 }
974
978 ISD::SUB,
980 ISD::MUL,
981 ISD::FADD,
982 ISD::FSUB,
983 ISD::FDIV,
984 ISD::FMUL,
993 ISD::FMA,
994 ISD::SMIN,
995 ISD::SMAX,
996 ISD::UMIN,
997 ISD::UMAX,
1000 ISD::SMIN,
1001 ISD::SMAX,
1002 ISD::UMIN,
1003 ISD::UMAX,
1004 ISD::AND,
1005 ISD::OR,
1006 ISD::XOR,
1007 ISD::SHL,
1008 ISD::SRL,
1009 ISD::SRA,
1010 ISD::FSHR,
1020
1021 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1023
1024 // All memory operations. Some folding on the pointer operand is done to help
1025 // matching the constant offsets in the addressing modes.
1027 ISD::STORE,
1052
1053 // FIXME: In other contexts we pretend this is a per-function property.
1055
1057}
1058
1059const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1060
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1063 return RCRegs;
1064}
1065
1066//===----------------------------------------------------------------------===//
1067// TargetLowering queries
1068//===----------------------------------------------------------------------===//
1069
1070// v_mad_mix* support a conversion from f16 to f32.
1071//
1072// There is only one special case when denormals are enabled we don't currently,
1073// where this is OK to use.
1074bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1075 EVT DestVT, EVT SrcVT) const {
1076 return DestVT.getScalarType() == MVT::f32 &&
1077 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1079 SrcVT.getScalarType() == MVT::f16) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1081 SrcVT.getScalarType() == MVT::bf16)) &&
1082 // TODO: This probably only requires no input flushing?
1084}
1085
1087 LLT DestTy, LLT SrcTy) const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1090 DestTy.getScalarSizeInBits() == 32 &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1092 // TODO: This probably only requires no input flushing?
1093 denormalModeIsFlushAllF32(*MI.getMF());
1094}
1095
1097 // SI has some legal vector types, but no legal vector operations. Say no
1098 // shuffles are legal in order to prefer scalarizing some vector operations.
1099 return false;
1100}
1101
1103 CallingConv::ID CC,
1104 EVT VT) const {
1106 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1107
1108 if (VT.isVector()) {
1109 EVT ScalarVT = VT.getScalarType();
1110 unsigned Size = ScalarVT.getSizeInBits();
1111 if (Size == 16) {
1112 if (Subtarget->has16BitInsts())
1113 return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
1114 return VT.isInteger() ? MVT::i32 : MVT::f32;
1115 }
1116
1117 if (Size < 16)
1118 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1119 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1120 }
1121
1122 if (VT.getSizeInBits() > 32)
1123 return MVT::i32;
1124
1125 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1126}
1127
1129 CallingConv::ID CC,
1130 EVT VT) const {
1132 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1133
1134 if (VT.isVector()) {
1135 unsigned NumElts = VT.getVectorNumElements();
1136 EVT ScalarVT = VT.getScalarType();
1137 unsigned Size = ScalarVT.getSizeInBits();
1138
1139 // FIXME: Should probably promote 8-bit vectors to i16.
1140 if (Size == 16 && Subtarget->has16BitInsts())
1141 return (NumElts + 1) / 2;
1142
1143 if (Size <= 32)
1144 return NumElts;
1145
1146 if (Size > 32)
1147 return NumElts * ((Size + 31) / 32);
1148 } else if (VT.getSizeInBits() > 32)
1149 return (VT.getSizeInBits() + 31) / 32;
1150
1151 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1152}
1153
1155 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1156 unsigned &NumIntermediates, MVT &RegisterVT) const {
1157 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1158 unsigned NumElts = VT.getVectorNumElements();
1159 EVT ScalarVT = VT.getScalarType();
1160 unsigned Size = ScalarVT.getSizeInBits();
1161 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1162 // support, but unless we can properly handle 3-vectors, it will be still be
1163 // inconsistent.
1164 if (Size == 16 && Subtarget->has16BitInsts()) {
1165 RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
1166 IntermediateVT = RegisterVT;
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1169 }
1170
1171 if (Size == 32) {
1172 RegisterVT = ScalarVT.getSimpleVT();
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1176 }
1177
1178 if (Size < 16 && Subtarget->has16BitInsts()) {
1179 // FIXME: Should probably form v2i16 pieces
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size != 16 && Size <= 32) {
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1191 }
1192
1193 if (Size > 32) {
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((Size + 31) / 32);
1197 return NumIntermediates;
1198 }
1199 }
1200
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1203}
1204
// NOTE(review): the declaration line was dropped by extraction; this is the
// static helper memVTFromLoadIntrData, which computes the memory EVT actually
// accessed by a load intrinsic returning Ty, clamped to at most MaxNumLanes
// vector lanes.
1206 const DataLayout &DL, Type *Ty,
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1209
1210 LLVMContext &Ctx = Ty->getContext();
// Vector data types are truncated to the number of lanes actually loaded.
1211 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1213 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1214 NumElts);
1215 }
1216
// Scalar types pass through unchanged.
1217 return TLI.getValueType(DL, Ty);
1218}
1219
1220// Peek through TFE struct returns to only use the data size.
// NOTE(review): the declaration line was dropped by extraction; this is the
// static helper memVTFromLoadIntrReturn.
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
// Non-struct returns go straight to the data-type computation.
1224 auto *ST = dyn_cast<StructType>(Ty);
1225 if (!ST)
1226 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1227
1228 // TFE intrinsics return an aggregate type.
// The second member is the i32 TFE/error status; only the first (data)
// member contributes to the memory VT.
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1231 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1232}
1233
1234/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1235/// in-memory representation. This return value is a custom type because there
1236/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1237/// could cause issues during codegen, these address space 7 pointers will be
1238/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1239/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1240/// for cost modeling, to work. (This also sets us up decently for doing the
1241/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
// NOTE(review): extraction dropped lines 1242 (the getPointerTy declaration),
// 1245 (presumably the BUFFER_STRIDED_POINTER condition), and 1248 (the
// default return) — confirm against the original file.
1243 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1244 return MVT::amdgpuBufferFatPointer;
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1249}
1250/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1251/// v8i32 when padding is added.
1252/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1253/// also v8i32 with padding.
// NOTE(review): extraction dropped lines 1254 (the getPointerMemTy
// declaration), 1257 (presumably the BUFFER_STRIDED_POINTER condition), and
// 1260 (the default return) — confirm against the original file.
1255 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1256 DL.getPointerSizeInBits(AS) == 160) ||
1258 DL.getPointerSizeInBits(AS) == 192))
1259 return MVT::v8i32;
1261}
1262
1263static unsigned getIntrMemWidth(unsigned IntrID) {
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1267 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1268 return 8;
1269 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1270 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1271 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1272 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1273 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1274 return 32;
1275 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1276 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1277 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1278 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1279 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1280 return 64;
1281 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1282 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1283 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1284 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1285 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1286 return 128;
1287 default:
1288 llvm_unreachable("Unknown width");
1289 }
1290}
1291
1292static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
// NOTE(review): extraction dropped line 1293 (the second parameter — most
// likely the IntrinsicInfo out-parameter written below) and the case labels
// at lines 1297-1298, 1300-1301, 1303-1304, and 1307 of the switch. The
// surviving body extracts the C-ABI atomic ordering and the sync-scope
// metadata from the call's operands. Confirm against the original file.
// Loads carry (ordering, scope) at operand indices 1/2; stores at 2/3.
1294 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1295 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1296 switch (AtomicOrderingCABI(Ord)) {
// Case labels lost in extraction — each arm presumably assigns an ordering
// before its break; verify which orderings map to which arm.
1299 break;
1302 break;
1305 break;
1306 default:
1308 break;
1309 }
1310
// Flag initialization; the right-hand side of line 1311 (and line 1312) was
// dropped by extraction.
1311 Info.flags =
1313 Info.flags |= MOCooperative;
1314
// Resolve the sync-scope string from the metadata operand into a SyncScope
// ID on the call's context.
1315 MDNode *ScopeMD = cast<MDNode>(
1316 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1317 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1318 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1319}
1320
// NOTE(review): the extraction dropped the line naming this function; from the
// signature and body this is SITargetLowering::getTgtMemIntrinsic, which
// fills IntrinsicInfo (opcode, memVT, pointer, flags) for AMDGPU memory
// intrinsics so the DAG builder can attach MachineMemOperands. Returns true
// if the intrinsic accesses memory. Several interior lines were also dropped
// (flag-constant expressions, some condition lines) — comments below hedge
// where the code is incomplete.
1322 const CallBase &CI,
1323 MachineFunction &MF,
1324 unsigned IntrID) const {
// Start from no flags and fold in IR-level metadata hints.
1325 Info.flags = MachineMemOperand::MONone;
1326 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1327 Info.flags |= MachineMemOperand::MOInvariant;
1328 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
// NOTE(review): line 1329 (presumably |= MONonTemporal) was dropped.
1330 Info.flags |= getTargetMMOFlags(CI);
1331
// --- Buffer / image (rsrc-based) intrinsics -------------------------------
// NOTE(review): line 1333 (the lookupRsrcIntrinsic call) was dropped.
1332 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1334 AttributeSet Attr =
1336 MemoryEffects ME = Attr.getMemoryEffects();
1337 if (ME.doesNotAccessMemory())
1338 return false;
1339
1340 // TODO: Should images get their own address space?
1341 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1342
1343 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1344 if (RsrcIntr->IsImage) {
// NOTE(review): line 1346 (the getImageDimIntrinsicInfo call) was dropped.
1345 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1347 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1348 Info.align.reset();
1349 }
1350
1351 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1352 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1353 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1354 // We conservatively set the memory operand of a buffer intrinsic to the
1355 // base resource pointer, so that we can access alias information about
1356 // those pointers. Cases like "this points at the same value
1357 // but with a different offset" are handled in
1358 // areMemAccessesTriviallyDisjoint.
1359 Info.ptrVal = RsrcArg;
1360 }
1361
// The last operand of non-prefetch rsrc intrinsics is the cache-policy
// (aux) immediate; honor its VOLATILE bit.
1362 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1363 if (!IsSPrefetch) {
1364 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1365 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1366 Info.flags |= MachineMemOperand::MOVolatile;
1367 }
1368
// Read-only rsrc intrinsics: derive memVT from the return type, trimmed to
// the lanes actually enabled by the image dmask when applicable.
1370 if (ME.onlyReadsMemory()) {
1371 if (RsrcIntr->IsImage) {
1372 unsigned MaxNumLanes = 4;
1373
1374 if (!BaseOpcode->Gather4) {
1375 // If this isn't a gather, we may have excess loaded elements in the
1376 // IR type. Check the dmask for the real number of elements loaded.
1377 unsigned DMask =
1378 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1379 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1380 }
1381
1382 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1383 CI.getType(), MaxNumLanes);
1384 } else {
// NOTE(review): line 1386 (the memVTFromLoadIntrReturn call head) was
// dropped by extraction.
1385 Info.memVT =
1387 std::numeric_limits<unsigned>::max());
1388 }
1389
1390 // FIXME: What does alignment mean for an image?
1391 Info.opc = ISD::INTRINSIC_W_CHAIN;
1392 Info.flags |= MachineMemOperand::MOLoad;
// Write-only rsrc intrinsics: derive memVT from the stored data operand.
1393 } else if (ME.onlyWritesMemory()) {
1394 Info.opc = ISD::INTRINSIC_VOID;
1395
1396 Type *DataTy = CI.getArgOperand(0)->getType();
1397 if (RsrcIntr->IsImage) {
1398 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1399 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1400 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1401 DMaskLanes);
1402 } else
1403 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1404
1405 Info.flags |= MachineMemOperand::MOStore;
1406 } else {
1407 // Atomic, NoReturn Sampler or prefetch
// NOTE(review): lines 1409 and 1411 (the INTRINSIC_W_CHAIN alternative and
// the flag-constant expression) were dropped by extraction.
1408 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1410 Info.flags |=
1412
1413 if (!IsSPrefetch)
1414 Info.flags |= MachineMemOperand::MOStore;
1415
1416 switch (IntrID) {
1417 default:
1418 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1419 // Fake memory access type for no return sampler intrinsics
1420 Info.memVT = MVT::i32;
1421 } else {
1422 // XXX - Should this be volatile without known ordering?
1423 Info.flags |= MachineMemOperand::MOVolatile;
1424 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1425 }
1426 break;
// Buffer-to-LDS loads: width comes from an immediate operand, pointer is
// the LDS destination.
1427 case Intrinsic::amdgcn_raw_buffer_load_lds:
1428 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1429 case Intrinsic::amdgcn_struct_buffer_load_lds:
1430 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1431 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1432 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1433 Info.ptrVal = CI.getArgOperand(1);
1434 return true;
1435 }
// Atomic buffer loads read but never store; strip the store flag set above.
1436 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1437 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1438 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1439 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
// NOTE(review): line 1441 (the memVTFromLoadIntrReturn call head) was
// dropped by extraction.
1440 Info.memVT =
1442 std::numeric_limits<unsigned>::max());
1443 Info.flags &= ~MachineMemOperand::MOStore;
1444 return true;
1445 }
1446 }
1447 }
1448 return true;
1449 }
1450
// --- Non-rsrc memory intrinsics, keyed directly on the intrinsic ID -------
1451 switch (IntrID) {
1452 case Intrinsic::amdgcn_ds_ordered_add:
1453 case Intrinsic::amdgcn_ds_ordered_swap: {
1454 Info.opc = ISD::INTRINSIC_W_CHAIN;
1455 Info.memVT = MVT::getVT(CI.getType());
1456 Info.ptrVal = CI.getOperand(0);
1457 Info.align.reset();
// NOTE(review): line 1458 (presumably the MOLoad|MOStore flag assignment)
// was dropped by extraction.
1459
// Operand 4 is a volatility immediate on these ordered DS intrinsics.
1460 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1461 if (!Vol->isZero())
1462 Info.flags |= MachineMemOperand::MOVolatile;
1463
1464 return true;
1465 }
1466 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1467 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1468 Info.opc = ISD::INTRINSIC_W_CHAIN;
1469 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
// No IR pointer — these access the abstract streamout-register space.
1470 Info.ptrVal = nullptr;
1471 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
// NOTE(review): line 1472 (the flag assignment) was dropped by extraction.
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_append:
1476 case Intrinsic::amdgcn_ds_consume: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getType());
1479 Info.ptrVal = CI.getOperand(0);
1480 Info.align.reset();
// NOTE(review): line 1481 (presumably the MOLoad|MOStore flag assignment)
// was dropped by extraction.
1482
// Operand 1 is a volatility immediate here.
1483 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1484 if (!Vol->isZero())
1485 Info.flags |= MachineMemOperand::MOVolatile;
1486
1487 return true;
1488 }
1489 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1490 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
// Only the _rtn variant produces a value; the async variant is void.
// NOTE(review): lines 1492-1493 (the two ISD opcode alternatives of this
// conditional) were dropped by extraction.
1491 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1494 Info.memVT = MVT::getVT(CI.getType());
1495 Info.ptrVal = CI.getOperand(0);
// memVT is immediately overridden to a fixed 8-byte access.
1496 Info.memVT = MVT::i64;
1497 Info.size = 8;
1498 Info.align.reset();
// NOTE(review): line 1499 (the flag assignment) was dropped by extraction.
1500 return true;
1501 }
1502 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1503 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1504 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1505 Info.opc = ISD::INTRINSIC_W_CHAIN;
// NOTE(review): line 1509 (the else-operand of this conditional, for the
// dual/bvh8 forms) was dropped by extraction.
1506 Info.memVT =
1507 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1508 ? CI.getType()
1510 ->getElementType(0)); // XXX: what is correct VT?
1511
1512 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1513 Info.align.reset();
// NOTE(review): line 1515 (the flag-constant expression) was dropped.
1514 Info.flags |=
1516 return true;
1517 }
1518 case Intrinsic::amdgcn_global_atomic_fmin_num:
1519 case Intrinsic::amdgcn_global_atomic_fmax_num:
1520 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1521 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1522 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1523 Info.opc = ISD::INTRINSIC_W_CHAIN;
1524 Info.memVT = MVT::getVT(CI.getType());
1525 Info.ptrVal = CI.getOperand(0);
1526 Info.align.reset();
// NOTE(review): lines 1527-1529 (the multi-line flag assignment for these
// atomics) were dropped by extraction.
1530 return true;
1531 }
// Monitor/cluster/transpose loads: plain loads whose VT is the return type.
1532 case Intrinsic::amdgcn_flat_load_monitor_b32:
1533 case Intrinsic::amdgcn_flat_load_monitor_b64:
1534 case Intrinsic::amdgcn_flat_load_monitor_b128:
1535 case Intrinsic::amdgcn_global_load_monitor_b32:
1536 case Intrinsic::amdgcn_global_load_monitor_b64:
1537 case Intrinsic::amdgcn_global_load_monitor_b128:
1538 case Intrinsic::amdgcn_cluster_load_b32:
1539 case Intrinsic::amdgcn_cluster_load_b64:
1540 case Intrinsic::amdgcn_cluster_load_b128:
1541 case Intrinsic::amdgcn_ds_load_tr6_b96:
1542 case Intrinsic::amdgcn_ds_load_tr4_b64:
1543 case Intrinsic::amdgcn_ds_load_tr8_b64:
1544 case Intrinsic::amdgcn_ds_load_tr16_b128:
1545 case Intrinsic::amdgcn_global_load_tr6_b96:
1546 case Intrinsic::amdgcn_global_load_tr4_b64:
1547 case Intrinsic::amdgcn_global_load_tr_b64:
1548 case Intrinsic::amdgcn_global_load_tr_b128:
1549 case Intrinsic::amdgcn_ds_read_tr4_b64:
1550 case Intrinsic::amdgcn_ds_read_tr6_b96:
1551 case Intrinsic::amdgcn_ds_read_tr8_b64:
1552 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1553 Info.opc = ISD::INTRINSIC_W_CHAIN;
1554 Info.memVT = MVT::getVT(CI.getType());
1555 Info.ptrVal = CI.getOperand(0);
1556 Info.align.reset();
1557 Info.flags |= MachineMemOperand::MOLoad;
1558 return true;
1559 }
// Cooperative atomic loads: width from the intrinsic name, ordering/scope
// from the operands (see getCoopAtomicOperandsInfo).
1560 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1561 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1562 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1563 Info.opc = ISD::INTRINSIC_W_CHAIN;
1564 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1565 Info.ptrVal = CI.getOperand(0);
1566 Info.align.reset();
1567 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1568 return true;
1569 }
1570 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1571 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1572 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1573 Info.opc = ISD::INTRINSIC_VOID;
1574 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1575 Info.ptrVal = CI.getArgOperand(0);
1576 Info.align.reset();
1577 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1578 return true;
1579 }
// GWS operations: abstract access through a pseudo-source value.
1580 case Intrinsic::amdgcn_ds_gws_init:
1581 case Intrinsic::amdgcn_ds_gws_barrier:
1582 case Intrinsic::amdgcn_ds_gws_sema_v:
1583 case Intrinsic::amdgcn_ds_gws_sema_br:
1584 case Intrinsic::amdgcn_ds_gws_sema_p:
1585 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1586 Info.opc = ISD::INTRINSIC_VOID;
1587
1588 const GCNTargetMachine &TM =
1589 static_cast<const GCNTargetMachine &>(getTargetMachine());
1590
// NOTE(review): line 1591 (the SIMachineFunctionInfo lookup) was dropped.
1592 Info.ptrVal = MFI->getGWSPSV(TM);
1593
1594 // This is an abstract access, but we need to specify a type and size.
1595 Info.memVT = MVT::i32;
1596 Info.size = 4;
1597 Info.align = Align(4);
1598
1599 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1600 Info.flags |= MachineMemOperand::MOLoad;
1601 else
1602 Info.flags |= MachineMemOperand::MOStore;
1603 return true;
1604 }
// Async loads into LDS: the LDS destination pointer is operand 1.
1605 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1606 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1607 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1608 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1609 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1610 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1611 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1612 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1613 Info.opc = ISD::INTRINSIC_VOID;
1614 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1615 Info.ptrVal = CI.getArgOperand(1);
// NOTE(review): line 1616 (the flag assignment) was dropped by extraction.
1617 return true;
1618 }
// Async stores from LDS: the LDS source pointer is operand 0.
1619 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1620 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1621 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1622 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1623 Info.opc = ISD::INTRINSIC_VOID;
1624 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1625 Info.ptrVal = CI.getArgOperand(0);
// NOTE(review): line 1626 (the flag assignment) was dropped by extraction.
1627 return true;
1628 }
1629 case Intrinsic::amdgcn_load_to_lds:
1630 case Intrinsic::amdgcn_global_load_lds: {
1631 Info.opc = ISD::INTRINSIC_VOID;
1632 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1634 Info.ptrVal = CI.getArgOperand(1);
// NOTE(review): line 1635 (the flag assignment) was dropped by extraction.
// The trailing operand is the cache-policy immediate; honor VOLATILE.
1636 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1637 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1638 Info.flags |= MachineMemOperand::MOVolatile;
1639 return true;
1640 }
// BVH stack ops: abstract access modeled through the GWS pseudo-source.
1641 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1642 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1643 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1644 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1645 Info.opc = ISD::INTRINSIC_W_CHAIN;
1646
1647 const GCNTargetMachine &TM =
1648 static_cast<const GCNTargetMachine &>(getTargetMachine());
1649
// NOTE(review): line 1650 (the SIMachineFunctionInfo lookup) was dropped.
1651 Info.ptrVal = MFI->getGWSPSV(TM);
1652
1653 // This is an abstract access, but we need to specify a type and size.
1654 Info.memVT = MVT::i32;
1655 Info.size = 4;
1656 Info.align = Align(4);
1657
// NOTE(review): line 1658 (the flag assignment) was dropped by extraction.
1659 return true;
1660 }
// Prefetches are modeled as a 1-byte load.
1661 case Intrinsic::amdgcn_s_prefetch_data:
1662 case Intrinsic::amdgcn_flat_prefetch:
1663 case Intrinsic::amdgcn_global_prefetch: {
1664 Info.opc = ISD::INTRINSIC_VOID;
1665 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1666 Info.ptrVal = CI.getArgOperand(0);
1667 Info.flags |= MachineMemOperand::MOLoad;
1668 return true;
1669 }
1670 default:
1671 return false;
1672 }
1673}
1674
// NOTE(review): extraction dropped the line naming this function (and line
// 1677, the switch head). From the signature this is
// SITargetLowering::CollectTargetIntrinsicOperands, which appends extra DAG
// operands for intrinsics whose lowering needs information the value types
// do not carry. Confirm against the original file.
1676 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1678 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1679 // The DAG's ValueType loses the addrspaces.
1680 // Add them as 2 extra Constant operands "from" and "to".
1681 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1682 unsigned DstAS = I.getType()->getPointerAddressSpace();
1683 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1684 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1685 break;
1686 }
1687 default:
1688 break;
1689 }
1690}
1691
// NOTE(review): extraction dropped the lines naming this function; from the
// body this is SITargetLowering::getAddrModeArguments, which reports the
// pointer operand (and access type) of a memory intrinsic so
// LoadStoreVectorizer/CGP-style passes can reason about its address.
// Returns false for intrinsics that do not take a plain pointer.
1694 Type *&AccessTy) const {
1695 Value *Ptr = nullptr;
1696 switch (II->getIntrinsicID()) {
// These intrinsics carry their pointer as operand 0.
1697 case Intrinsic::amdgcn_cluster_load_b128:
1698 case Intrinsic::amdgcn_cluster_load_b64:
1699 case Intrinsic::amdgcn_cluster_load_b32:
1700 case Intrinsic::amdgcn_ds_append:
1701 case Intrinsic::amdgcn_ds_consume:
1702 case Intrinsic::amdgcn_ds_load_tr8_b64:
1703 case Intrinsic::amdgcn_ds_load_tr16_b128:
1704 case Intrinsic::amdgcn_ds_load_tr4_b64:
1705 case Intrinsic::amdgcn_ds_load_tr6_b96:
1706 case Intrinsic::amdgcn_ds_read_tr4_b64:
1707 case Intrinsic::amdgcn_ds_read_tr6_b96:
1708 case Intrinsic::amdgcn_ds_read_tr8_b64:
1709 case Intrinsic::amdgcn_ds_read_tr16_b64:
1710 case Intrinsic::amdgcn_ds_ordered_add:
1711 case Intrinsic::amdgcn_ds_ordered_swap:
1712 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1713 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1714 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1715 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1716 case Intrinsic::amdgcn_flat_load_monitor_b128:
1717 case Intrinsic::amdgcn_flat_load_monitor_b32:
1718 case Intrinsic::amdgcn_flat_load_monitor_b64:
1719 case Intrinsic::amdgcn_global_atomic_fmax_num:
1720 case Intrinsic::amdgcn_global_atomic_fmin_num:
1721 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1722 case Intrinsic::amdgcn_global_load_monitor_b128:
1723 case Intrinsic::amdgcn_global_load_monitor_b32:
1724 case Intrinsic::amdgcn_global_load_monitor_b64:
1725 case Intrinsic::amdgcn_global_load_tr_b64:
1726 case Intrinsic::amdgcn_global_load_tr_b128:
1727 case Intrinsic::amdgcn_global_load_tr4_b64:
1728 case Intrinsic::amdgcn_global_load_tr6_b96:
1729 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1730 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1731 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1732 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1733 Ptr = II->getArgOperand(0);
1734 break;
// These intrinsics carry their pointer as operand 1 (operand 0 is the
// other endpoint of the LDS transfer).
1735 case Intrinsic::amdgcn_load_to_lds:
1736 case Intrinsic::amdgcn_global_load_lds:
1737 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1738 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1739 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1740 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1741 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1742 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1743 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1744 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1745 Ptr = II->getArgOperand(1);
1746 break;
1747 default:
1748 return false;
1749 }
1750 AccessTy = II->getType();
1751 Ops.push_back(Ptr);
1752 return true;
1753}
1754
// NOTE(review): extraction dropped line 1755 (the function name — from
// context, SITargetLowering::isLegalFlatAddressingMode) and lines 1764-1766
// (the initializer selecting the FLAT instruction variant for AddrSpace).
// Checks whether AddrMode AM is representable by a FLAT-family instruction.
1756 unsigned AddrSpace) const {
1757 if (!Subtarget->hasFlatInstOffsets()) {
1758 // Flat instructions do not have offsets, and only have the register
1759 // address.
1760 return AM.BaseOffs == 0 && AM.Scale == 0;
1761 }
1762
1763 decltype(SIInstrFlags::FLAT) FlatVariant =
1767
// FLAT never supports scaled indices; an immediate offset is legal only if
// it fits the selected flat variant's offset field.
1768 return AM.Scale == 0 &&
1769 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1770 AM.BaseOffs, AddrSpace, FlatVariant));
1771}
1772
// NOTE(review): extraction dropped line 1773 (the function name — from
// context, SITargetLowering::isLegalGlobalAddressingMode), line 1775 (the
// return for the flat-global path), and line 1787 (the return inside the VI
// branch). Checks addressing-mode legality for global-address accesses.
1774 if (Subtarget->hasFlatGlobalInsts())
1776
1777 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1778 // Assume the we will use FLAT for all global memory accesses
1779 // on VI.
1780 // FIXME: This assumption is currently wrong. On VI we still use
1781 // MUBUF instructions for the r + i addressing mode. As currently
1782 // implemented, the MUBUF instructions only work on buffer < 4GB.
1783 // It may be possible to support > 4GB buffers with MUBUF instructions,
1784 // by setting the stride value in the resource descriptor which would
1785 // increase the size limit to (stride * 4GB). However, this is risky,
1786 // because it has never been validated.
1788 }
1789
// Otherwise global accesses go through MUBUF addressing rules.
1790 return isLegalMUBUFAddressingMode(AM);
1791}
1792
1793bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1794 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1795 // additionally can do r + r + i with addr64. 32-bit has more addressing
1796 // mode options. Depending on the resource constant, it can also do
1797 // (i64 r0) + (i32 r1) * (i14 i).
1798 //
1799 // Private arrays end up using a scratch buffer most of the time, so also
1800 // assume those use MUBUF instructions. Scratch loads / stores are currently
1801 // implemented as mubuf instructions with offen bit set, so slightly
1802 // different than the normal addr64.
1803 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1804 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1805 return false;
1806
1807 // FIXME: Since we can split immediate into soffset and immediate offset,
1808 // would it make sense to allow any immediate?
1809
1810 switch (AM.Scale) {
1811 case 0: // r + i or just i, depending on HasBaseReg.
1812 return true;
1813 case 1:
1814 return true; // We have r + r or r + i.
1815 case 2:
1816 if (AM.HasBaseReg) {
1817 // Reject 2 * r + r.
1818 return false;
1819 }
1820
1821 // Allow 2 * r as r + r
1822 // Or 2 * r + i is allowed as r + r + i.
1823 return true;
1824 default: // Don't allow n * r
1825 return false;
1826 }
1827}
1828
// NOTE(review): extraction dropped line 1829 (the function name — from the
// signature, SITargetLowering::isLegalAddressingMode) plus several interior
// condition lines noted below. Dispatches addressing-mode legality by
// address space.
1830 const AddrMode &AM, Type *Ty,
1831 unsigned AS,
1832 Instruction *I) const {
1833 // No global is ever allowed as a base.
1834 if (AM.BaseGV)
1835 return false;
1836
1837 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1838 return isLegalGlobalAddressingMode(AM);
1839
// Constant/scalar-load address spaces (SMRD path). NOTE(review): lines
// 1841-1843 (the rest of this condition) were dropped by extraction.
1840 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1844 // If the offset isn't a multiple of 4, it probably isn't going to be
1845 // correctly aligned.
1846 // FIXME: Can we get the real alignment here?
1847 if (AM.BaseOffs % 4 != 0)
1848 return isLegalMUBUFAddressingMode(AM);
1849
1850 if (!Subtarget->hasScalarSubwordLoads()) {
1851 // There are no SMRD extloads, so if we have to do a small type access we
1852 // will use a MUBUF load.
1853 // FIXME?: We also need to do this if unaligned, but we don't know the
1854 // alignment here.
1855 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1856 return isLegalGlobalAddressingMode(AM);
1857 }
1858
// Per-generation SMRD/SMEM immediate-offset encodings.
1859 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1860 // SMRD instructions have an 8-bit, dword offset on SI.
1861 if (!isUInt<8>(AM.BaseOffs / 4))
1862 return false;
1863 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1864 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1865 // in 8-bits, it can use a smaller encoding.
1866 if (!isUInt<32>(AM.BaseOffs / 4))
1867 return false;
1868 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1869 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1870 if (!isUInt<20>(AM.BaseOffs))
1871 return false;
1872 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1873 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1874 // for S_BUFFER_* instructions).
1875 if (!isInt<21>(AM.BaseOffs))
1876 return false;
1877 } else {
1878 // On GFX12, all offsets are signed 24-bit in bytes.
1879 if (!isInt<24>(AM.BaseOffs))
1880 return false;
1881 }
1882
// NOTE(review): line 1884 (the middle of this condition) was dropped by
// extraction.
1883 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1885 AM.BaseOffs < 0) {
1886 // Scalar (non-buffer) loads can only use a negative offset if
1887 // soffset+offset is non-negative. Since the compiler can only prove that
1888 // in a few special cases, it is safer to claim that negative offsets are
1889 // not supported.
1890 return false;
1891 }
1892
1893 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1894 return true;
1895
1896 if (AM.Scale == 1 && AM.HasBaseReg)
1897 return true;
1898
1899 return false;
1900 }
1901
// Private (scratch): flat-scratch rules when enabled, else MUBUF.
// NOTE(review): line 1904 (the flat-scratch branch of this conditional) was
// dropped by extraction.
1902 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1903 return Subtarget->enableFlatScratch()
1905 : isLegalMUBUFAddressingMode(AM);
1906
1907 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1908 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1909 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1910 // field.
1911 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1912 // an 8-bit dword offset but we don't know the alignment here.
1913 if (!isUInt<16>(AM.BaseOffs))
1914 return false;
1915
1916 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1917 return true;
1918
1919 if (AM.Scale == 1 && AM.HasBaseReg)
1920 return true;
1921
1922 return false;
1923 }
1924
// NOTE(review): line 1925 (the condition — presumably FLAT_ADDRESS or
// UNKNOWN_ADDRESS_SPACE) and line 1931 (its return) were dropped.
1926 // For an unknown address space, this usually means that this is for some
1927 // reason being used for pure arithmetic, and not based on some addressing
1928 // computation. We don't have instructions that compute pointers with any
1929 // addressing modes, so treat them as having no offset like flat
1930 // instructions.
1932 }
1933
1934 // Assume a user alias of global for unknown address spaces.
1935 return isLegalGlobalAddressingMode(AM);
1936}
1937
// NOTE(review): extraction dropped line 1938 (the function name — from the
// signature, SITargetLowering::canMergeStoresTo) and lines 1940/1946 (the
// address-space conditions guarding the two size caps). Limits merged-store
// width per address space: 4 dwords, private element size, or 2 dwords.
1939 const MachineFunction &MF) const {
1941 return (MemVT.getSizeInBits() <= 4 * 32);
1942 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1943 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1944 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1945 }
1947 return (MemVT.getSizeInBits() <= 2 * 32);
1948 return true;
1949}
1950
// NOTE(review): extraction dropped line 1951 (the function name — from
// context, SITargetLowering::allowsMisalignedMemoryAccessesImpl). Decides
// whether a Size-bit access at the given alignment is allowed in AddrSpace
// and, via IsFast, a relative speed rank for it.
1952 unsigned Size, unsigned AddrSpace, Align Alignment,
1953 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1954 if (IsFast)
1955 *IsFast = 0;
1956
// --- LDS / GDS ------------------------------------------------------------
1957 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1958 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1959 // Check if alignment requirements for ds_read/write instructions are
1960 // disabled.
1961 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1962 return false;
1963
1964 Align RequiredAlignment(
1965 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1966 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1967 Alignment < RequiredAlignment)
1968 return false;
1969
1970 // Either, the alignment requirements are "enabled", or there is an
1971 // unaligned LDS access related hardware bug though alignment requirements
1972 // are "disabled". In either case, we need to check for proper alignment
1973 // requirements.
1974 //
1975 switch (Size) {
1976 case 64:
1977 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1978 // address is negative, then the instruction is incorrectly treated as
1979 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1980 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1981 // load later in the SILoadStoreOptimizer.
1982 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1983 return false;
1984
1985 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1986 // can do a 4 byte aligned, 8 byte access in a single operation using
1987 // ds_read2/write2_b32 with adjacent offsets.
1988 RequiredAlignment = Align(4);
1989
1990 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1991 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1992 // ds_write2_b32 depending on the alignment. In either case with either
1993 // alignment there is no faster way of doing this.
1994
1995 // The numbers returned here and below are not additive, it is a 'speed
1996 // rank'. They are just meant to be compared to decide if a certain way
1997 // of lowering an operation is faster than another. For that purpose
1998 // naturally aligned operation gets it bitsize to indicate that "it
1999 // operates with a speed comparable to N-bit wide load". With the full
2000 // alignment ds128 is slower than ds96 for example. If underaligned it
2001 // is comparable to a speed of a single dword access, which would then
2002 // mean 32 < 128 and it is faster to issue a wide load regardless.
2003 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2004 // wider load which will not be aligned anymore the latter is slower.
2005 if (IsFast)
2006 *IsFast = (Alignment >= RequiredAlignment) ? 64
2007 : (Alignment < Align(4)) ? 32
2008 : 1;
2009 return true;
2010 }
2011
2012 break;
2013 case 96:
2014 if (!Subtarget->hasDS96AndDS128())
2015 return false;
2016
2017 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2018 // gfx8 and older.
2019
2020 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2021 // Naturally aligned access is fastest. However, also report it is Fast
2022 // if memory is aligned less than DWORD. A narrow load or store will be
2023 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2024 // be more of them, so overall we will pay less penalty issuing a single
2025 // instruction.
2026
2027 // See comment on the values above.
2028 if (IsFast)
2029 *IsFast = (Alignment >= RequiredAlignment) ? 96
2030 : (Alignment < Align(4)) ? 32
2031 : 1;
2032 return true;
2033 }
2034
2035 break;
2036 case 128:
2037 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2038 return false;
2039
2040 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2041 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2042 // single operation using ds_read2/write2_b64.
2043 RequiredAlignment = Align(8);
2044
2045 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2046 // Naturally aligned access is fastest. However, also report it is Fast
2047 // if memory is aligned less than DWORD. A narrow load or store will be
2048 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2049 // will be more of them, so overall we will pay less penalty issuing a
2050 // single instruction.
2051
2052 // See comment on the values above.
2053 if (IsFast)
2054 *IsFast = (Alignment >= RequiredAlignment) ? 128
2055 : (Alignment < Align(4)) ? 32
2056 : 1;
2057 return true;
2058 }
2059
2060 break;
2061 default:
2062 if (Size > 32)
2063 return false;
2064
2065 break;
2066 }
2067
2068 // See comment on the values above.
2069 // Note that we have a single-dword or sub-dword here, so if underaligned
2070 // it is a slowest possible access, hence returned value is 0.
2071 if (IsFast)
2072 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2073
2074 return Alignment >= RequiredAlignment ||
2075 Subtarget->hasUnalignedDSAccessEnabled();
2076 }
2077
// --- Private / flat (may alias scratch) -----------------------------------
2078 // FIXME: We have to be conservative here and assume that flat operations
2079 // will access scratch. If we had access to the IR function, then we
2080 // could determine if any private memory was used in the function.
2081 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2082 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2083 bool AlignedBy4 = Alignment >= Align(4);
2084 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2085 if (IsFast)
2086 *IsFast = AlignedBy4 ? Size : 1;
2087 return true;
2088 }
2089
2090 if (IsFast)
2091 *IsFast = AlignedBy4;
2092
2093 return AlignedBy4;
2094 }
2095
// --- Global-like address spaces -------------------------------------------
2096 // So long as they are correct, wide global memory operations perform better
2097 // than multiple smaller memory ops -- even when misaligned
2098 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2099 if (IsFast)
2100 *IsFast = Size;
2101
2102 return Alignment >= Align(4) ||
2103 Subtarget->hasUnalignedBufferAccessEnabled();
2104 }
2105
2106 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2107 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2108 // out-of-bounds behavior, but in the edge case where an access starts
2109 // out-of-bounds and then enter in-bounds, the entire access would be treated
2110 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2111 // natural alignment of buffer accesses.
2112 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2113 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2114 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2115 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2116 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2117 return false;
2118 }
2119
2120 // Smaller than dword value must be aligned.
2121 if (Size < 32)
2122 return false;
2123
2124 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2125 // byte-address are ignored, thus forcing Dword alignment.
2126 // This applies to private, global, and constant memory.
2127 if (IsFast)
2128 *IsFast = 1;
2129
2130 return Size >= 32 && Alignment >= Align(4);
2131}
2132
// Public TargetLowering hook: forwards to allowsMisalignedMemoryAccessesImpl
// above (presumably passing VT's size in bits — the call head was dropped).
// NOTE(review): this is a doxygen extraction; the hyperlinked signature head
// (orig. line 2133) and the call line (2136) are missing. Confirm against
// upstream SIISelLowering.cpp before editing.
2134 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2135 unsigned *IsFast) const {
2137 Alignment, Flags, IsFast);
2138}
2139
// Pick a preferred type for lowering memcpy/memset-like operations: prefer
// 128-bit (v4i32) for >=16-byte dword-aligned-destination ops, 64-bit (v2i32)
// for >=8 bytes, otherwise defer to the generic choice (MVT::Other).
// NOTE(review): the hyperlinked signature head (orig. line 2140) was dropped
// by the doxygen extraction — confirm against upstream before editing.
2141 LLVMContext &Context, const MemOp &Op,
2142 const AttributeList &FuncAttributes) const {
2143 // FIXME: Should account for address space here.
2144
2145 // The default fallback uses the private pointer size as a guess for a type to
2146 // use. Make sure we switch these to 64-bit accesses.
2147
2148 if (Op.size() >= 16 &&
2149 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2150 return MVT::v4i32;
2151
2152 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2153 return MVT::v2i32;
2154
2155 // Use the default.
2156 return MVT::Other;
2157}
2158
// Returns true when the memory node's operand carries the target-specific
// MONoClobber flag (memory known not to be clobbered).
// NOTE(review): the signature line (orig. 2159) was dropped by the doxygen
// extraction — confirm against upstream before editing.
2160 const MemSDNode *MemNode = cast<MemSDNode>(N);
2161 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2162}
2163
2168
// Whether an address-space cast from SrcAS to DestAS requires no code.
// Flat -> private/local is a truncate and flat -> global is a no-op, except
// that flat -> private needs an adjustment when scratch is globally
// addressable. Everything else defers to the target machine's no-op query.
// NOTE(review): the signature head (orig. line 2169) was dropped by the
// doxygen extraction — confirm against upstream before editing.
2170 unsigned DestAS) const {
2171 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2172 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2173 Subtarget->hasGloballyAddressableScratch()) {
2174 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2175 return false;
2176 }
2177
2178 // Flat -> private/local is a simple truncate.
2179 // Flat -> global is no-op
2180 return true;
2181 }
2182
2183 const GCNTargetMachine &TM =
2184 static_cast<const GCNTargetMachine &>(getTargetMachine());
2185 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2186}
2187
2195
// Unconditionally allows the conversion queried by this hook.
// NOTE(review): the signature head (orig. line 2196) was dropped by the
// doxygen extraction; judging by the FIXME this is presumably
// shouldConvertConstantLoadToIntImm — confirm against upstream.
2197 Type *Ty) const {
2198 // FIXME: Could be smarter if called for vector constants.
2199 return true;
2200}
2201
// Subvector extraction is only considered cheap at index 0.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2202) AND the guard condition at line 2204 — the bare `return false;` below
// is clearly the body of a dropped `if`, since code follows it. Confirm
// against upstream before editing.
2203 unsigned Index) const {
2205 return false;
2206
2207 // TODO: Add more cases that are cheap.
2208 return Index == 0;
2209}
2210
2211bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2212 // TODO: This should be more aggressive, particular for 16-bit element
2213 // vectors. However there are some mixed improvements and regressions.
2214 EVT EltTy = VT.getVectorElementType();
2215 return EltTy.getSizeInBits() % 32 == 0;
2216}
2217
// Controls which (Op, VT) pairs DAG combines should form: with 16-bit
// instructions, i16 is only desirable for loads/stores; i1 setcc is never
// desirable since there is no i1 setcc instruction.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2218) and the final fallback return at line 2234 (presumably deferring to
// TargetLowering::isTypeDesirableForOp) — confirm against upstream.
2219 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2220 switch (Op) {
2221 case ISD::LOAD:
2222 case ISD::STORE:
2223 return true;
2224 default:
2225 return false;
2226 }
2227 }
2228
2229 // SimplifySetCC uses this function to determine whether or not it should
2230 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2231 if (VT == MVT::i1 && Op == ISD::SETCC)
2232 return false;
2233
2235}
2236
// NOTE(review): fragment only — the doxygen extraction dropped the signature
// (orig. lines 2237-2238) and the construction of PtrInfo (2240-2241).
// Returns a MachinePointerInfo used for kernarg accesses. Confirm the full
// definition against upstream before editing.
2239 // This isn't really a constant pool but close enough.
2242 return PtrInfo;
2243}
2244
// Build a pointer to a kernel argument at the given byte Offset from the
// kernarg segment base. When no kernarg segment pointer was preloaded (a
// kernel with no arguments), the offset itself is returned as a constant.
// NOTE(review): the doxygen extraction dropped local declarations at orig.
// lines 2250-2252 and 2262 (presumably MF/Info/PtrVT and MRI) — PtrVT and
// MRI are used below but their definitions are not visible here.
2245SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2246 const SDLoc &SL,
2247 SDValue Chain,
2248 uint64_t Offset) const {
2249 const DataLayout &DL = DAG.getDataLayout();
2253
2254 auto [InputPtrReg, RC, ArgTy] =
2255 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2256
2257 // We may not have the kernarg segment argument if we have no kernel
2258 // arguments.
2259 if (!InputPtrReg)
2260 return DAG.getConstant(Offset, SL, PtrVT);
2261
2263 SDValue BasePtr = DAG.getCopyFromReg(
2264 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2265
2266 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2267}
2268
// Returns a pointer to the implicit-argument area, expressed as a kernarg
// parameter pointer at some Offset.
// NOTE(review): the doxygen extraction dropped orig. lines 2271-2272, where
// Offset is presumably computed from the explicit kernarg size — confirm
// against upstream before editing.
2269SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2270 const SDLoc &SL) const {
2273 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2274}
2275
// Materialize the LDS kernel id as an i32 constant when its size is known at
// compile time; otherwise return an empty SDValue.
// NOTE(review): the doxygen extraction dropped orig. lines 2279 and 2281 (the
// initializer producing KnownSize) — confirm against upstream before editing.
2276SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2277 const SDLoc &SL) const {
2278
2280 std::optional<uint32_t> KnownSize =
2282 if (KnownSize.has_value())
2283 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2284 return SDValue();
2285}
2286
// Convert a loaded argument value of memory type MemVT to the expected value
// type VT: narrow widened vectors, apply Assert[SZ]ext when the ABI promised
// an extension, then fp-round/extend or integer extend/truncate as needed.
// NOTE(review): the doxygen extraction dropped orig. lines 2293 and 2295-2296
// (the rest of the vector-narrowing condition and NarrowedVT's initializer)
// — confirm against upstream before editing.
2287SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2288 const SDLoc &SL, SDValue Val,
2289 bool Signed,
2290 const ISD::InputArg *Arg) const {
2291 // First, if it is a widened vector, narrow it.
2292 if (VT.isVector() &&
2294 EVT NarrowedVT =
2297 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2298 DAG.getConstant(0, SL, MVT::i32));
2299 }
2300
2301 // Then convert the vector elements or scalar value.
2302 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2303 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2304 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2305 }
2306
2307 if (MemVT.isFloatingPoint())
2308 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2309 else if (Signed)
2310 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2311 else
2312 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2313
2314 return Val;
2315}
2316
// Load one kernel argument from the kernarg segment. Sub-dword underaligned
// arguments are loaded as an aligned dword at alignDown(Offset, 4) and the
// relevant bits shifted out, so the load can merge with a neighbor; everything
// else is a plain load at Offset. Returns {converted value, load chain}.
// NOTE(review): the doxygen extraction dropped orig. lines 2323, 2340-2341 and
// 2356 (PtrInfo's initializer and the MachineMemOperand flag arguments to the
// two getLoad calls) — confirm against upstream before editing.
2317SDValue SITargetLowering::lowerKernargMemParameter(
2318 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2319 uint64_t Offset, Align Alignment, bool Signed,
2320 const ISD::InputArg *Arg) const {
2321
2322 MachinePointerInfo PtrInfo =
2324
2325 // Try to avoid using an extload by loading earlier than the argument address,
2326 // and extracting the relevant bits. The load should hopefully be merged with
2327 // the previous argument.
2328 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2329 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2330 int64_t AlignDownOffset = alignDown(Offset, 4);
2331 int64_t OffsetDiff = Offset - AlignDownOffset;
2332
2333 EVT IntVT = MemVT.changeTypeToInteger();
2334
2335 // TODO: If we passed in the base kernel offset we could have a better
2336 // alignment than 4, but we don't really need it.
2337 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2338 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2339 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2342
2343 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2344 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2345
2346 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2347 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2348 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2349
2350 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2351 }
2352
2353 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2354 SDValue Load = DAG.getLoad(
2355 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2357
2358 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2359 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2360}
2361
2362/// Coerce an argument which was passed in a different ABI type to the original
2363/// expected value type.
2364SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2365 SDValue Val,
2366 CCValAssign &VA,
2367 const SDLoc &SL) const {
2368 EVT ValVT = VA.getValVT();
2369
2370 // If this is an 8 or 16-bit value, it is really passed promoted
2371 // to 32 bits. Insert an assert[sz]ext to capture this, then
2372 // truncate to the right size.
2373 switch (VA.getLocInfo()) {
2374 case CCValAssign::Full:
2375 return Val;
2376 case CCValAssign::BCvt:
2377 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2378 case CCValAssign::SExt:
2379 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2380 DAG.getValueType(ValVT));
2381 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2382 case CCValAssign::ZExt:
2383 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2384 DAG.getValueType(ValVT));
2385 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2386 case CCValAssign::AExt:
2387 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2388 default:
2389 llvm_unreachable("Unknown loc info!");
2390 }
2391}
2392
// Lower an argument passed on the stack: byval arguments become a fixed frame
// index; everything else becomes a fixed stack object plus an (ext)load whose
// extension kind mirrors the CCValAssign loc info, then is coerced back to
// the expected value type.
// NOTE(review): the doxygen extraction dropped orig. lines 2415 (presumably
// `ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;` — ExtType is assigned in the
// switch below but its declaration is not visible) and 2437 (the
// MachinePointerInfo/MemVT arguments to getExtLoad). Confirm against upstream.
2393SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2394 CCValAssign &VA, const SDLoc &SL,
2395 SDValue Chain,
2396 const ISD::InputArg &Arg) const {
2397 MachineFunction &MF = DAG.getMachineFunction();
2398 MachineFrameInfo &MFI = MF.getFrameInfo();
2399
2400 if (Arg.Flags.isByVal()) {
2401 unsigned Size = Arg.Flags.getByValSize();
2402 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2403 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2404 }
2405
2406 unsigned ArgOffset = VA.getLocMemOffset();
2407 unsigned ArgSize = VA.getValVT().getStoreSize();
2408
2409 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2410
2411 // Create load nodes to retrieve arguments from the stack.
2412 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2413
2414 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2416 MVT MemVT = VA.getValVT();
2417
2418 switch (VA.getLocInfo()) {
2419 default:
2420 break;
2421 case CCValAssign::BCvt:
2422 MemVT = VA.getLocVT();
2423 break;
2424 case CCValAssign::SExt:
2425 ExtType = ISD::SEXTLOAD;
2426 break;
2427 case CCValAssign::ZExt:
2428 ExtType = ISD::ZEXTLOAD;
2429 break;
2430 case CCValAssign::AExt:
2431 ExtType = ISD::EXTLOAD;
2432 break;
2433 }
2434
2435 SDValue ArgValue = DAG.getExtLoad(
2436 ExtType, SL, VA.getLocVT(), Chain, FIN,
2438
2439 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2440 if (ConvertedVal == ArgValue)
2441 return ConvertedVal;
2442
2443 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2444}
2445
// Compute a workgroup id component. Without clusters this is just the
// preloaded value; with clusters the global id is reconstructed as
// ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId, and when the cluster
// dimensionality is only known at run time, a SELECT_CC on the ClusterId
// hardware register field picks between the two forms.
// NOTE(review): the doxygen extraction dropped orig. lines 2448-2449 (two
// PreloadedValue parameters) and the case labels at 2472-2473, 2475 and 2477
// of the getClusterDims().getKind() switch — confirm against upstream.
2446SDValue SITargetLowering::lowerWorkGroupId(
2447 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2450 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2451 if (!Subtarget->hasClusters())
2452 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2453
2454 // Clusters are supported. Return the global position in the grid. If clusters
2455 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2456
2457 // WorkGroupIdXYZ = ClusterId == 0 ?
2458 // ClusterIdXYZ :
2459 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2460 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2461 SDLoc SL(ClusterIdXYZ);
2462 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2463 SDValue One = DAG.getConstant(1, SL, VT);
2464 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2465 SDValue ClusterWorkGroupIdXYZ =
2466 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2467 SDValue GlobalIdXYZ =
2468 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2469 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2470
2471 switch (MFI.getClusterDims().getKind()) {
2474 return GlobalIdXYZ;
2476 return ClusterIdXYZ;
2478 using namespace AMDGPU::Hwreg;
2479 SDValue ClusterIdField =
2480 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2481 SDNode *GetReg =
2482 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2483 SDValue ClusterId(GetReg, 0);
2484 SDValue Zero = DAG.getConstant(0, SL, VT);
2485 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2486 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2487 }
2488 }
2489
2490 llvm_unreachable("nothing should reach here");
2491}
2492
// Materialize a preloaded special value (workgroup ids, cluster workgroup
// ids/max ids, etc.). With architected SGPRs the values live in fixed TTMP
// registers with per-field bit masks; fixed cluster dims let some queries fold
// to compile-time constants. Otherwise the register is looked up in MFI; a
// missing register yields 0 for the kernarg-segment case and poison when the
// corresponding amdgpu-no-* attribute promised the intrinsic is unused.
// NOTE(review): the doxygen extraction dropped many hyperlinked lines here:
// the PVID parameter (orig. 2495), the calling-convention init (2500), the
// second half of the hasArchitectedSGPRs() condition (2531), every `case`
// label of the PVID switch (2536, 2541, 2546, 2551, 2558, 2565, 2572, 2579,
// 2586, 2593) and the condition at 2606. Confirm against upstream before
// editing.
2493SDValue SITargetLowering::getPreloadedValue(
2494 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2496 const ArgDescriptor *Reg = nullptr;
2497 const TargetRegisterClass *RC;
2498 LLT Ty;
2499
2501 const ArgDescriptor WorkGroupIDX =
2502 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2503 // If GridZ is not programmed in an entry function then the hardware will set
2504 // it to all zeros, so there is no need to mask the GridY value in the low
2505 // order bits.
2506 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2507 AMDGPU::TTMP7,
2508 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2509 const ArgDescriptor WorkGroupIDZ =
2510 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2511 const ArgDescriptor ClusterWorkGroupIDX =
2512 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2513 const ArgDescriptor ClusterWorkGroupIDY =
2514 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2515 const ArgDescriptor ClusterWorkGroupIDZ =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2517 const ArgDescriptor ClusterWorkGroupMaxIDX =
2518 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2519 const ArgDescriptor ClusterWorkGroupMaxIDY =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2521 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2523 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2525
2526 auto LoadConstant = [&](unsigned N) {
2527 return DAG.getConstant(N, SDLoc(), VT);
2528 };
2529
2530 if (Subtarget->hasArchitectedSGPRs() &&
2532 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2533 bool HasFixedDims = ClusterDims.isFixedDims();
2534
2535 switch (PVID) {
2537 Reg = &WorkGroupIDX;
2538 RC = &AMDGPU::SReg_32RegClass;
2539 Ty = LLT::scalar(32);
2540 break;
2542 Reg = &WorkGroupIDY;
2543 RC = &AMDGPU::SReg_32RegClass;
2544 Ty = LLT::scalar(32);
2545 break;
2547 Reg = &WorkGroupIDZ;
2548 RC = &AMDGPU::SReg_32RegClass;
2549 Ty = LLT::scalar(32);
2550 break;
2552 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2553 return LoadConstant(0);
2554 Reg = &ClusterWorkGroupIDX;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDY;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDZ;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims)
2574 return LoadConstant(ClusterDims.getDims()[0] - 1);
2575 Reg = &ClusterWorkGroupMaxIDX;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[1] - 1);
2582 Reg = &ClusterWorkGroupMaxIDY;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[2] - 1);
2589 Reg = &ClusterWorkGroupMaxIDZ;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 Reg = &ClusterWorkGroupMaxFlatID;
2595 RC = &AMDGPU::SReg_32RegClass;
2596 Ty = LLT::scalar(32);
2597 break;
2598 default:
2599 break;
2600 }
2601 }
2602
2603 if (!Reg)
2604 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2605 if (!Reg) {
2607 // It's possible for a kernarg intrinsic call to appear in a kernel with
2608 // no allocated segment, in which case we do not add the user sgpr
2609 // argument, so just return null.
2610 return DAG.getConstant(0, SDLoc(), VT);
2611 }
2612
2613 // It's undefined behavior if a function marked with the amdgpu-no-*
2614 // attributes uses the corresponding intrinsic.
2615 return DAG.getPOISON(VT);
2616 }
2617
2618 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2619}
2620
// Split incoming PS arguments: records which PS inputs are allocated/enabled
// and skips unused, unallocated PS inputs entirely (setting their original
// argument index in Skipped); all kept parts are appended to Splits.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2621) and line 2625 (presumably the SIMachineFunctionInfo *Info parameter
// and opening brace) — confirm against upstream before editing.
2622 CallingConv::ID CallConv,
2623 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2624 FunctionType *FType,
2626 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2627 const ISD::InputArg *Arg = &Ins[I];
2628
2629 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2630 "vector type argument should have been split");
2631
2632 // First check if it's a PS input addr.
2633 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2634 PSInputNum <= 15) {
2635 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2636
2637 // Inconveniently only the first part of the split is marked as isSplit,
2638 // so skip to the end. We only want to increment PSInputNum once for the
2639 // entire split argument.
2640 if (Arg->Flags.isSplit()) {
2641 while (!Arg->Flags.isSplitEnd()) {
2642 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2643 "unexpected vector split in ps argument type");
2644 if (!SkipArg)
2645 Splits.push_back(*Arg);
2646 Arg = &Ins[++I];
2647 }
2648 }
2649
2650 if (SkipArg) {
2651 // We can safely skip PS inputs.
2652 Skipped.set(Arg->getOrigArgIndex());
2653 ++PSInputNum;
2654 continue;
2655 }
2656
2657 Info->markPSInputAllocated(PSInputNum);
2658 if (Arg->Used)
2659 Info->markPSInputEnabled(PSInputNum);
2660
2661 ++PSInputNum;
2662 }
2663
2664 Splits.push_back(*Arg);
2665 }
2666}
2667
// Reserve the fixed entry-point workitem-id VGPRs (VGPR0..VGPR2). With packed
// TID, Y and Z share VGPR0 via 10-bit bitfield masks at bits 10 and 20.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2669) and line 2673 (presumably `MachineRegisterInfo &MRI = ...;` — MRI is
// used below but its declaration is not visible). Confirm against upstream.
2668// Allocate special inputs passed in VGPRs.
2670 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2671 SIMachineFunctionInfo &Info) const {
2672 const LLT S32 = LLT::scalar(32);
2674
2675 if (Info.hasWorkItemIDX()) {
2676 Register Reg = AMDGPU::VGPR0;
2677 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2678
2679 CCInfo.AllocateReg(Reg);
2680 unsigned Mask =
2681 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2682 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2683 }
2684
2685 if (Info.hasWorkItemIDY()) {
2686 assert(Info.hasWorkItemIDX());
2687 if (Subtarget->hasPackedTID()) {
2688 Info.setWorkItemIDY(
2689 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2690 } else {
2691 unsigned Reg = AMDGPU::VGPR1;
2692 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2693
2694 CCInfo.AllocateReg(Reg);
2695 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2696 }
2697 }
2698
2699 if (Info.hasWorkItemIDZ()) {
2700 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2701 if (Subtarget->hasPackedTID()) {
2702 Info.setWorkItemIDZ(
2703 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2704 } else {
2705 unsigned Reg = AMDGPU::VGPR2;
2706 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2707
2708 CCInfo.AllocateReg(Reg);
2709 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2710 }
2711 }
2712}
2713
2714// Try to allocate a VGPR at the end of the argument list, or if no argument
2715// VGPRs are left allocating a stack slot.
2716// If \p Mask is given it indicates bitfield position in the register.
2717// If \p Arg is given use it with the new \p Mask instead of allocating new.
2718static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2719 ArgDescriptor Arg = ArgDescriptor()) {
2720 if (Arg.isSet())
2721 return ArgDescriptor::createArg(Arg, Mask);
2722
// Only the first 32 argument VGPRs are eligible; once exhausted, fall back to
// a 4-byte stack slot.
2723 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2724 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2725 if (RegIdx == ArgVGPRs.size()) {
2726 // Spill to stack required.
2727 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2728
2729 return ArgDescriptor::createStack(Offset, Mask);
2730 }
2731
2732 unsigned Reg = ArgVGPRs[RegIdx];
2733 Reg = CCInfo.AllocateReg(Reg);
2734 assert(Reg != AMDGPU::NoRegister);
2735
// Record the live-in and give its virtual register an s32 type for GlobalISel.
2736 MachineFunction &MF = CCInfo.getMachineFunction();
2737 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2738 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2739 return ArgDescriptor::createRegister(Reg, Mask);
2740}
2741
// Allocate the next free SGPR from RC for a special input, aborting when the
// candidate window is exhausted.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2742) and the return statement at line 2756 (presumably returning an
// ArgDescriptor for Reg). Also note the NumArgRegs parameter is not
// referenced in the visible body — the candidate window is hard-coded to 32.
// Confirm against upstream before editing.
2743 const TargetRegisterClass *RC,
2744 unsigned NumArgRegs) {
2745 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2746 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2747 if (RegIdx == ArgSGPRs.size())
2748 report_fatal_error("ran out of SGPRs for arguments");
2749
2750 unsigned Reg = ArgSGPRs[RegIdx];
2751 Reg = CCInfo.AllocateReg(Reg);
2752 assert(Reg != AMDGPU::NoRegister);
2753
2754 MachineFunction &MF = CCInfo.getMachineFunction();
2755 MF.addLiveIn(Reg, RC);
2757}
2758
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2762) — confirm against upstream before editing.
2759// If this has a fixed position, we still should allocate the register in the
2760// CCInfo state. Technically we could get away with this for values passed
2761// outside of the normal argument range.
2763 const TargetRegisterClass *RC,
2764 MCRegister Reg) {
2765 Reg = CCInfo.AllocateReg(Reg);
2766 assert(Reg != AMDGPU::NoRegister);
2767 MachineFunction &MF = CCInfo.getMachineFunction();
2768 MF.addLiveIn(Reg, RC);
2769}
2770
2771static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2772 if (Arg) {
2773 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2774 Arg.getRegister());
2775 } else
2776 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2777}
2778
2779static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2780 if (Arg) {
2781 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2782 Arg.getRegister());
2783 } else
2784 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2785}
2786
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2789) — confirm against upstream before editing. The Y and Z ids reuse the
// previously allocated Arg register with shifted 10-bit masks.
2787/// Allocate implicit function VGPR arguments at the end of allocated user
2788/// arguments.
2790 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2791 SIMachineFunctionInfo &Info) const {
2792 const unsigned Mask = 0x3ff;
2793 ArgDescriptor Arg;
2794
2795 if (Info.hasWorkItemIDX()) {
2796 Arg = allocateVGPR32Input(CCInfo, Mask);
2797 Info.setWorkItemIDX(Arg);
2798 }
2799
2800 if (Info.hasWorkItemIDY()) {
2801 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2802 Info.setWorkItemIDY(Arg);
2803 }
2804
2805 if (Info.hasWorkItemIDZ())
2806 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2807}
2808
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2810) — confirm against upstream before editing. All three workitem ids are
// packed into VGPR31 as 10-bit fields at bits 0, 10 and 20.
2809/// Allocate implicit function VGPR arguments in fixed registers.
2811 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2812 SIMachineFunctionInfo &Info) const {
2813 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2814 if (!Reg)
2815 report_fatal_error("failed to allocate VGPR for implicit arguments");
2816
2817 const unsigned Mask = 0x3ff;
2818 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2819 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2820 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2821}
2822
// Allocate the SGPR special inputs for a (non-entry) function: 64-bit
// pointers (dispatch ptr, queue ptr, implicit arg ptr, dispatch id) and
// 32-bit workgroup ids / LDS kernel id.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2823) — confirm against upstream before editing.
2824 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2825 SIMachineFunctionInfo &Info) const {
2826 auto &ArgInfo = Info.getArgInfo();
2827 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2828
2829 // TODO: Unify handling with private memory pointers.
2830 if (UserSGPRInfo.hasDispatchPtr())
2831 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2832
2833 if (UserSGPRInfo.hasQueuePtr())
2834 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2835
2836 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2837 // constant offset from the kernarg segment.
2838 if (Info.hasImplicitArgPtr())
2839 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2840
2841 if (UserSGPRInfo.hasDispatchID())
2842 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2843
2844 // flat_scratch_init is not applicable for non-kernel functions.
2845
2846 if (Info.hasWorkGroupIDX())
2847 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2848
2849 if (Info.hasWorkGroupIDY())
2850 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2851
2852 if (Info.hasWorkGroupIDZ())
2853 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2854
2855 if (Info.hasLDSKernelId())
2856 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2857}
2858
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 2860) and line 2891 (presumably `MachineRegisterInfo &MRI = ...;` — MRI is
// used in the kernarg-segment case but not declared in the visible body).
// Reserves each enabled user SGPR input, in their fixed user-SGPR order, as
// function live-ins. Confirm against upstream before editing.
2859// Allocate special inputs passed in user SGPRs.
2861 MachineFunction &MF,
2862 const SIRegisterInfo &TRI,
2863 SIMachineFunctionInfo &Info) const {
2864 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2865 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2866 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2867 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2868 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2869 }
2870
2871 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2872 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2873 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2874 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2875 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2876 }
2877
2878 if (UserSGPRInfo.hasDispatchPtr()) {
2879 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2880 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2881 CCInfo.AllocateReg(DispatchPtrReg);
2882 }
2883
2884 if (UserSGPRInfo.hasQueuePtr()) {
2885 Register QueuePtrReg = Info.addQueuePtr(TRI);
2886 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2887 CCInfo.AllocateReg(QueuePtrReg);
2888 }
2889
2890 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2892 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2893 CCInfo.AllocateReg(InputPtrReg);
2894
2895 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2896 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2897 }
2898
2899 if (UserSGPRInfo.hasDispatchID()) {
2900 Register DispatchIDReg = Info.addDispatchID(TRI);
2901 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2902 CCInfo.AllocateReg(DispatchIDReg);
2903 }
2904
2905 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2906 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2907 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2908 CCInfo.AllocateReg(FlatScratchInitReg);
2909 }
2910
2911 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2912 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2913 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2914 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2915 }
2916
2917 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2918 // these from the dispatch pointer.
2919}
2920
// NOTE(review): the doxygen extraction dropped parts of the signature (orig.
// lines 2923 and 2925) — confirm against upstream before editing.
2921// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2922// sequential starting from the first argument.
2924 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2926 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2927 Function &F = MF.getFunction();
2928 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2929 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2930 bool InPreloadSequence = true;
2931 unsigned InIdx = 0;
2932 bool AlignedForImplictArgs = false;
2933 unsigned ImplicitArgOffset = 0;
2934 for (auto &Arg : F.args()) {
2935 if (!InPreloadSequence || !Arg.hasInRegAttr())
2936 break;
2937
2938 unsigned ArgIdx = Arg.getArgNo();
2939 // Don't preload non-original args or parts not in the current preload
2940 // sequence.
2941 if (InIdx < Ins.size() &&
2942 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2943 break;
2944
2945 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2946 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2947 InIdx++) {
2948 assert(ArgLocs[ArgIdx].isMemLoc());
2949 auto &ArgLoc = ArgLocs[InIdx];
2950 const Align KernelArgBaseAlign = Align(16);
2951 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2952 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2953 unsigned NumAllocSGPRs =
2954 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2955
2956 // Fix alignment for hidden arguments.
2957 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2958 if (!AlignedForImplictArgs) {
2959 ImplicitArgOffset =
2960 alignTo(LastExplicitArgOffset,
2961 Subtarget->getAlignmentForImplicitArgPtr()) -
2962 LastExplicitArgOffset;
2963 AlignedForImplictArgs = true;
2964 }
2965 ArgOffset += ImplicitArgOffset;
2966 }
2967
2968 // Arg is preloaded into the previous SGPR.
2969 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2970 assert(InIdx >= 1 && "No previous SGPR");
2971 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2972 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2973 continue;
2974 }
2975
2976 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2977 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2978 // Check for free user SGPRs for preloading.
2979 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2980 InPreloadSequence = false;
2981 break;
2982 }
2983
2984 // Preload this argument.
2985 const TargetRegisterClass *RC =
2986 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2987 SmallVectorImpl<MCRegister> *PreloadRegs =
2988 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2989
2990 if (PreloadRegs->size() > 1)
2991 RC = &AMDGPU::SGPR_32RegClass;
2992 for (auto &Reg : *PreloadRegs) {
2993 assert(Reg);
2994 MF.addLiveIn(Reg, RC);
2995 CCInfo.AllocateReg(Reg);
2996 }
2997
2998 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2999 }
3000 }
3001}
3002
// Reserve the synthetic LDS-kernel-id SGPR input (allocated after all other
// preloads) as a function live-in.
// NOTE(review): the doxygen extraction dropped the signature head (orig. line
// 3003) — confirm against upstream before editing.
3004 const SIRegisterInfo &TRI,
3005 SIMachineFunctionInfo &Info) const {
3006 // Always allocate this last since it is a synthetic preload.
3007 if (Info.hasLDSKernelId()) {
3008 Register Reg = Info.addLDSKernelId();
3009 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3010 CCInfo.AllocateReg(Reg);
3011 }
3012}
3013
// NOTE(review): the doxygen extraction dropped the signature head (orig.
// lines 3015-3016) — confirm against upstream before editing.
3014// Allocate special input registers that are initialized per-wave.
3017 CallingConv::ID CallConv,
3018 bool IsShader) const {
3019 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3020 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3021 // Note: user SGPRs are handled by the front-end for graphics shaders
3022 // Pad up the used user SGPRs with dead inputs.
3023
3024 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3025 // before enabling architected SGPRs for workgroup IDs.
3026 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3027
3028 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3029 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3030 // rely on it to reach 16 since if we end up having no stack usage, it will
3031 // not really be added.
3032 unsigned NumRequiredSystemSGPRs =
3033 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3034 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3035 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3036 Register Reg = Info.addReservedUserSGPR();
3037 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3038 CCInfo.AllocateReg(Reg);
3039 }
3040 }
3041
3042 if (!HasArchitectedSGPRs) {
3043 if (Info.hasWorkGroupIDX()) {
3044 Register Reg = Info.addWorkGroupIDX();
3045 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3046 CCInfo.AllocateReg(Reg);
3047 }
3048
3049 if (Info.hasWorkGroupIDY()) {
3050 Register Reg = Info.addWorkGroupIDY();
3051 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(Reg);
3053 }
3054
3055 if (Info.hasWorkGroupIDZ()) {
3056 Register Reg = Info.addWorkGroupIDZ();
3057 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3058 CCInfo.AllocateReg(Reg);
3059 }
3060 }
3061
3062 if (Info.hasWorkGroupInfo()) {
3063 Register Reg = Info.addWorkGroupInfo();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067
3068 if (Info.hasPrivateSegmentWaveByteOffset()) {
3069 // Scratch wave offset passed in system SGPR.
3070 unsigned PrivateSegmentWaveByteOffsetReg;
3071
3072 if (IsShader) {
3073 PrivateSegmentWaveByteOffsetReg =
3074 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3075
3076 // This is true if the scratch wave byte offset doesn't have a fixed
3077 // location.
3078 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3079 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3080 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3081 }
3082 } else
3083 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3084
3085 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3086 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3087 }
3088
3089 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3090 Info.getNumPreloadedSGPRs() >= 16);
3091}
3092
// Reserve the registers an entry function needs for private (scratch) memory
// access: the scratch resource descriptor (when flat scratch is disabled),
// the stack pointer, and — when a frame pointer is required — SGPR33.
// NOTE(review): the signature line and a few interior lines of this
// definition are missing from this view (presumably the function name and
// some declarations, e.g. the MRI local used below); visible code is kept
// byte-identical. Confirm against the full source before relying on it.
3094 MachineFunction &MF,
3095 const SIRegisterInfo &TRI,
3097 // Now that we've figured out where the scratch register inputs are, see if
3098 // should reserve the arguments and use them directly.
3099 MachineFrameInfo &MFI = MF.getFrameInfo();
3100 bool HasStackObjects = MFI.hasStackObjects();
3101 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3102
3103 // Record that we know we have non-spill stack objects so we don't need to
3104 // check all stack objects later.
3105 if (HasStackObjects)
3106 Info.setHasNonSpillStackObjects(true);
3107
3108 // Everything live out of a block is spilled with fast regalloc, so it's
3109 // almost certain that spilling will be required.
// NOTE(review): the condition guarding this assignment (line 3110) is
// missing from this view — presumably an optimization-level check.
3111 HasStackObjects = true;
3112
3113 // For now assume stack access is needed in any callee functions, so we need
3114 // the scratch registers to pass in.
3115 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3116
3117 if (!ST.enableFlatScratch()) {
3118 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3119 // If we have stack objects, we unquestionably need the private buffer
3120 // resource. For the Code Object V2 ABI, this will be the first 4 user
3121 // SGPR inputs. We can reserve those and use them directly.
3122
3123 Register PrivateSegmentBufferReg =
3125 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3126 } else {
3127 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3128 // We tentatively reserve the last registers (skipping the last registers
3129 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3130 // we'll replace these with the ones immediately after those which were
3131 // really allocated. In the prologue copies will be inserted from the
3132 // argument to these reserved registers.
3133
3134 // Without HSA, relocations are used for the scratch pointer and the
3135 // buffer resource setup is always inserted in the prologue. Scratch wave
3136 // offset is still in an input SGPR.
3137 Info.setScratchRSrcReg(ReservedBufferReg);
3138 }
3139 }
3140
3142
3143 // For entry functions we have to set up the stack pointer if we use it,
3144 // whereas non-entry functions get this "for free". This means there is no
3145 // intrinsic advantage to using S32 over S34 in cases where we do not have
3146 // calls but do need a frame pointer (i.e. if we are requested to have one
3147 // because frame pointer elimination is disabled). To keep things simple we
3148 // only ever use S32 as the call ABI stack pointer, and so using it does not
3149 // imply we need a separate frame pointer.
3150 //
3151 // Try to use s32 as the SP, but move it if it would interfere with input
3152 // arguments. This won't work with calls though.
3153 //
3154 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3155 // registers.
3156 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3157 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3158 } else {
// SGPR32 is already an input; fall back to the first SGPR that is not a
// live-in. This cannot work when there are calls (fixed call ABI SP).
3160
3161 if (MFI.hasCalls())
3162 report_fatal_error("call in graphics shader with too many input SGPRs");
3163
3164 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3165 if (!MRI.isLiveIn(Reg)) {
3166 Info.setStackPtrOffsetReg(Reg);
3167 break;
3168 }
3169 }
3170
3171 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3172 report_fatal_error("failed to find register for SP");
3173 }
3174
3175 // hasFP should be accurate for entry functions even before the frame is
3176 // finalized, because it does not rely on the known stack size, only
3177 // properties like whether variable sized objects are present.
3178 if (ST.getFrameLowering()->hasFP(MF)) {
3179 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3180 }
3181}
3182
// Split CSR (callee-saved register via copy) handling is supported for any
// non-entry function; entry functions (kernels/shaders) are not callable and
// have no callee-saved registers to split.
// NOTE(review): the signature line of this definition is missing from this
// view — presumably supportSplitCSR(MachineFunction *); confirm.
3185 return !Info->isEntryFunction();
3186}
3187
3189
// Implement split-CSR lowering: in the entry block, copy each
// callee-saved-via-copy physical register into a fresh virtual register,
// and before every exit block's terminator copy it back. This lets the
// register allocator shrink-wrap the saves/restores.
// NOTE(review): the opening line of this definition (function name) is
// missing from this view; visible code is kept byte-identical.
3191 MachineBasicBlock *Entry,
3192 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3194
3195 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3196 if (!IStart)
3197 return;
3198
3199 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3200 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3201 MachineBasicBlock::iterator MBBI = Entry->begin();
// The list is null-terminated; only 64- and 32-bit SGPR classes are
// expected for CSRs handled via copy.
3202 for (const MCPhysReg *I = IStart; *I; ++I) {
3203 const TargetRegisterClass *RC = nullptr;
3204 if (AMDGPU::SReg_64RegClass.contains(*I))
3205 RC = &AMDGPU::SGPR_64RegClass;
3206 else if (AMDGPU::SReg_32RegClass.contains(*I))
3207 RC = &AMDGPU::SGPR_32RegClass;
3208 else
3209 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3210
3211 Register NewVR = MRI->createVirtualRegister(RC);
3212 // Create copy from CSR to a virtual register.
3213 Entry->addLiveIn(*I);
3214 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3215 .addReg(*I);
3216
3217 // Insert the copy-back instructions right before the terminator.
3218 for (auto *Exit : Exits)
3219 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3220 TII->get(TargetOpcode::COPY), *I)
3221 .addReg(NewVR);
3222 }
3223}
3224
// Lower the incoming (formal) arguments of a function for SelectionDAG ISel.
// Kernel arguments are loaded from the kernarg segment (or taken from
// preloaded user SGPRs); shader and callable-function arguments are assigned
// to registers or stack locations by the calling-convention analysis.
// Also allocates the special entry/system SGPR and VGPR inputs.
// NOTE(review): this scraped view is missing several interior lines
// (including the function signature at line 3225 and various local
// declarations, e.g. Splits/ArgLocs/Chains/TRI/MRI); the visible code is
// kept byte-identical. Confirm details against the full source.
3226 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3227 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3228 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3230
3232 const Function &Fn = MF.getFunction();
3235 bool IsError = false;
3236
3237 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3239 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3240 IsError = true;
3241 }
3242
3245 BitVector Skipped(Ins.size());
3246 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3247 *DAG.getContext());
3248
3249 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3250 bool IsKernel = AMDGPU::isKernel(CallConv);
3251 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3252
// Sanity-check that graphics calling conventions never carry the compute-
// only special inputs (dispatch/kernarg pointers, workitem IDs, ...).
3253 if (IsGraphics) {
3254 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3255 assert(!UserSGPRInfo.hasDispatchPtr() &&
3256 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3257 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3258 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3259 (void)UserSGPRInfo;
3260 if (!Subtarget->enableFlatScratch())
3261 assert(!UserSGPRInfo.hasFlatScratchInit());
3262 if ((CallConv != CallingConv::AMDGPU_CS &&
3263 CallConv != CallingConv::AMDGPU_Gfx &&
3264 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3265 !Subtarget->hasArchitectedSGPRs())
3266 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3267 !Info->hasWorkGroupIDZ());
3268 }
3269
3270 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3271
3272 if (CallConv == CallingConv::AMDGPU_PS) {
3273 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3274
3275 // At least one interpolation mode must be enabled or else the GPU will
3276 // hang.
3277 //
3278 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3279 // set PSInputAddr, the user wants to enable some bits after the compilation
3280 // based on run-time states. Since we can't know what the final PSInputEna
3281 // will look like, so we shouldn't do anything here and the user should take
3282 // responsibility for the correct programming.
3283 //
3284 // Otherwise, the following restrictions apply:
3285 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3286 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3287 // enabled too.
3288 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3289 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3290 CCInfo.AllocateReg(AMDGPU::VGPR0);
3291 CCInfo.AllocateReg(AMDGPU::VGPR1);
3292 Info->markPSInputAllocated(0);
3293 Info->markPSInputEnabled(0);
3294 }
3295 if (Subtarget->isAmdPalOS()) {
3296 // For isAmdPalOS, the user does not enable some bits after compilation
3297 // based on run-time states; the register values being generated here are
3298 // the final ones set in hardware. Therefore we need to apply the
3299 // workaround to PSInputAddr and PSInputEnable together. (The case where
3300 // a bit is set in PSInputAddr but not PSInputEnable is where the
3301 // frontend set up an input arg for a particular interpolation mode, but
3302 // nothing uses that input arg. Really we should have an earlier pass
3303 // that removes such an arg.)
3304 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3305 if ((PsInputBits & 0x7F) == 0 ||
3306 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3307 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3308 }
3309 } else if (IsKernel) {
3310 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3311 } else {
// For whole-wave functions the first input is synthetic (produced by
// WHOLE_WAVE_SETUP below), so it is excluded from the CC analysis.
3312 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3313 Ins.end());
3314 }
3315
3316 if (IsKernel)
3317 analyzeFormalArgumentsCompute(CCInfo, Ins);
3318
// Allocate the special implicit inputs (workitem IDs, user/system SGPRs)
// before analyzing the explicit arguments.
3319 if (IsEntryFunc) {
3320 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3321 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3322 if (IsKernel && Subtarget->hasKernargPreload())
3323 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3324
3325 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3326 } else if (!IsGraphics) {
3327 // For the fixed ABI, pass workitem IDs in the last argument register.
3328 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3329
3330 // FIXME: Sink this into allocateSpecialInputSGPRs
3331 if (!Subtarget->enableFlatScratch())
3332 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3333
3334 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3335 }
3336
3337 if (!IsKernel) {
3338 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3339 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3340
3341 // This assumes the registers are allocated by CCInfo in ascending order
3342 // with no gaps.
3343 Info->setNumWaveDispatchSGPRs(
3344 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3345 Info->setNumWaveDispatchVGPRs(
3346 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3347 } else if (Info->getNumKernargPreloadedSGPRs()) {
3348 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3349 }
3350
3352
// Whole-wave functions receive an extra leading i1 value produced by the
// WHOLE_WAVE_SETUP node; it is prepended to the lowered argument values.
3353 if (IsWholeWaveFunc) {
3354 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3355 {MVT::i1, MVT::Other}, Chain);
3356 InVals.push_back(Setup.getValue(0));
3357 Chains.push_back(Setup.getValue(1));
3358 }
3359
3360 // FIXME: This is the minimum kernel argument alignment. We should improve
3361 // this to the maximum alignment of the arguments.
3362 //
3363 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3364 // kern arg offset.
3365 const Align KernelArgBaseAlign = Align(16);
3366
3367 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3368 ++i) {
3369 const ISD::InputArg &Arg = Ins[i];
3370 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3371 InVals.push_back(DAG.getPOISON(Arg.VT));
3372 continue;
3373 }
3374
3375 CCValAssign &VA = ArgLocs[ArgIdx++];
3376 MVT VT = VA.getLocVT();
3377
// Entry-function "memory" locations are kernarg segment offsets, not
// stack slots; load them from the kernarg segment pointer.
3378 if (IsEntryFunc && VA.isMemLoc()) {
3379 VT = Ins[i].VT;
3380 EVT MemVT = VA.getLocVT();
3381
3382 const uint64_t Offset = VA.getLocMemOffset();
3383 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3384
3385 if (Arg.Flags.isByRef()) {
3386 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3387
3388 const GCNTargetMachine &TM =
3389 static_cast<const GCNTargetMachine &>(getTargetMachine());
3390 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3391 Arg.Flags.getPointerAddrSpace())) {
3394 }
3395
3396 InVals.push_back(Ptr);
3397 continue;
3398 }
3399
3400 SDValue NewArg;
3401 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3402 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3403 // In this case the argument is packed into the previous preload SGPR.
3404 int64_t AlignDownOffset = alignDown(Offset, 4);
3405 int64_t OffsetDiff = Offset - AlignDownOffset;
3406 EVT IntVT = MemVT.changeTypeToInteger();
3407
3408 const SIMachineFunctionInfo *Info =
3411 Register Reg =
3412 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3413
3414 assert(Reg);
3415 Register VReg = MRI.getLiveInVirtReg(Reg);
3416 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3417
// Shift the packed sub-dword value down to bit 0, then truncate and
// bitcast to the in-memory type.
3418 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3419 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3420
3421 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3422 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3423 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3424 Ins[i].Flags.isSExt(), &Ins[i]);
3425
3426 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3427 } else {
3428 const SIMachineFunctionInfo *Info =
3431 const SmallVectorImpl<MCRegister> &PreloadRegs =
3432 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3433
3434 SDValue Copy;
3435 if (PreloadRegs.size() == 1) {
3436 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3437 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3438 NewArg = DAG.getCopyFromReg(
3439 Chain, DL, VReg,
3441 TRI->getRegSizeInBits(*RC)));
3442
3443 } else {
3444 // If the kernarg alignment does not match the alignment of the SGPR
3445 // tuple RC that can accommodate this argument, it will be built up
3446 // via copies from from the individual SGPRs that the argument was
3447 // preloaded to.
3449 for (auto Reg : PreloadRegs) {
3450 Register VReg = MRI.getLiveInVirtReg(Reg);
3451 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3452 Elts.push_back(Copy);
3453 }
3454 NewArg =
3455 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3456 PreloadRegs.size()),
3457 DL, Elts);
3458 }
3459
3460 // If the argument was preloaded to multiple consecutive 32-bit
3461 // registers because of misalignment between addressable SGPR tuples
3462 // and the argument size, we can still assume that because of kernarg
3463 // segment alignment restrictions that NewArg's size is the same as
3464 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3465 // truncate since we cannot preload to less than a single SGPR and the
3466 // MemVT may be smaller.
3467 EVT MemVTInt =
3469 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3470 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3471
3472 NewArg = DAG.getBitcast(MemVT, NewArg);
3473 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3474 Ins[i].Flags.isSExt(), &Ins[i]);
3475 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3476 }
3477 } else {
3478 // Hidden arguments that are in the kernel signature must be preloaded
3479 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3480 // the argument list and is not preloaded.
3481 if (Arg.isOrigArg()) {
3482 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3483 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3485 *OrigArg->getParent(),
3486 "hidden argument in kernel signature was not preloaded",
3487 DL.getDebugLoc()));
3488 }
3489 }
3490
3491 NewArg =
3492 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3493 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3494 }
3495 Chains.push_back(NewArg.getValue(1));
3496
3497 auto *ParamTy =
3498 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3499 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3500 ParamTy &&
3501 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3502 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3503 // On SI local pointers are just offsets into LDS, so they are always
3504 // less than 16-bits. On CI and newer they could potentially be
3505 // real pointers, so we can't guarantee their size.
3506 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3507 DAG.getValueType(MVT::i16));
3508 }
3509
3510 InVals.push_back(NewArg);
3511 continue;
3512 }
// Non-entry functions use real stack slots for memory-located arguments.
3513 if (!IsEntryFunc && VA.isMemLoc()) {
3514 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3515 InVals.push_back(Val);
3516 if (!Arg.Flags.isByVal())
3517 Chains.push_back(Val.getValue(1));
3518 continue;
3519 }
3520
3521 assert(VA.isRegLoc() && "Parameter must be in a register!");
3522
3523 Register Reg = VA.getLocReg();
3524 const TargetRegisterClass *RC = nullptr;
3525 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3526 RC = &AMDGPU::VGPR_32RegClass;
3527 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3528 RC = &AMDGPU::SGPR_32RegClass;
3529 else
3530 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3531
3532 Reg = MF.addLiveIn(Reg, RC);
3533 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3534
3535 if (Arg.Flags.isSRet()) {
3536 // The return object should be reasonably addressable.
3537
3538 // FIXME: This helps when the return is a real sret. If it is a
3539 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3540 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3541 unsigned NumBits =
3543 Val = DAG.getNode(
3544 ISD::AssertZext, DL, VT, Val,
3545 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3546 }
3547
3548 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3549 InVals.push_back(Val);
3550 }
3551
3552 // Start adding system SGPRs.
3553 if (IsEntryFunc)
3554 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3555
// Record this function's argument usage so callers can forward the special
// inputs; works under both the legacy pass manager and the new MFAM path.
3556 if (DAG.getPass()) {
3557 auto &ArgUsageInfo =
3559 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3560 } else if (auto *MFAM = DAG.getMFAM()) {
3561 Module &M = *MF.getFunction().getParent();
3562 auto *ArgUsageInfo =
3564 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3565 if (ArgUsageInfo)
3566 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3567 }
3568
3569 unsigned StackArgSize = CCInfo.getStackSize();
3570 Info->setBytesInStackArgArea(StackArgSize);
3571
3572 return Chains.empty() ? Chain
3573 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3574}
3575
3576// TODO: If return values can't fit in registers, we should return as many as
3577// possible in registers before passing on stack.
// Decide whether the return value can be lowered in registers, or whether
// the frontend-visible return must be demoted to an sret/stack return.
// NOTE(review): the signature line of this definition (3578) is missing
// from this view; visible code is kept byte-identical.
3579 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3580 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3581 const Type *RetTy) const {
3582 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3583 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3584 // for shaders. Vector types should be explicitly handled by CC.
3585 if (AMDGPU::isEntryFunctionCC(CallConv))
3586 return true;
3587
3589 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3590 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3591 return false;
3592
3593 // We must use the stack if return would require unavailable registers.
3594 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3595 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3596 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3597 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3598 return false;
3599
3600 return true;
3601}
3602
3603SDValue
// Lower an outgoing return: copy return values into the CC-assigned
// registers and emit the appropriate terminator node (ENDPGM for
// void-returning shaders, RETURN_TO_EPILOG / WHOLE_WAVE_RETURN / RET_GLUE
// otherwise). Kernels delegate to the AMDGPU base-class lowering.
// NOTE(review): the line carrying the function name (3604) and a few
// declarations (e.g. Info, TRI, RVLocs, RetOps) are missing from this view;
// visible code is kept byte-identical.
3605 bool isVarArg,
3607 const SmallVectorImpl<SDValue> &OutVals,
3608 const SDLoc &DL, SelectionDAG &DAG) const {
3612
3613 if (AMDGPU::isKernel(CallConv)) {
3614 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3615 OutVals, DL, DAG);
3616 }
3617
3618 bool IsShader = AMDGPU::isShader(CallConv);
3619
3620 Info->setIfReturnsVoid(Outs.empty());
3621 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3622
3623 // CCValAssign - represent the assignment of the return value to a location.
3625
3626 // CCState - Info about the registers and stack slots.
3627 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3628 *DAG.getContext());
3629
3630 // Analyze outgoing return values.
3631 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3632
3633 SDValue Glue;
3635 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3636
3637 SDValue ReadFirstLane =
3638 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3639 // Copy the result values into the output registers.
3640 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3641 ++I, ++RealRVLocIdx) {
3642 CCValAssign &VA = RVLocs[I];
3643 assert(VA.isRegLoc() && "Can only return in registers!");
3644 // TODO: Partially return in registers if return values don't fit.
3645 SDValue Arg = OutVals[RealRVLocIdx];
3646
3647 // Copied from other backends.
3648 switch (VA.getLocInfo()) {
3649 case CCValAssign::Full:
3650 break;
3651 case CCValAssign::BCvt:
3652 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3653 break;
3654 case CCValAssign::SExt:
3655 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3656 break;
3657 case CCValAssign::ZExt:
3658 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3659 break;
3660 case CCValAssign::AExt:
3661 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3662 break;
3663 default:
3664 llvm_unreachable("Unknown loc info!");
3665 }
// Values returned in SGPRs appear to be made wave-uniform via
// amdgcn_readfirstlane here — the node-creation line (3667) is missing
// from this view; confirm against the full source.
3666 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3668 ReadFirstLane, Arg);
3669 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3670 Glue = Chain.getValue(1);
3671 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3672 }
3673
3674 // FIXME: Does sret work properly?
// Non-entry functions also list the CSRs handled via copy as implicit
// return operands so they are treated as live across the return.
3675 if (!Info->isEntryFunction()) {
3676 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3677 const MCPhysReg *I =
3678 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3679 if (I) {
3680 for (; *I; ++I) {
3681 if (AMDGPU::SReg_64RegClass.contains(*I))
3682 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3683 else if (AMDGPU::SReg_32RegClass.contains(*I))
3684 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3685 else
3686 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3687 }
3688 }
3689 }
3690
3691 // Update chain and glue.
3692 RetOps[0] = Chain;
3693 if (Glue.getNode())
3694 RetOps.push_back(Glue);
3695
3696 unsigned Opc = AMDGPUISD::ENDPGM;
3697 if (!IsWaveEnd)
3698 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3699 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3700 : AMDGPUISD::RET_GLUE;
3701 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3702}
3703
// Lower the result values of a call back into SDValues: copy each
// CC-assigned physical register out (threading chain and glue) and undo any
// extension/bitcast promotion the calling convention applied.
// NOTE(review): the line carrying the function name (3704) and the RVLocs
// declaration are missing from this view; visible code is kept
// byte-identical.
3705 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3706 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3707 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3708 SDValue ThisVal) const {
3709 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3710
3711 // Assign locations to each value returned by this call.
3713 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3714 *DAG.getContext());
3715 CCInfo.AnalyzeCallResult(Ins, RetCC);
3716
3717 // Copy all of the result registers out of their specified physreg.
3718 for (CCValAssign VA : RVLocs) {
3719 SDValue Val;
3720
3721 if (VA.isRegLoc()) {
3722 Val =
3723 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3724 Chain = Val.getValue(1);
3725 InGlue = Val.getValue(2);
3726 } else if (VA.isMemLoc()) {
3727 report_fatal_error("TODO: return values in memory");
3728 } else
3729 llvm_unreachable("unknown argument location type");
3730
// Reverse the CC promotion: assert the known extension then truncate
// back to the value type (or just bitcast for BCvt).
3731 switch (VA.getLocInfo()) {
3732 case CCValAssign::Full:
3733 break;
3734 case CCValAssign::BCvt:
3735 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3736 break;
3737 case CCValAssign::ZExt:
3738 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3739 DAG.getValueType(VA.getValVT()));
3740 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3741 break;
3742 case CCValAssign::SExt:
3743 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3744 DAG.getValueType(VA.getValVT()));
3745 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3746 break;
3747 case CCValAssign::AExt:
3748 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3749 break;
3750 default:
3751 llvm_unreachable("Unknown loc info!");
3752 }
3753
3754 InVals.push_back(Val);
3755 }
3756
3757 return Chain;
3758}
3759
3760// Add code to pass special inputs required depending on used features separate
3761// from the explicit user arguments present in the IR.
// For each implicit ABI input the callee needs (dispatch ptr, queue ptr,
// workgroup IDs, LDS kernel id, packed workitem IDs, ...), locate the
// caller's corresponding incoming value and either stage it into the
// callee's expected register or store it to its stack slot.
// NOTE(review): the line carrying the function name (3762) is missing from
// this view; visible code is kept byte-identical.
3763 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3764 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3765 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3766 // If we don't have a call site, this was a call inserted by
3767 // legalization. These can never use special inputs.
3768 if (!CLI.CB)
3769 return;
3770
3771 SelectionDAG &DAG = CLI.DAG;
3772 const SDLoc &DL = CLI.DL;
3773 const Function &F = DAG.getMachineFunction().getFunction();
3774
3775 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3776 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3777
// Prefer the callee's precise argument-usage info when it is available
// from the analysis (legacy pass manager or MFAM); otherwise a default is
// used (its initialization line is missing from this view).
3778 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3780 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3781 if (DAG.getPass()) {
3782 auto &ArgUsageInfo =
3784 CalleeArgInfo =
3785 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3786 } else if (auto *MFAM = DAG.getMFAM()) {
3788 auto *ArgUsageInfo =
3790 DAG.getMachineFunction())
3791 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3792 if (ArgUsageInfo)
3793 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3794 }
3795 }
3796
3797 // TODO: Unify with private memory register handling. This is complicated by
3798 // the fact that at least in kernels, the input argument is not necessarily
3799 // in the same location as the input.
3800 // clang-format off
3801 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3802 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3803 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3804 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3805 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3806 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3807 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3808 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3809 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3810 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3811 };
3812 // clang-format on
3813
3814 for (auto [InputID, Attrs] : ImplicitAttrs) {
3815 // If the callee does not use the attribute value, skip copying the value.
3816 if (all_of(Attrs, [&](StringRef Attr) {
3817 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3818 }))
3819 continue;
3820
3821 const auto [OutgoingArg, ArgRC, ArgTy] =
3822 CalleeArgInfo->getPreloadedValue(InputID);
3823 if (!OutgoingArg)
3824 continue;
3825
3826 const auto [IncomingArg, IncomingArgRC, Ty] =
3827 CallerArgInfo.getPreloadedValue(InputID);
3828 assert(IncomingArgRC == ArgRC);
3829
3830 // All special arguments are ints for now.
3831 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3832 SDValue InputReg;
3833
3834 if (IncomingArg) {
3835 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3836 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3837 // The implicit arg ptr is special because it doesn't have a corresponding
3838 // input for kernels, and is computed from the kernarg segment pointer.
3839 InputReg = getImplicitArgPtr(DAG, DL);
3840 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3841 std::optional<uint32_t> Id =
3843 if (Id.has_value()) {
3844 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3845 } else {
3846 InputReg = DAG.getPOISON(ArgVT);
3847 }
3848 } else {
3849 // We may have proven the input wasn't needed, although the ABI is
3850 // requiring it. We just need to allocate the register appropriately.
3851 InputReg = DAG.getPOISON(ArgVT);
3852 }
3853
3854 if (OutgoingArg->isRegister()) {
3855 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3856 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3857 report_fatal_error("failed to allocate implicit input argument");
3858 } else {
3859 unsigned SpecialArgOffset =
3860 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3861 SDValue ArgStore =
3862 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3863 MemOpChains.push_back(ArgStore);
3864 }
3865 }
3866
3867 // Pack workitem IDs into a single register or pass it as is if already
3868 // packed.
3869
3870 auto [OutgoingArg, ArgRC, Ty] =
3872 if (!OutgoingArg)
3873 std::tie(OutgoingArg, ArgRC, Ty) =
3875 if (!OutgoingArg)
3876 std::tie(OutgoingArg, ArgRC, Ty) =
3878 if (!OutgoingArg)
3879 return;
3880
3881 const ArgDescriptor *IncomingArgX = std::get<0>(
3883 const ArgDescriptor *IncomingArgY = std::get<0>(
3885 const ArgDescriptor *IncomingArgZ = std::get<0>(
3887
3888 SDValue InputReg;
3889 SDLoc SL;
3890
3891 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3892 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3893 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3894
3895 // If incoming ids are not packed we need to pack them.
// X occupies bits [9:0]; Y and Z are shifted into bits [19:10] and [29:20]
// below before being OR'd together.
3896 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3897 NeedWorkItemIDX) {
3898 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3899 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3900 } else {
3901 InputReg = DAG.getConstant(0, DL, MVT::i32);
3902 }
3903 }
3904
3905 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3906 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3907 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3908 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3909 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3910 InputReg = InputReg.getNode()
3911 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3912 : Y;
3913 }
3914
3915 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3916 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3917 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3918 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3919 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3920 InputReg = InputReg.getNode()
3921 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3922 : Z;
3923 }
3924
3925 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3926 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3927 // We're in a situation where the outgoing function requires the workitem
3928 // ID, but the calling function does not have it (e.g a graphics function
3929 // calling a C calling convention function). This is illegal, but we need
3930 // to produce something.
3931 InputReg = DAG.getPOISON(MVT::i32);
3932 } else {
3933 // Workitem ids are already packed, any of present incoming arguments
3934 // will carry all required fields.
3935 ArgDescriptor IncomingArg =
3936 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3937 : IncomingArgY ? *IncomingArgY
3938 : *IncomingArgZ,
3939 ~0u);
3940 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3941 }
3942 }
3943
3944 if (OutgoingArg->isRegister()) {
3945 if (InputReg)
3946 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3947
3948 CCInfo.AllocateReg(OutgoingArg->getRegister());
3949 } else {
3950 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3951 if (InputReg) {
3952 SDValue ArgStore =
3953 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3954 MemOpChains.push_back(ArgStore);
3955 }
3956 }
3957}
3958
// SITargetLowering::isEligibleForTailCallOptimization:
// Decide whether the call described by (Callee, CalleeCC, Outs/OutVals, Ins)
// may be lowered as a tail call from the current function. Used by LowerCall
// below; musttail/chain call sites that fail this check are a fatal error.
// NOTE(review): the opening line(s) of the signature (original line 3959/3961)
// are missing from this excerpt of the file.
3960 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3962 const SmallVectorImpl<SDValue> &OutVals,
3963 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Chain calling conventions are always lowered as tail calls.
3964 if (AMDGPU::isChainCC(CalleeCC))
3965 return true;
3966
// Reject calling conventions that can never be tail-called from here.
3967 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3968 return false;
3969
3970 // For a divergent call target, we need to do a waterfall loop over the
3971 // possible callees which precludes us from using a simple jump.
3972 if (Callee->isDivergent())
3973 return false;
3974
3976 const Function &CallerF = MF.getFunction();
3977 CallingConv::ID CallerCC = CallerF.getCallingConv();
3979 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3980
3981 // Kernels aren't callable, and don't have a live in return address so it
3982 // doesn't make sense to do a tail call with entry functions.
3983 if (!CallerPreserved)
3984 return false;
3985
3986 bool CCMatch = CallerCC == CalleeCC;
3987
// NOTE(review): original line 3988 is missing from this excerpt; it
// presumably opens a block gated on GuaranteedTailCallOpt (matching the
// closing brace after 'return false' below) -- confirm against the full
// file.
3989 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3990 return true;
3991 return false;
3992 }
3993
3994 // TODO: Can we handle var args?
3995 if (IsVarArg)
3996 return false;
3997
// Byval arguments in the caller disqualify tail calls here.
3998 for (const Argument &Arg : CallerF.args()) {
3999 if (Arg.hasByValAttr())
4000 return false;
4001 }
4002
4003 LLVMContext &Ctx = *DAG.getContext();
4004
4005 // Check that the call results are passed in the same way.
4006 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4007 CCAssignFnForCall(CalleeCC, IsVarArg),
4008 CCAssignFnForCall(CallerCC, IsVarArg)))
4009 return false;
4010
4011 // The callee has to preserve all registers the caller needs to preserve.
4012 if (!CCMatch) {
4013 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4014 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4015 return false;
4016 }
4017
4018 // Nothing more to check if the callee is taking no arguments.
4019 if (Outs.empty())
4020 return true;
4021
// Run the callee's calling convention over the outgoing arguments to see
// where they would land.
4023 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4024
4025 // FIXME: We are not allocating special input registers, so we will be
4026 // deciding based on incorrect register assignments.
4027 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4028
4029 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4030 // If the stack arguments for this call do not fit into our own save area then
4031 // the call cannot be made tail.
4032 // TODO: Is this really necessary?
4033 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4034 return false;
4035
4036 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4037 // FIXME: What about inreg arguments that end up passed in memory?
4038 if (!CCVA.isRegLoc())
4039 continue;
4040
4041 // If we are passing an argument in an SGPR, and the value is divergent,
4042 // this call requires a waterfall loop.
4043 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4044 LLVM_DEBUG(
4045 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4046 << printReg(CCVA.getLocReg(), TRI) << '\n');
4047 return false;
4048 }
4049 }
4050
// Finally, stack-passed parameters must match the caller's CSR layout.
4051 const MachineRegisterInfo &MRI = MF.getRegInfo();
4052 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4053}
4054
// Returns whether a call marked 'tail' in the IR may actually be emitted as
// a tail call from its parent function.
// NOTE(review): the opening signature line (original line 4055) is missing
// from this excerpt; the body operates on a call instruction CI.
4056 if (!CI->isTailCall())
4057 return false;
4058
4059 const Function *ParentFn = CI->getFunction();
// NOTE(review): the condition guarding this 'return false' (original line
// 4060) is missing from this excerpt -- presumably it rejects certain parent
// calling conventions (e.g. entry functions); confirm against the full file.
4061 return false;
4062 return true;
4063}
4064
4065namespace {
4066// Chain calls have special arguments that we need to handle. These are
4067// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4068// arguments (index 0 and 1 respectively).
// Indices into CLI.Args for the trailing special operands of an
// llvm.amdgcn.cs.chain call; Exec starts at 2 and the rest follow by enum
// auto-increment. See LowerCall for how each one is consumed.
4069enum ChainCallArgIdx {
 // Value to install into EXEC for the callee.
4070 Exec = 2,
 // Constant flags word; bit 0 selects the dynamic-VGPR form, in which case
 // the three following operands must also be present.
4071 Flags,
4072 NumVGPRs,
4073 FallbackExec,
4074 FallbackCallee
4075};
4076} // anonymous namespace
4077
4078// The wave scratch offset register is used as the global base pointer.
// SITargetLowering::LowerCall: lower an outgoing call (including sibling/tail
// calls and amdgcn_cs_chain calls) to the target call sequence, filling
// InVals with the lowered result values.
// NOTE(review): this excerpt is missing a number of original lines (e.g.
// 4079 with the function signature, 4152/4154/4158, 4202-4203, 4207, 4212,
// 4347, 4367, 4391, 4412, 4456, 4475, 4478-4479); names such as Outs, Ins,
// MF, Info, RegsToPass and ArgLocs are declared on those missing lines --
// confirm details against the full file.
4080 SmallVectorImpl<SDValue> &InVals) const {
4081 CallingConv::ID CallConv = CLI.CallConv;
4082 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4083
4084 SelectionDAG &DAG = CLI.DAG;
4085
4086 const SDLoc &DL = CLI.DL;
4087 SDValue Chain = CLI.Chain;
4088 SDValue Callee = CLI.Callee;
4089
4090 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4091 bool UsesDynamicVGPRs = false;
4092 if (IsChainCallConv) {
4093 // The last arguments should be the value that we need to put in EXEC,
4094 // followed by the flags and any other arguments with special meanings.
4095 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4096 // we don't treat them like the "real" arguments.
4097 auto RequestedExecIt =
4098 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4099 return Arg.OrigArgIndex == 2;
4100 });
4101 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4102
4103 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4104 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4105 CLI.OutVals.end());
4106 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4107
4108 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4109 "Haven't popped all the special args");
4110
// The EXEC operand must be an integer of exactly wavefront width.
4111 TargetLowering::ArgListEntry RequestedExecArg =
4112 CLI.Args[ChainCallArgIdx::Exec];
4113 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4114 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4115
4116 // Convert constants into TargetConstants, so they become immediate operands
4117 // instead of being selected into S_MOV.
4118 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4119 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4120 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4121 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4122 } else
4123 ChainCallSpecialArgs.push_back(Arg.Node);
4124 };
4125
4126 PushNodeOrTargetConstant(RequestedExecArg);
4127
4128 // Process any other special arguments depending on the value of the flags.
4129 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4130
4131 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4132 if (FlagsValue.isZero()) {
4133 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4134 return lowerUnhandledCall(CLI, InVals,
4135 "no additional args allowed if flags == 0");
4136 } else if (FlagsValue.isOneBitSet(0)) {
// Flags bit 0 set selects dynamic VGPR mode: expect exactly NumVGPRs,
// FallbackExec and FallbackCallee after the flags, and require wave32.
4137 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4138 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4139 }
4140
4141 if (!Subtarget->isWave32()) {
4142 return lowerUnhandledCall(
4143 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4144 }
4145
4146 UsesDynamicVGPRs = true;
4147 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4148 CLI.Args.end(), PushNodeOrTargetConstant);
4149 }
4150 }
4151
4153 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4155 bool &IsTailCall = CLI.IsTailCall;
4156 bool IsVarArg = CLI.IsVarArg;
4157 bool IsSibCall = false;
4159
// Calls to undef/null are trivially dead: just feed poison results to the
// users and keep the incoming chain.
4160 if (Callee.isUndef() || isNullConstant(Callee)) {
4161 if (!CLI.IsTailCall) {
4162 for (ISD::InputArg &Arg : CLI.Ins)
4163 InVals.push_back(DAG.getPOISON(Arg.VT));
4164 }
4165
4166 return Chain;
4167 }
4168
4169 if (IsVarArg) {
4170 return lowerUnhandledCall(CLI, InVals,
4171 "unsupported call to variadic function ");
4172 }
4173
4174 if (!CLI.CB)
4175 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4176
4177 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4178 return lowerUnhandledCall(CLI, InVals,
4179 "unsupported required tail call to function ");
4180 }
4181
4182 if (IsTailCall) {
4183 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4184 Outs, OutVals, Ins, DAG);
4185 if (!IsTailCall &&
4186 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4187 report_fatal_error("failed to perform tail call elimination on a call "
4188 "site marked musttail or on llvm.amdgcn.cs.chain");
4189 }
4190
4191 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4192
4193 // A sibling call is one where we're under the usual C ABI and not planning
4194 // to change that but can still do a tail call:
4195 if (!TailCallOpt && IsTailCall)
4196 IsSibCall = true;
4197
4198 if (IsTailCall)
4199 ++NumTailCalls;
4200 }
4201
4204 SmallVector<SDValue, 8> MemOpChains;
4205
4206 // Analyze operands of the call, assigning locations to each operand.
4208 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4209 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4210
4211 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4213 // With a fixed ABI, allocate fixed registers before user arguments.
4214 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4215 }
4216
4217 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4218
4219 // Get a count of how many bytes are to be pushed on the stack.
4220 unsigned NumBytes = CCInfo.getStackSize();
4221
4222 if (IsSibCall) {
4223 // Since we're not changing the ABI to make this a tail call, the memory
4224 // operands are already available in the caller's incoming argument space.
4225 NumBytes = 0;
4226 }
4227
4228 // FPDiff is the byte offset of the call's argument area from the callee's.
4229 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4230 // by this amount for a tail call. In a sibling call it must be 0 because the
4231 // caller will deallocate the entire stack and the callee still expects its
4232 // arguments to begin at SP+0. Completely unused for non-tail calls.
4233 int32_t FPDiff = 0;
4234 MachineFrameInfo &MFI = MF.getFrameInfo();
4235 auto *TRI = Subtarget->getRegisterInfo();
4236
4237 // Adjust the stack pointer for the new arguments...
4238 // These operations are automatically eliminated by the prolog/epilog pass
4239 if (!IsSibCall)
4240 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4241
4242 if (!IsSibCall || IsChainCallConv) {
4243 if (!Subtarget->enableFlatScratch()) {
4244 SmallVector<SDValue, 4> CopyFromChains;
4245
4246 // In the HSA case, this should be an identity copy.
4247 SDValue ScratchRSrcReg =
4248 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4249 RegsToPass.emplace_back(IsChainCallConv
4250 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4251 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4252 ScratchRSrcReg);
4253 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4254 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4255 }
4256 }
4257
// Entries added to RegsToPass so far are the special ABI inputs; the user
// arguments added below start at this index (see the readfirstlane loop).
4258 const unsigned NumSpecialInputs = RegsToPass.size();
4259
4260 MVT PtrVT = MVT::i32;
4261
4262 // Walk the register/memloc assignments, inserting copies/loads.
4263 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4264 CCValAssign &VA = ArgLocs[i];
4265 SDValue Arg = OutVals[i];
4266
4267 // Promote the value if needed.
4268 switch (VA.getLocInfo()) {
4269 case CCValAssign::Full:
4270 break;
4271 case CCValAssign::BCvt:
4272 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4273 break;
4274 case CCValAssign::ZExt:
4275 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4276 break;
4277 case CCValAssign::SExt:
4278 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4279 break;
4280 case CCValAssign::AExt:
4281 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4282 break;
4283 case CCValAssign::FPExt:
4284 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4285 break;
4286 default:
4287 llvm_unreachable("Unknown loc info!");
4288 }
4289
4290 if (VA.isRegLoc()) {
4291 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4292 } else {
4293 assert(VA.isMemLoc());
4294
4295 SDValue DstAddr;
4296 MachinePointerInfo DstInfo;
4297
4298 unsigned LocMemOffset = VA.getLocMemOffset();
4299 int32_t Offset = LocMemOffset;
4300
4301 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4302 MaybeAlign Alignment;
4303
4304 if (IsTailCall) {
4305 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4306 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4307 : VA.getValVT().getStoreSize();
4308
4309 // FIXME: We can have better than the minimum byval required alignment.
4310 Alignment =
4311 Flags.isByVal()
4312 ? Flags.getNonZeroByValAlign()
4313 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4314
4315 Offset = Offset + FPDiff;
4316 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4317
4318 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4319 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4320
4321 // Make sure any stack arguments overlapping with where we're storing
4322 // are loaded before this eventual operation. Otherwise they'll be
4323 // clobbered.
4324
4325 // FIXME: Why is this really necessary? This seems to just result in a
4326 // lot of code to copy the stack and write them back to the same
4327 // locations, which are supposed to be immutable?
4328 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4329 } else {
4330 // Stores to the argument stack area are relative to the stack pointer.
4331 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4332 MVT::i32);
4333 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4334 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4335 Alignment =
4336 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4337 }
4338
4339 if (Outs[i].Flags.isByVal()) {
4340 SDValue SizeNode =
4341 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4342 SDValue Cpy =
4343 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4344 Outs[i].Flags.getNonZeroByValAlign(),
4345 /*isVol = */ false, /*AlwaysInline = */ true,
4346 /*CI=*/nullptr, std::nullopt, DstInfo,
4348
4349 MemOpChains.push_back(Cpy);
4350 } else {
4351 SDValue Store =
4352 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4353 MemOpChains.push_back(Store);
4354 }
4355 }
4356 }
4357
4358 if (!MemOpChains.empty())
4359 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4360
4361 SDValue ReadFirstLaneID =
4362 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4363
// Glue node used to tie speculative readfirstlanes to the convergence
// control token, when one is present on the call site.
4364 SDValue TokenGlue;
4365 if (CLI.ConvergenceControlToken) {
4366 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4368 }
4369
4370 // Build a sequence of copy-to-reg nodes chained together with token chain
4371 // and flag operands which copy the outgoing args into the appropriate regs.
4372 SDValue InGlue;
4373
4374 unsigned ArgIdx = 0;
4375 for (auto [Reg, Val] : RegsToPass) {
4376 if (ArgIdx++ >= NumSpecialInputs &&
4377 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4378 // For chain calls, the inreg arguments are required to be
4379 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4380 // they are uniform.
4381 //
4382 // For other calls, if an inreg argument is known to be uniform,
4383 // speculatively insert a readfirstlane in case it is in a VGPR.
4384 //
4385 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4386 // value, so let that continue to produce invalid code.
4387
4388 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4389 if (TokenGlue)
4390 ReadfirstlaneArgs.push_back(TokenGlue);
4392 ReadfirstlaneArgs);
4393 }
4394
4395 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4396 InGlue = Chain.getValue(1);
4397 }
4398
4399 // We don't usually want to end the call-sequence here because we would tidy
4400 // the frame up *after* the call, however in the ABI-changing tail-call case
4401 // we've carefully laid out the parameters so that when sp is reset they'll be
4402 // in the correct location.
4403 if (IsTailCall && !IsSibCall) {
4404 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4405 InGlue = Chain.getValue(1);
4406 }
4407
4408 std::vector<SDValue> Ops({Chain});
4409
4410 // Add a redundant copy of the callee global which will not be legalized, as
4411 // we need direct access to the callee later.
4413 const GlobalValue *GV = GSD->getGlobal();
4414 Ops.push_back(Callee);
4415 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4416 } else {
4417 if (IsTailCall) {
4418 // isEligibleForTailCallOptimization considered whether the call target is
4419 // divergent, but we may still end up with a uniform value in a VGPR.
4420 // Insert a readfirstlane just in case.
4421 SDValue ReadFirstLaneID =
4422 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4423
4424 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4425 if (TokenGlue)
4426 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4427 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4428 ReadfirstlaneArgs);
4429 }
4430
4431 Ops.push_back(Callee);
4432 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4433 }
4434
4435 if (IsTailCall) {
4436 // Each tail call may have to adjust the stack by a different amount, so
4437 // this information must travel along with the operation for eventual
4438 // consumption by emitEpilogue.
4439 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4440 }
4441
4442 if (IsChainCallConv)
4443 llvm::append_range(Ops, ChainCallSpecialArgs);
4444
4445 // Add argument registers to the end of the list so that they are known live
4446 // into the call.
4447 for (auto &[Reg, Val] : RegsToPass)
4448 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4449
4450 // Add a register mask operand representing the call-preserved registers.
4451 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4452 assert(Mask && "Missing call preserved mask for calling convention");
4453 Ops.push_back(DAG.getRegisterMask(Mask));
4454
// Fold the convergence token (and any existing glue) into a single glue
// operand for the call node.
4455 if (SDValue Token = CLI.ConvergenceControlToken) {
4457 GlueOps.push_back(Token);
4458 if (InGlue)
4459 GlueOps.push_back(InGlue);
4460
4461 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4462 MVT::Glue, GlueOps),
4463 0);
4464 }
4465
4466 if (InGlue)
4467 Ops.push_back(InGlue);
4468
4469 // If we're doing a tail call, use a TC_RETURN here rather than an
4470 // actual call instruction.
4471 if (IsTailCall) {
4472 MFI.setHasTailCall();
4473 unsigned OPC = AMDGPUISD::TC_RETURN;
4474 switch (CallConv) {
4476 OPC = AMDGPUISD::TC_RETURN_GFX;
4477 break;
4480 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4481 : AMDGPUISD::TC_RETURN_CHAIN;
4482 break;
4483 }
4484
4485 // If the caller is a whole wave function, we need to use a special opcode
4486 // so we can patch up EXEC.
4487 if (Info->isWholeWaveFunction())
4488 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4489
4490 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4491 }
4492
4493 // Returns a chain and a flag for retval copy to use.
4494 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4495 Chain = Call.getValue(0);
4496 InGlue = Call.getValue(1);
4497
4498 uint64_t CalleePopBytes = NumBytes;
4499 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4500 if (!Ins.empty())
4501 InGlue = Chain.getValue(1);
4502
4503 // Handle result values, copying them out of physregs into vregs that we
4504 // return.
4505 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4506 InVals, /*IsThisReturn=*/false, SDValue());
4507}
4508
4509// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4510// except for:
4511// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4512// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
// Returns the merged {allocated base address, chain} for a
// DYNAMIC_STACKALLOC node. NOTE(review): the signature line (original 4513)
// and a few interior lines (e.g. 4516 declaring Info, 4532 opening the
// stack-direction assert, 4549 with the constant-size condition) are missing
// from this excerpt.
4514 SelectionDAG &DAG) const {
4515 const MachineFunction &MF = DAG.getMachineFunction();
4517
4518 SDLoc dl(Op);
4519 EVT VT = Op.getValueType();
4520 SDValue Chain = Op.getOperand(0);
4521 Register SPReg = Info->getStackPtrOffsetReg();
4522
4523 // Chain the dynamic stack allocation so that it doesn't modify the stack
4524 // pointer when other instructions are using the stack.
4525 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4526
4527 SDValue Size = Op.getOperand(1);
4528 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4529 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4530
4531 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4533 "Stack grows upwards for AMDGPU");
4534
4535 Chain = BaseAddr.getValue(1);
4536 Align StackAlign = TFL->getStackAlign();
// Over-aligned allocas: round the (wave-scaled) base up to the requested
// alignment before allocating.
4537 if (Alignment > StackAlign) {
4538 uint64_t ScaledAlignment = Alignment.value()
4539 << Subtarget->getWavefrontSizeLog2();
4540 uint64_t StackAlignMask = ScaledAlignment - 1;
4541 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4542 DAG.getConstant(StackAlignMask, dl, VT));
4543 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4544 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4545 }
4546
4547 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4548 SDValue NewSP;
// NOTE(review): the condition opening this branch (original line 4549,
// presumably testing whether Size is a constant) is missing from this
// excerpt.
4550 // For constant sized alloca, scale alloca size by wave-size
4551 SDValue ScaledSize = DAG.getNode(
4552 ISD::SHL, dl, VT, Size,
4553 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4554 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4555 } else {
4556 // For dynamic sized alloca, perform wave-wide reduction to get max of
4557 // alloca size(divergent) and then scale it by wave-size
4558 SDValue WaveReduction =
4559 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4560 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4561 Size, DAG.getConstant(0, dl, MVT::i32));
4562 SDValue ScaledSize = DAG.getNode(
4563 ISD::SHL, dl, VT, Size,
4564 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4565 NewSP =
4566 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
// The stack pointer must stay uniform; readfirstlane the VGPR result.
4567 SDValue ReadFirstLaneID =
4568 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4569 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4570 NewSP);
4571 }
4572
4573 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4574 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4575
4576 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4577}
4578
// Lowers STACKSAVE: copies the wave-uniform stack pointer and converts it to
// the swizzled per-lane vector address form.
// NOTE(review): the signature line (original 4579) and original line 4583
// (presumably declaring SP as the stack pointer register) are missing from
// this excerpt.
4580 if (Op.getValueType() != MVT::i32)
4581 return Op; // Defer to cannot select error.
4582
4584 SDLoc SL(Op);
4585
4586 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4587
4588 // Convert from wave uniform to swizzled vector address. This should protect
4589 // from any edge cases where the stacksave result isn't directly used with
4590 // stackrestore.
4591 SDValue VectorAddress =
4592 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4593 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4594}
4595
// Lowers GET_ROUNDING: reads both hardware rounding-mode fields via
// s_getreg and maps the raw 4-bit value to the FLT_ROUNDS encoding through a
// 64-bit lookup table. NOTE(review): the signature line (original 4596), the
// initializer of BothRoundHwReg (original 4602) and the BitTable initializer
// (original 4639) are missing from this excerpt.
4597 SelectionDAG &DAG) const {
4598 SDLoc SL(Op);
4599 assert(Op.getValueType() == MVT::i32);
4600
4601 uint32_t BothRoundHwReg =
4603 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4604
4605 SDValue IntrinID =
4606 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4607 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4608 Op.getOperand(0), IntrinID, GetRoundBothImm);
4609
4610 // There are two rounding modes, one for f32 and one for f64/f16. We only
4611 // report in the standard value range if both are the same.
4612 //
4613 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4614 // ties away from zero is not supported, and the other values are rotated by
4615 // 1.
4616 //
4617 // If the two rounding modes are not the same, report a target defined value.
4618
4619 // Mode register rounding mode fields:
4620 //
4621 // [1:0] Single-precision round mode.
4622 // [3:2] Double/Half-precision round mode.
4623 //
4624 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4625 //
4626 // Hardware Spec
4627 // Toward-0 3 0
4628 // Nearest Even 0 1
4629 // +Inf 1 2
4630 // -Inf 2 3
4631 // NearestAway0 N/A 4
4632 //
4633 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4634 // table we can index by the raw hardware mode.
4635 //
4636 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4637
4638 SDValue BitTable =
4640
// Each table entry is 4 bits, so index = raw mode * 4 (shift left by 2).
4641 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4642 SDValue RoundModeTimesNumBits =
4643 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4644
4645 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4646 // knew only one mode was demanded.
4647 SDValue TableValue =
4648 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4649 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4650
4651 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4652 SDValue TableEntry =
4653 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4654
4655 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4656 // if it's an extended value.
4657 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4658 SDValue IsStandardValue =
4659 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4660 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4661 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4662 TableEntry, EnumOffset);
4663
4664 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4665}
4666
// Lowers SET_ROUNDING: maps the C FLT_ROUNDS-style value in operand 1 to the
// hardware MODE.fp_round encoding and writes it via s_setreg.
// NOTE(review): the signature line (original 4667), the clamp bound
// (original 4679) and the BitTable initializer (original 4708) are missing
// from this excerpt.
4668 SelectionDAG &DAG) const {
4669 SDLoc SL(Op);
4670
4671 SDValue NewMode = Op.getOperand(1);
4672 assert(NewMode.getValueType() == MVT::i32);
4673
4674 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4675 // hardware MODE.fp_round values.
4676 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
// Constant mode: fold the table lookup at compile time.
4677 uint32_t ClampedVal = std::min(
4678 static_cast<uint32_t>(ConstMode->getZExtValue()),
4680 NewMode = DAG.getConstant(
4681 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4682 } else {
4683 // If we know the input can only be one of the supported standard modes in
4684 // the range 0-3, we can use a simplified mapping to hardware values.
4685 KnownBits KB = DAG.computeKnownBits(NewMode);
4686 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4687 // The supported standard values are 0-3. The extended values start at 8. We
4688 // need to offset by 4 if the value is in the extended range.
4689
4690 if (UseReducedTable) {
4691 // Truncate to the low 32-bits.
4692 SDValue BitTable = DAG.getConstant(
4693 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4694
4695 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4696 SDValue RoundModeTimesNumBits =
4697 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4698
4699 NewMode =
4700 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4701
4702 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4703 // the table extracted bits into inline immediates.
4704 } else {
4705 // table_index = umin(value, value - 4)
4706 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4707 SDValue BitTable =
4709
4710 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4711 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4712 SDValue IndexVal =
4713 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4714
4715 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4716 SDValue RoundModeTimesNumBits =
4717 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4718
4719 SDValue TableValue =
4720 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4721 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4722
4723 // No need to mask out the high bits since the setreg will ignore them
4724 // anyway.
4725 NewMode = TruncTable;
4726 }
4727
4728 // Insert a readfirstlane in case the value is a VGPR. We could do this
4729 // earlier and keep more operations scalar, but that interferes with
4730 // combining the source.
4731 SDValue ReadFirstLaneID =
4732 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4733 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4734 ReadFirstLaneID, NewMode);
4735 }
4736
4737 // N.B. The setreg will be later folded into s_round_mode on supported
4738 // targets.
4739 SDValue IntrinID =
4740 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4741 uint32_t BothRoundHwReg =
4743 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4744
4745 SDValue SetReg =
4746 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4747 IntrinID, RoundBothImm, NewMode);
4748
4749 return SetReg;
4750}
4751
// Lowers a PREFETCH node: returns Op to keep the prefetch, or SDValue() to
// drop it when the subtarget/address space cannot support it. Operand 4
// distinguishes data (non-zero) from instruction prefetch (zero) per the
// uses below. NOTE(review): the signature line (original 4752) and the
// address-space case labels (original 4759-4763) are missing from this
// excerpt.
4753 if (Op->isDivergent() &&
4754 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4755 // Cannot do I$ prefetch with divergent pointer.
4756 return SDValue();
4757
4758 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4762 break;
4764 if (Subtarget->hasSafeSmemPrefetch())
4765 break;
4766 [[fallthrough]];
4767 default:
4768 return SDValue();
4769 }
4770
4771 // I$ prefetch
4772 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4773 return SDValue();
4774
4775 return Op;
4776}
4777
4778// Work around DAG legality rules only based on the result type.
// Rewrites (STRICT_)FP_EXTEND from bf16 sources as an integer bitcast
// followed by BF16_TO_FP; non-bf16 sources are returned unchanged.
// NOTE(review): the signature line (original 4779) is missing from this
// excerpt.
4780 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4781 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4782 EVT SrcVT = Src.getValueType();
4783
4784 if (SrcVT.getScalarType() != MVT::bf16)
4785 return Op;
4786
4787 SDLoc SL(Op);
// Reinterpret the bf16 bits as the equivalent integer type first.
4788 SDValue BitCast =
4789 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4790
4791 EVT DstVT = Op.getValueType();
4792 if (IsStrict)
4793 llvm_unreachable("Need STRICT_BF16_TO_FP");
4794
4795 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4796}
4797
// Lowers GET_FPENV for i64 results: reads the MODE and TRAPSTS hardware
// registers via s_getreg and packs them into one i64 {mode, trap}.
// NOTE(review): the signature line (original 4798) and the Hwreg encoding
// initializers (original 4804 and 4807) are missing from this excerpt.
4799 SDLoc SL(Op);
4800 if (Op.getValueType() != MVT::i64)
4801 return Op;
4802
4803 uint32_t ModeHwReg =
4805 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4806 uint32_t TrapHwReg =
4808 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4809
4810 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4811 SDValue IntrinID =
4812 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4813 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4814 Op.getOperand(0), IntrinID, ModeHwRegImm);
4815 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4816 Op.getOperand(0), IntrinID, TrapHwRegImm);
// Merge the two read chains so neither register read is reordered away.
4817 SDValue TokenReg =
4818 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4819 GetTrapReg.getValue(1));
4820
// Pack the two 32-bit register values into the i64 result.
4821 SDValue CvtPtr =
4822 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4823 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4824
4825 return DAG.getMergeValues({Result, TokenReg}, SL);
4827
// Lowers SET_FPENV for an i64 operand: splits it into the MODE and TRAPSTS
// halves, makes each uniform via readfirstlane, and writes them with
// s_setreg. NOTE(review): the signature line (original 4828) and the Hwreg
// encoding initializers (original 4847 and 4850) are missing from this
// excerpt.
4829 SDLoc SL(Op);
4830 if (Op.getOperand(1).getValueType() != MVT::i64)
4831 return Op;
4832
// Split the i64 environment into its two 32-bit register payloads.
4833 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4834 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4835 DAG.getConstant(0, SL, MVT::i32));
4836 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4837 DAG.getConstant(1, SL, MVT::i32));
4838
// s_setreg takes a scalar source; force both values uniform.
4839 SDValue ReadFirstLaneID =
4840 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4841 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4842 ReadFirstLaneID, NewModeReg);
4843 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4844 ReadFirstLaneID, NewTrapReg);
4845
4846 unsigned ModeHwReg =
4848 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4849 unsigned TrapHwReg =
4851 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4852
4853 SDValue IntrinID =
4854 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4855 SDValue SetModeReg =
4856 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4857 IntrinID, ModeHwRegImm, NewModeReg);
4858 SDValue SetTrapReg =
4859 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4860 IntrinID, TrapHwRegImm, NewTrapReg);
4861 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4862}
4863
// Resolve a register name (from llvm.read_register / llvm.write_register) to
// an AMDGPU physical register, validating subtarget support and that the
// requested value type matches the register width.
// NOTE(review): the signature line, the StringSwitch head producing `Reg`,
// and the head of the final error call (doxygen lines 4864, 4868, 4904) were
// lost in extraction.
4865 const MachineFunction &MF) const {
4866 const Function &Fn = MF.getFunction();
4867
// Map the textual name to a physical register; unknown names produce a null
// Register, which is returned unchanged so generic code can report it.
4869 .Case("m0", AMDGPU::M0)
4870 .Case("exec", AMDGPU::EXEC)
4871 .Case("exec_lo", AMDGPU::EXEC_LO)
4872 .Case("exec_hi", AMDGPU::EXEC_HI)
4873 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4874 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4875 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4876 .Default(Register());
4877 if (!Reg)
4878 return Reg;
4879
// flat_scratch (and its halves) only exists on subtargets that have the
// architected flat-scratch register; otherwise emit a diagnostic.
4880 if (!Subtarget->hasFlatScrRegister() &&
4881 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4882 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4883 "\" for subtarget."));
4884 }
4885
// Width check: the 32-bit halves require a 32-bit VT, the full exec /
// flat_scratch pairs require a 64-bit VT.
4886 switch (Reg) {
4887 case AMDGPU::M0:
4888 case AMDGPU::EXEC_LO:
4889 case AMDGPU::EXEC_HI:
4890 case AMDGPU::FLAT_SCR_LO:
4891 case AMDGPU::FLAT_SCR_HI:
4892 if (VT.getSizeInBits() == 32)
4893 return Reg;
4894 break;
4895 case AMDGPU::EXEC:
4896 case AMDGPU::FLAT_SCR:
4897 if (VT.getSizeInBits() == 64)
4898 return Reg;
4899 break;
4900 default:
4901 llvm_unreachable("missing register type checking");
4902 }
4903
// Wrong-width request: report a fatal "invalid type" diagnostic (call head
// lost in extraction).
4905 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4906}
4907
4908// If kill is not the last instruction, split the block so kill is always a
4909// proper terminator.
// Splits the block immediately after MI (preserving live-ins) and retags MI
// with the -TERMINATOR form of its kill pseudo so it legally ends its block.
// NOTE(review): the signature line and the TII local declaration (doxygen
// lines 4910-4911, 4914) were lost in extraction.
4912 MachineBasicBlock *BB) const {
4913 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4915 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4916 return SplitBB;
4917}
4918
4919// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4920// \p MI will be the only instruction in the loop body block. Otherwise, it will
4921// be the first instruction in the remainder block.
4922//
4923/// \returns { LoopBody, Remainder }
// NOTE(review): the parameter list, the iterator initialization, the LoopBB
// creation, and the MBBI initialization (doxygen lines 4925, 4927, 4931,
// 4933) were lost in extraction.
4924static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4926 MachineFunction *MF = MBB.getParent();
4928
4929 // To insert the loop we need to split the block. Move everything after this
4930 // point to a new block, and insert a new empty block between the two.
4932 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4934 ++MBBI;
4935
// Insert both new blocks after MBB in function layout order.
4936 MF->insert(MBBI, LoopBB);
4937 MF->insert(MBBI, RemainderBB);
4938
// The loop body branches back to itself (self-loop) or falls through to the
// remainder.
4939 LoopBB->addSuccessor(LoopBB);
4940 LoopBB->addSuccessor(RemainderBB);
4941
4942 // Move the rest of the block into a new block.
4943 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4944
4945 if (InstInLoop) {
4946 auto Next = std::next(I);
4947
4948 // Move instruction to loop body.
4949 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4950
4951 // Move the rest of the block.
4952 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4953 } else {
4954 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4955 }
4956
// Original block now falls through into the loop body.
4957 MBB.addSuccessor(LoopBB);
4958
4959 return std::pair(LoopBB, RemainderBB);
4960}
4961
4962/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
// Builds an s_waitcnt 0 right after MI, then bundles [MI, s_waitcnt] so later
// passes cannot separate the wait from the instruction it covers.
// NOTE(review): the signature line and the TII local declaration (doxygen
// lines 4963, 4965) were lost in extraction.
4964 MachineBasicBlock *MBB = MI.getParent();
4966 auto I = MI.getIterator();
4967 auto E = std::next(I);
4968
4969 // clang-format off
4970 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4971 .addImm(0);
4972 // clang-format on
4973
4974 MIBundleBuilder Bundler(*MBB, I, E);
4975 finalizeBundle(*MBB, Bundler.begin());
4976}
4977
// Emits a retry loop around a GWS instruction: clear TRAP_STS.MEM_VIOL,
// execute the instruction, re-read TRAP_STS.MEM_VIOL, and loop while it is
// nonzero. NOTE(review): the signature line, TII/MRI locals, the hwreg-encode
// operand list, and the bundleInstWithWaitcnt call (doxygen lines 4978-4979,
// 4983, 4985, 4996, 5003) were lost in extraction.
4980 MachineBasicBlock *BB) const {
4981 const DebugLoc &DL = MI.getDebugLoc();
4982
4984
4986
4987 // Apparently kill flags are only valid if the def is in the same block?
4988 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4989 Src->setIsKill(false);
4990
// MI becomes the sole instruction of the loop body; everything after it goes
// to the remainder block.
4991 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4992
4993 MachineBasicBlock::iterator I = LoopBB->end();
4994
// Hwreg selector for TRAP_STS.MEM_VIOL (operand list lost in extraction).
4995 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4997
4998 // Clear TRAP_STS.MEM_VIOL
4999 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5000 .addImm(0)
5001 .addImm(EncodedReg);
5002
5004
5005 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5006
5007 // Load and check TRAP_STS.MEM_VIOL
5008 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5009 .addImm(EncodedReg);
5010
5011 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5012 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5013 .addReg(Reg, RegState::Kill)
5014 .addImm(0);
5015 // clang-format off
// Retry while a memory violation was recorded.
5016 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5017 .addMBB(LoopBB);
5018 // clang-format on
5019
5020 return RemainderBB;
5021}
5022
5023// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5024// wavefront. If the value is uniform and just happens to be in a VGPR, this
5025// will only do one iteration. In the worst case, this will loop 64 times.
5026//
5027// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
// Emits the body of a waterfall loop: readfirstlane an index, mask EXEC down
// to the lanes sharing that index, set M0 (or SGPRIdxReg in GPR-index mode),
// then loop while any lanes remain. Returns an iterator after the EXEC-xor
// terminator, where the caller inserts the per-iteration instruction.
// NOTE(review): the function signature head and the MRI/TII/LMC locals
// (doxygen lines 5028-5029, 5039-5040) were lost in extraction.
5030 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5031 const DebugLoc &DL, const MachineOperand &Idx,
5032 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5033 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5034 Register &SGPRIdxReg) {
5035
5036 MachineFunction *MF = OrigBB.getParent();
5037 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5038 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5041
5042 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5043 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5044 Register NewExec = MRI.createVirtualRegister(BoolRC);
5045 Register CurrentIdxReg =
5046 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5047 Register CondReg = MRI.createVirtualRegister(BoolRC);
5048
// Loop-carried PHIs: the accumulated result and the remaining-lanes mask.
5049 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5050 .addReg(InitReg)
5051 .addMBB(&OrigBB)
5052 .addReg(ResultReg)
5053 .addMBB(&LoopBB)
5054
5055 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5056 .addReg(InitSaveExecReg)
5057 .addMBB(&OrigBB)
5058 .addReg(NewExec)
5059 .addMBB(&LoopBB);
5060
5061 // Read the next variant <- also loop target.
5062 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5063 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5064
5065 // Compare the just read M0 value to all possible Idx values.
5066 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5067 .addReg(CurrentIdxReg)
5068 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5069
5070 // Update EXEC, save the original EXEC value to VCC.
5071 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5072 .addReg(CondReg, RegState::Kill);
5073
// Hint the allocator to coalesce NewExec with CondReg.
5074 MRI.setSimpleHint(NewExec, CondReg);
5075
// Publish the uniform index either as an SGPR (GPR-index mode) or via M0,
// applying the constant Offset when nonzero.
5076 if (UseGPRIdxMode) {
5077 if (Offset == 0) {
5078 SGPRIdxReg = CurrentIdxReg;
5079 } else {
5080 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5081 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5082 .addReg(CurrentIdxReg, RegState::Kill)
5083 .addImm(Offset);
5084 }
5085 } else {
5086 // Move index from VCC into M0
5087 if (Offset == 0) {
5088 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5089 .addReg(CurrentIdxReg, RegState::Kill);
5090 } else {
5091 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5092 .addReg(CurrentIdxReg, RegState::Kill)
5093 .addImm(Offset);
5094 }
5095 }
5096
5097 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5098 MachineInstr *InsertPt =
5099 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5100 .addReg(LMC.ExecReg)
5101 .addReg(NewExec);
5102
5103 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5104 // s_cbranch_scc0?
5105
5106 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5107 // clang-format off
5108 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5109 .addMBB(&LoopBB);
5110 // clang-format on
5111
5112 return InsertPt->getIterator();
5113}
5114
5115// This has slightly sub-optimal regalloc when the source vector is killed by
5116// the read. The register allocator does not understand that the kill is
5117// per-workitem, so is kept alive for the whole loop so we end up not re-using a
5118// subregister from it, using 1 more VGPR than necessary. This was saved when
5119// this was expanded after register allocation.
// Wraps emitLoadM0FromVGPRLoop: saves EXEC, splits MBB to create the
// waterfall loop, runs the loop over the VGPR index, and restores EXEC in a
// landing-pad block before the remainder. Returns the in-loop insertion point
// for the per-iteration instruction.
// NOTE(review): the signature head, TII/MRI/LMC declarations, a blank/lost
// declaration before IMPLICIT_DEF, and the MBBI initialization (doxygen lines
// 5120-5121, 5127, 5129, 5135, 5154) were lost in extraction.
5122 unsigned InitResultReg, unsigned PhiReg, int Offset,
5123 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5124 MachineFunction *MF = MBB.getParent();
5125 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5126 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5128 const DebugLoc &DL = MI.getDebugLoc();
5130
5131 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5132 Register DstReg = MI.getOperand(0).getReg();
5133 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5134 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5136
5137 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5138
5139 // Save the EXEC mask
5140 // clang-format off
5141 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5142 .addReg(LMC.ExecReg);
5143 // clang-format on
5144
// MI stays at the head of the remainder block; the loop body is built fresh.
5145 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5146
5147 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5148
5149 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5150 InitResultReg, DstReg, PhiReg, TmpExec,
5151 Offset, UseGPRIdxMode, SGPRIdxReg);
5152
// Insert a landing pad between the loop and the remainder so EXEC is
// restored exactly once on loop exit.
5153 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5155 ++MBBI;
5156 MF->insert(MBBI, LandingPad);
5157 LoopBB->removeSuccessor(RemainderBB);
5158 LandingPad->addSuccessor(RemainderBB);
5159 LoopBB->addSuccessor(LandingPad);
5160 MachineBasicBlock::iterator First = LandingPad->begin();
5161 // clang-format off
5162 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5163 .addReg(SaveExec);
5164 // clang-format on
5165
5166 return InsPt;
5167}
5168
5169// Returns subreg index, offset
// Converts a constant element offset into a (32-bit subregister index,
// residual offset) pair for indexing into the vector register \p SuperRC.
// In-bounds offsets become a concrete subN index with residual 0;
// out-of-bounds (or negative) offsets are passed through with sub0 so we
// never name a nonexistent subregister.
// NOTE(review): the line carrying the function name (doxygen 5171) was lost
// in extraction.
5170static std::pair<unsigned, int>
5172 const TargetRegisterClass *SuperRC, unsigned VecReg,
5173 int Offset) {
// Number of 32-bit elements in the super-register class.
5174 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5175
5176 // Skip out of bounds offsets, or else we would end up using an undefined
5177 // register.
5178 if (Offset >= NumElts || Offset < 0)
5179 return std::pair(AMDGPU::sub0, Offset);
5180
5181 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5183
// Materializes M0 = Idx (+ Offset) for an SGPR index: a plain COPY when the
// offset is zero, otherwise an s_add_i32 directly into M0.
// NOTE(review): the signature head and a local declaration (doxygen lines
// 5184-5185, 5189) were lost in extraction.
5186 int Offset) {
5187 MachineBasicBlock *MBB = MI.getParent();
5188 const DebugLoc &DL = MI.getDebugLoc();
5190
5191 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5192
5193 assert(Idx->getReg() != AMDGPU::NoRegister);
5194
5195 if (Offset == 0) {
5196 // clang-format off
5197 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5198 .add(*Idx);
5199 // clang-format on
5200 } else {
5201 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5202 .add(*Idx)
5203 .addImm(Offset);
5204 }
5205}
5206
// Returns an SGPR holding Idx (+ Offset) for GPR-index mode: the index
// register itself when the offset is zero, otherwise a fresh SReg_32_XM0
// virtual register holding the sum.
// NOTE(review): the signature head and a local declaration (doxygen lines
// 5207-5208, 5212) were lost in extraction.
5209 int Offset) {
5210 MachineBasicBlock *MBB = MI.getParent();
5211 const DebugLoc &DL = MI.getDebugLoc();
5213
5214 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5215
5216 if (Offset == 0)
5217 return Idx->getReg();
5218
5219 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5220 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5221 .add(*Idx)
5222 .addImm(Offset);
5223 return Tmp;
5224}
5225
// Expands an indirect vector-element read (SI_INDIRECT_SRC-style pseudo):
// with an SGPR index, emits a single indexed read (GPRIDX pseudo or
// v_movrels via M0); with a VGPR index, builds a waterfall loop via
// loadM0FromVGPR. NOTE(review): the signature head, the MRI declaration, the
// iterator/setM0 setup inside the SGPR path, and the iterator before the
// VGPR path (doxygen lines 5226-5227, 5232, 5250, 5257, 5266, 5280) were
// lost in extraction.
5228 const GCNSubtarget &ST) {
5229 const SIInstrInfo *TII = ST.getInstrInfo();
5230 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5231 MachineFunction *MF = MBB.getParent();
5233
5234 Register Dst = MI.getOperand(0).getReg();
5235 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5236 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5237 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5238
5239 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5240 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5241
// Fold the constant offset into a subregister index where possible.
5242 unsigned SubReg;
5243 std::tie(SubReg, Offset) =
5244 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5245
5246 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5247
5248 // Check for a SGPR index.
5249 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5251 const DebugLoc &DL = MI.getDebugLoc();
5252
5253 if (UseGPRIdxMode) {
5254 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5255 // to avoid interfering with other uses, so probably requires a new
5256 // optimization pass.
5258
5259 const MCInstrDesc &GPRIDXDesc =
5260 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5261 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5262 .addReg(SrcReg)
5263 .addReg(Idx)
5264 .addImm(SubReg);
5265 } else {
5267
// Legacy path: M0 was set by the (extraction-lost) setM0ToIndexFromSGPR
// call; v_movrels reads the element selected by M0.
5268 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5269 .addReg(SrcReg, 0, SubReg)
5270 .addReg(SrcReg, RegState::Implicit);
5271 }
5272
5273 MI.eraseFromParent();
5274
5275 return &MBB;
5276 }
5277
5278 // Control flow needs to be inserted if indexing with a VGPR.
5279 const DebugLoc &DL = MI.getDebugLoc();
5281
5282 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5283 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5284
5285 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5286
// Waterfall loop: one indexed read per unique index value in the wave.
5287 Register SGPRIdxReg;
5288 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5289 UseGPRIdxMode, SGPRIdxReg);
5290
5291 MachineBasicBlock *LoopBB = InsPt->getParent();
5292
5293 if (UseGPRIdxMode) {
5294 const MCInstrDesc &GPRIDXDesc =
5295 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5296
5297 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5298 .addReg(SrcReg)
5299 .addReg(SGPRIdxReg)
5300 .addImm(SubReg);
5301 } else {
5302 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5303 .addReg(SrcReg, 0, SubReg)
5304 .addReg(SrcReg, RegState::Implicit);
5305 }
5306
5307 MI.eraseFromParent();
5308
5309 return LoopBB;
5310}
5311
// Expands an indirect vector-element write (SI_INDIRECT_DST-style pseudo):
// no index register -> plain INSERT_SUBREG; SGPR index -> single indexed
// write (GPRIDX pseudo or movreld-style pseudo via M0); VGPR index ->
// waterfall loop via loadM0FromVGPR. NOTE(review): the signature head, the
// MRI declaration, and several iterator/setM0 setup lines (doxygen lines
// 5312-5313, 5318, 5337, 5353, 5357, 5367) were lost in extraction.
5314 const GCNSubtarget &ST) {
5315 const SIInstrInfo *TII = ST.getInstrInfo();
5316 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5317 MachineFunction *MF = MBB.getParent();
5319
5320 Register Dst = MI.getOperand(0).getReg();
5321 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5322 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5323 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5324 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5325 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5326 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5327
5328 // This can be an immediate, but will be folded later.
5329 assert(Val->getReg());
5330
// Fold the constant offset into a subregister index where possible.
5331 unsigned SubReg;
5332 std::tie(SubReg, Offset) =
5333 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5334 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5335
// Constant index (no register): a static INSERT_SUBREG suffices.
5336 if (Idx->getReg() == AMDGPU::NoRegister) {
5338 const DebugLoc &DL = MI.getDebugLoc();
5339
5340 assert(Offset == 0);
5341
5342 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5343 .add(*SrcVec)
5344 .add(*Val)
5345 .addImm(SubReg);
5346
5347 MI.eraseFromParent();
5348 return &MBB;
5349 }
5350
5351 // Check for a SGPR index.
5352 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5354 const DebugLoc &DL = MI.getDebugLoc();
5355
5356 if (UseGPRIdxMode) {
5358
5359 const MCInstrDesc &GPRIDXDesc =
5360 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5361 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5362 .addReg(SrcVec->getReg())
5363 .add(*Val)
5364 .addReg(Idx)
5365 .addImm(SubReg);
5366 } else {
5368
// Legacy path: M0 was set by the (extraction-lost) setM0ToIndexFromSGPR
// call; the movrel write pseudo consumes it.
5369 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5370 TRI.getRegSizeInBits(*VecRC), 32, false);
5371 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5372 .addReg(SrcVec->getReg())
5373 .add(*Val)
5374 .addImm(SubReg);
5375 }
5376 MI.eraseFromParent();
5377 return &MBB;
5378 }
5379
5380 // Control flow needs to be inserted if indexing with a VGPR.
// Val is re-read on every loop iteration, so its kill flags are stale.
5381 if (Val->isReg())
5382 MRI.clearKillFlags(Val->getReg());
5383
5384 const DebugLoc &DL = MI.getDebugLoc();
5385
5386 Register PhiReg = MRI.createVirtualRegister(VecRC);
5387
5388 Register SGPRIdxReg;
5389 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5390 UseGPRIdxMode, SGPRIdxReg);
5391 MachineBasicBlock *LoopBB = InsPt->getParent();
5392
5393 if (UseGPRIdxMode) {
5394 const MCInstrDesc &GPRIDXDesc =
5395 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5396
5397 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5398 .addReg(PhiReg)
5399 .add(*Val)
5400 .addReg(SGPRIdxReg)
5401 .addImm(SubReg);
5402 } else {
5403 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5404 TRI.getRegSizeInBits(*VecRC), 32, false);
5405 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5406 .addReg(PhiReg)
5407 .add(*Val)
5408 .addImm(SubReg);
5409 }
5410
5411 MI.eraseFromParent();
5412 return LoopBB;
5413}
5414
// Expands S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO: a single s_add_u64/s_sub_u64
// on subtargets with scalar 64-bit add/sub, otherwise a lo/hi 32-bit pair
// with carry (s_add_u32 + s_addc_u32 or s_sub_u32 + s_subb_u32) recombined
// with REG_SEQUENCE. NOTE(review): the signature head and the MRI
// declaration (doxygen lines 5415, 5422) were lost in extraction.
5416 MachineBasicBlock *BB) {
5417 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5418 // For GFX12, we emit s_add_u64 and s_sub_u64.
5419 MachineFunction *MF = BB->getParent();
5420 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5421 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5423 const DebugLoc &DL = MI.getDebugLoc();
5424 MachineOperand &Dest = MI.getOperand(0);
5425 MachineOperand &Src0 = MI.getOperand(1);
5426 MachineOperand &Src1 = MI.getOperand(2);
5427 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5428 if (ST.hasScalarAddSub64()) {
5429 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5430 // clang-format off
5431 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5432 .add(Src0)
5433 .add(Src1);
5434 // clang-format on
5435 } else {
5436 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5437 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5438
5439 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5440 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5441
// Split both 64-bit sources into 32-bit halves (immediates handled too).
5442 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5443 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5444 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5445 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5446
5447 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5448 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5449 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5450 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5451
// Low half sets SCC as carry/borrow; high half consumes it.
5452 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5453 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5454 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5455 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5456 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5457 .addReg(DestSub0)
5458 .addImm(AMDGPU::sub0)
5459 .addReg(DestSub1)
5460 .addImm(AMDGPU::sub1);
5461 }
5462 MI.eraseFromParent();
5463 return BB;
5464}
5465
// Returns the identity element (as a raw 32-bit pattern) for a 32-bit wave
// reduction opcode: the value that leaves the reduction unchanged when
// combined with any lane value. NOTE(review): the signature line and the
// llvm_unreachable( head (doxygen lines 5466, 5491) were lost in extraction.
5467 switch (Opc) {
5468 case AMDGPU::S_MIN_U32:
5469 return std::numeric_limits<uint32_t>::max();
5470 case AMDGPU::S_MIN_I32:
5471 return std::numeric_limits<int32_t>::max();
5472 case AMDGPU::S_MAX_U32:
5473 return std::numeric_limits<uint32_t>::min();
5474 case AMDGPU::S_MAX_I32:
5475 return std::numeric_limits<int32_t>::min();
5476 case AMDGPU::V_ADD_F32_e64: // -0.0
5477 return 0x80000000;
5478 case AMDGPU::V_SUB_F32_e64: // +0.0
5479 return 0x0;
5480 case AMDGPU::S_ADD_I32:
5481 case AMDGPU::S_SUB_I32:
5482 case AMDGPU::S_OR_B32:
5483 case AMDGPU::S_XOR_B32:
5484 return std::numeric_limits<uint32_t>::min();
5485 case AMDGPU::S_AND_B32:
5486 return std::numeric_limits<uint32_t>::max();
5487 case AMDGPU::V_MIN_F32_e64:
5488 case AMDGPU::V_MAX_F32_e64:
5489 return 0x7fc00000; // qNAN
5490 default:
5492 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5493 }
5494}
5495
// Returns the identity element for a 64-bit wave reduction opcode; 64-bit
// min/max reductions are implemented via V_CMP compare opcodes, hence the
// V_CMP_* cases. NOTE(review): the signature line and the llvm_unreachable(
// head (doxygen lines 5496, 5514) were lost in extraction.
5497 switch (Opc) {
5498 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5499 return std::numeric_limits<uint64_t>::max();
5500 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5501 return std::numeric_limits<int64_t>::max();
5502 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5503 return std::numeric_limits<uint64_t>::min();
5504 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5505 return std::numeric_limits<int64_t>::min();
5506 case AMDGPU::S_ADD_U64_PSEUDO:
5507 case AMDGPU::S_SUB_U64_PSEUDO:
5508 case AMDGPU::S_OR_B64:
5509 case AMDGPU::S_XOR_B64:
5510 return std::numeric_limits<uint64_t>::min();
5511 case AMDGPU::S_AND_B64:
5512 return std::numeric_limits<uint64_t>::max();
5513 default:
5515 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5516 }
5517}
5518
5519static bool is32bitWaveReduceOperation(unsigned Opc) {
5520 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5521 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5522 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5523 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5524 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5525 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5526 Opc == AMDGPU::V_SUB_F32_e64;
5527}
5528
// Predicate: true for the four f32 VALU opcodes used by wave reductions.
// NOTE(review): the function's signature line (doxygen 5529) was lost in
// extraction; the caller below stores this result in a bool named `isFPOp`.
5530 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5531 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5532}
5533
5536 const GCNSubtarget &ST,
5537 unsigned Opc) {
5539 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5540 const DebugLoc &DL = MI.getDebugLoc();
5541 const SIInstrInfo *TII = ST.getInstrInfo();
5542
5543 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5544 Register SrcReg = MI.getOperand(1).getReg();
5545 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5546 Register DstReg = MI.getOperand(0).getReg();
5547 MachineBasicBlock *RetBB = nullptr;
5548 if (isSGPR) {
5549 switch (Opc) {
5550 case AMDGPU::S_MIN_U32:
5551 case AMDGPU::S_MIN_I32:
5552 case AMDGPU::V_MIN_F32_e64:
5553 case AMDGPU::S_MAX_U32:
5554 case AMDGPU::S_MAX_I32:
5555 case AMDGPU::V_MAX_F32_e64:
5556 case AMDGPU::S_AND_B32:
5557 case AMDGPU::S_OR_B32: {
5558 // Idempotent operations.
5559 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5560 RetBB = &BB;
5561 break;
5562 }
5563 case AMDGPU::V_CMP_LT_U64_e64: // umin
5564 case AMDGPU::V_CMP_LT_I64_e64: // min
5565 case AMDGPU::V_CMP_GT_U64_e64: // umax
5566 case AMDGPU::V_CMP_GT_I64_e64: // max
5567 case AMDGPU::S_AND_B64:
5568 case AMDGPU::S_OR_B64: {
5569 // Idempotent operations.
5570 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5571 RetBB = &BB;
5572 break;
5573 }
5574 case AMDGPU::S_XOR_B32:
5575 case AMDGPU::S_XOR_B64:
5576 case AMDGPU::S_ADD_I32:
5577 case AMDGPU::S_ADD_U64_PSEUDO:
5578 case AMDGPU::V_ADD_F32_e64:
5579 case AMDGPU::S_SUB_I32:
5580 case AMDGPU::S_SUB_U64_PSEUDO:
5581 case AMDGPU::V_SUB_F32_e64: {
5582 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5583 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5584 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5585 Register NumActiveLanes =
5586 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5587
5588 bool IsWave32 = ST.isWave32();
5589 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5590 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5591 unsigned BitCountOpc =
5592 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5593
5594 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5595
5596 auto NewAccumulator =
5597 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5598 .addReg(ExecMask);
5599
5600 switch (Opc) {
5601 case AMDGPU::S_XOR_B32:
5602 case AMDGPU::S_XOR_B64: {
5603 // Performing an XOR operation on a uniform value
5604 // depends on the parity of the number of active lanes.
5605 // For even parity, the result will be 0, for odd
5606 // parity the result will be the same as the input value.
5607 Register ParityRegister =
5608 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5609
5610 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5611 .addReg(NewAccumulator->getOperand(0).getReg())
5612 .addImm(1)
5613 .setOperandDead(3); // Dead scc
5614 if (Opc == AMDGPU::S_XOR_B32) {
5615 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5616 .addReg(SrcReg)
5617 .addReg(ParityRegister);
5618 } else {
5619 Register DestSub0 =
5620 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5621 Register DestSub1 =
5622 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5623
5624 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5625 const TargetRegisterClass *SrcSubRC =
5626 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5627
5628 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5629 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5630 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5631 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5632
5633 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5634 .add(Op1L)
5635 .addReg(ParityRegister);
5636
5637 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5638 .add(Op1H)
5639 .addReg(ParityRegister);
5640
5641 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5642 .addReg(DestSub0)
5643 .addImm(AMDGPU::sub0)
5644 .addReg(DestSub1)
5645 .addImm(AMDGPU::sub1);
5646 }
5647 break;
5648 }
5649 case AMDGPU::S_SUB_I32: {
5650 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5651
5652 // Take the negation of the source operand.
5653 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5654 .addImm(0)
5655 .addReg(SrcReg);
5656 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5657 .addReg(NegatedVal)
5658 .addReg(NewAccumulator->getOperand(0).getReg());
5659 break;
5660 }
5661 case AMDGPU::S_ADD_I32: {
5662 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5663 .addReg(SrcReg)
5664 .addReg(NewAccumulator->getOperand(0).getReg());
5665 break;
5666 }
5667 case AMDGPU::S_ADD_U64_PSEUDO:
5668 case AMDGPU::S_SUB_U64_PSEUDO: {
5669 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5670 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5671 Register Op1H_Op0L_Reg =
5672 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5673 Register Op1L_Op0H_Reg =
5674 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5675 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5676 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5677 Register NegatedValLo =
5678 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5679 Register NegatedValHi =
5680 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5681
5682 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5683 const TargetRegisterClass *Src1SubRC =
5684 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5685
5686 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5687 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5688 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5689 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5690
5691 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5692 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5693 .addImm(0)
5694 .addReg(NewAccumulator->getOperand(0).getReg())
5695 .setOperandDead(3); // Dead scc
5696 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5697 .addReg(NegatedValLo)
5698 .addImm(31)
5699 .setOperandDead(3); // Dead scc
5700 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5701 .add(Op1L)
5702 .addReg(NegatedValHi);
5703 }
5704 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5705 ? NegatedValLo
5706 : NewAccumulator->getOperand(0).getReg();
5707 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5708 .add(Op1L)
5709 .addReg(LowOpcode);
5710 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5711 .add(Op1L)
5712 .addReg(LowOpcode);
5713 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5714 .add(Op1H)
5715 .addReg(LowOpcode);
5716
5717 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5718 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5719 .addReg(CarryReg)
5720 .addReg(Op1H_Op0L_Reg)
5721 .setOperandDead(3); // Dead scc
5722
5723 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5724 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5725 .addReg(HiVal)
5726 .addReg(Op1L_Op0H_Reg)
5727 .setOperandDead(3); // Dead scc
5728 }
5729 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5730 .addReg(DestSub0)
5731 .addImm(AMDGPU::sub0)
5732 .addReg(DestSub1)
5733 .addImm(AMDGPU::sub1);
5734 break;
5735 }
5736 case AMDGPU::V_ADD_F32_e64:
5737 case AMDGPU::V_SUB_F32_e64: {
5738 Register ActiveLanesVreg =
5739 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5740 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5741 // Get number of active lanes as a float val.
5742 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5743 ActiveLanesVreg)
5744 .addReg(NewAccumulator->getOperand(0).getReg())
5745 .addImm(0) // clamp
5746 .addImm(0); // output-modifier
5747
5748 // Take negation of input for SUB reduction
5749 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5750 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5751 .addImm(srcMod) // src0 modifier
5752 .addReg(SrcReg)
5753 .addImm(0) // src1 modifier
5754 .addReg(ActiveLanesVreg)
5755 .addImm(0) // clamp
5756 .addImm(0); // output-mod
5757 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5758 .addReg(DstVreg);
5759 }
5760 }
5761 RetBB = &BB;
5762 }
5763 }
5764 } else {
5765 // TODO: Implement DPP Strategy and switch based on immediate strategy
5766 // operand. For now, for all the cases (default, Iterative and DPP we use
5767 // iterative approach by default.)
5768
5769 // To reduce the VGPR using iterative approach, we need to iterate
5770 // over all the active lanes. Lowering consists of ComputeLoop,
5771 // which iterate over only active lanes. We use copy of EXEC register
5772 // as induction variable and every active lane modifies it using bitset0
5773 // so that we will get the next active lane for next iteration.
5775 Register SrcReg = MI.getOperand(1).getReg();
5776 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5778
5779 // Create Control flow for loop
5780 // Split MI's Machine Basic block into For loop
5781 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5782
5783 // Create virtual registers required for lowering.
5784 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5785 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5786 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5787 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5788 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5789 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5790 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5791 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5792 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5793
5794 bool IsWave32 = ST.isWave32();
5795 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5796 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5797
5798 // Create initial values of induction variable from Exec, Accumulator and
5799 // insert branch instr to newly created ComputeBlock
5800 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5801 if (is32BitOpc) {
5803 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5804 .addImm(IdentityValue);
5805 } else {
5807 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5808 .addImm(IdentityValue);
5809 }
5810 // clang-format off
5811 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5812 .addMBB(ComputeLoop);
5813 // clang-format on
5814
5815 // Start constructing ComputeLoop
5816 I = ComputeLoop->begin();
5817 auto Accumulator =
5818 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5819 .addReg(IdentityValReg)
5820 .addMBB(&BB);
5821 auto ActiveBits =
5822 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5823 .addReg(LoopIterator)
5824 .addMBB(&BB);
5825
5826 I = ComputeLoop->end();
5827 MachineInstr *NewAccumulator;
5828 // Perform the computations
5829 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5830 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5831 .addReg(ActiveBitsReg);
5832 if (is32BitOpc) {
5833 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5834 LaneValueReg)
5835 .addReg(SrcReg)
5836 .addReg(FF1Reg);
5837 if (isFPOp) {
5838 Register LaneValVreg =
5839 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5840 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5841 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5842 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5843 LaneValVreg)
5844 .addReg(LaneValueReg);
5845 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5846 .addImm(0) // src0 modifier
5847 .addReg(Accumulator->getOperand(0).getReg())
5848 .addImm(0) // src1 modifier
5849 .addReg(LaneValVreg)
5850 .addImm(0) // clamp
5851 .addImm(0); // omod
5852 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5853 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5854 .addReg(DstVreg);
5855 } else {
5856 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5857 .addReg(Accumulator->getOperand(0).getReg())
5858 .addReg(LaneValueReg);
5859 }
5860 } else {
5861 Register LaneValueLoReg =
5862 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5863 Register LaneValueHiReg =
5864 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5865 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5866 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5867 const TargetRegisterClass *SrcSubRC =
5868 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5869 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5870 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5871 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5872 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5873 // lane value input should be in an sgpr
5874 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5875 LaneValueLoReg)
5876 .add(Op1L)
5877 .addReg(FF1Reg);
5878 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5879 LaneValueHiReg)
5880 .add(Op1H)
5881 .addReg(FF1Reg);
5882 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5883 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5884 .addReg(LaneValueLoReg)
5885 .addImm(AMDGPU::sub0)
5886 .addReg(LaneValueHiReg)
5887 .addImm(AMDGPU::sub1);
5888 switch (Opc) {
5889 case AMDGPU::S_OR_B64:
5890 case AMDGPU::S_AND_B64:
5891 case AMDGPU::S_XOR_B64: {
5892 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5893 .addReg(Accumulator->getOperand(0).getReg())
5894 .addReg(LaneValue->getOperand(0).getReg())
5895 .setOperandDead(3); // Dead scc
5896 break;
5897 }
5898 case AMDGPU::V_CMP_GT_I64_e64:
5899 case AMDGPU::V_CMP_GT_U64_e64:
5900 case AMDGPU::V_CMP_LT_I64_e64:
5901 case AMDGPU::V_CMP_LT_U64_e64: {
5902 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5903 Register ComparisonResultReg =
5904 MRI.createVirtualRegister(WaveMaskRegClass);
5905 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5906 const TargetRegisterClass *VSubRegClass =
5907 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5908 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5909 MachineOperand SrcReg0Sub0 =
5910 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5911 VregClass, AMDGPU::sub0, VSubRegClass);
5912 MachineOperand SrcReg0Sub1 =
5913 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5914 VregClass, AMDGPU::sub1, VSubRegClass);
5915 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5916 AccumulatorVReg)
5917 .add(SrcReg0Sub0)
5918 .addImm(AMDGPU::sub0)
5919 .add(SrcReg0Sub1)
5920 .addImm(AMDGPU::sub1);
5921 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5922 .addReg(LaneValue->getOperand(0).getReg())
5923 .addReg(AccumulatorVReg);
5924
5925 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5926 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5927 .addReg(LaneMaskReg)
5928 .addReg(ActiveBitsReg);
5929
5930 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5931 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5932 .addReg(LaneValue->getOperand(0).getReg())
5933 .addReg(Accumulator->getOperand(0).getReg());
5934 break;
5935 }
5936 case AMDGPU::S_ADD_U64_PSEUDO:
5937 case AMDGPU::S_SUB_U64_PSEUDO: {
5938 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5939 .addReg(Accumulator->getOperand(0).getReg())
5940 .addReg(LaneValue->getOperand(0).getReg());
5941 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5942 break;
5943 }
5944 }
5945 }
5946 // Manipulate the iterator to get the next active lane
5947 unsigned BITSETOpc =
5948 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5949 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5950 .addReg(FF1Reg)
5951 .addReg(ActiveBitsReg);
5952
5953 // Add phi nodes
5954 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5955 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5956
5957 // Creating branching
5958 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5959 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5960 .addReg(NewActiveBitsReg)
5961 .addImm(0);
5962 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5963 .addMBB(ComputeLoop);
5964
5965 RetBB = ComputeEnd;
5966 }
5967 MI.eraseFromParent();
5968 return RetBB;
5969}
5970
5973 MachineBasicBlock *BB) const {
5974 MachineFunction *MF = BB->getParent();
5976 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5978 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5980 const DebugLoc &DL = MI.getDebugLoc();
5981
5982 switch (MI.getOpcode()) {
5983 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5984 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5985 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5986 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5987 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5988 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5989 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5990 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5991 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5992 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
5993 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5994 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5995 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5996 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5997 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5998 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5999 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6000 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6001 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6002 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6003 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6004 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6005 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6006 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6007 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6008 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6009 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6010 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6011 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6012 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6013 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6014 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6015 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6016 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6017 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6018 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6019 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6020 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6021 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6022 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6023 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6024 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6025 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6026 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6027 case AMDGPU::S_UADDO_PSEUDO:
6028 case AMDGPU::S_USUBO_PSEUDO: {
6029 MachineOperand &Dest0 = MI.getOperand(0);
6030 MachineOperand &Dest1 = MI.getOperand(1);
6031 MachineOperand &Src0 = MI.getOperand(2);
6032 MachineOperand &Src1 = MI.getOperand(3);
6033
6034 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6035 ? AMDGPU::S_ADD_U32
6036 : AMDGPU::S_SUB_U32;
6037 // clang-format off
6038 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6039 .add(Src0)
6040 .add(Src1);
6041 // clang-format on
6042
6043 unsigned SelOpc =
6044 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6045 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6046
6047 MI.eraseFromParent();
6048 return BB;
6049 }
6050 case AMDGPU::S_ADD_U64_PSEUDO:
6051 case AMDGPU::S_SUB_U64_PSEUDO: {
6052 return Expand64BitScalarArithmetic(MI, BB);
6053 }
6054 case AMDGPU::V_ADD_U64_PSEUDO:
6055 case AMDGPU::V_SUB_U64_PSEUDO: {
6056 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6057
6058 MachineOperand &Dest = MI.getOperand(0);
6059 MachineOperand &Src0 = MI.getOperand(1);
6060 MachineOperand &Src1 = MI.getOperand(2);
6061
6062 if (ST.hasAddSubU64Insts()) {
6063 auto I = BuildMI(*BB, MI, DL,
6064 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6065 : AMDGPU::V_SUB_U64_e64),
6066 Dest.getReg())
6067 .add(Src0)
6068 .add(Src1)
6069 .addImm(0); // clamp
6070 TII->legalizeOperands(*I);
6071 MI.eraseFromParent();
6072 return BB;
6073 }
6074
6075 if (IsAdd && ST.hasLshlAddU64Inst()) {
6076 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6077 Dest.getReg())
6078 .add(Src0)
6079 .addImm(0)
6080 .add(Src1);
6081 TII->legalizeOperands(*Add);
6082 MI.eraseFromParent();
6083 return BB;
6084 }
6085
6086 const auto *CarryRC = TRI->getWaveMaskRegClass();
6087
6088 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6089 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6090
6091 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6092 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6093
6094 const TargetRegisterClass *Src0RC = Src0.isReg()
6095 ? MRI.getRegClass(Src0.getReg())
6096 : &AMDGPU::VReg_64RegClass;
6097 const TargetRegisterClass *Src1RC = Src1.isReg()
6098 ? MRI.getRegClass(Src1.getReg())
6099 : &AMDGPU::VReg_64RegClass;
6100
6101 const TargetRegisterClass *Src0SubRC =
6102 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6103 const TargetRegisterClass *Src1SubRC =
6104 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6105
6106 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6107 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6108 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6109 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6110
6111 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6112 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6113 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6114 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6115
6116 unsigned LoOpc =
6117 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6118 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6119 .addReg(CarryReg, RegState::Define)
6120 .add(SrcReg0Sub0)
6121 .add(SrcReg1Sub0)
6122 .addImm(0); // clamp bit
6123
6124 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6125 MachineInstr *HiHalf =
6126 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6127 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6128 .add(SrcReg0Sub1)
6129 .add(SrcReg1Sub1)
6130 .addReg(CarryReg, RegState::Kill)
6131 .addImm(0); // clamp bit
6132
6133 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6134 .addReg(DestSub0)
6135 .addImm(AMDGPU::sub0)
6136 .addReg(DestSub1)
6137 .addImm(AMDGPU::sub1);
6138 TII->legalizeOperands(*LoHalf);
6139 TII->legalizeOperands(*HiHalf);
6140 MI.eraseFromParent();
6141 return BB;
6142 }
6143 case AMDGPU::S_ADD_CO_PSEUDO:
6144 case AMDGPU::S_SUB_CO_PSEUDO: {
6145 // This pseudo has a chance to be selected
6146 // only from uniform add/subcarry node. All the VGPR operands
6147 // therefore assumed to be splat vectors.
6149 MachineOperand &Dest = MI.getOperand(0);
6150 MachineOperand &CarryDest = MI.getOperand(1);
6151 MachineOperand &Src0 = MI.getOperand(2);
6152 MachineOperand &Src1 = MI.getOperand(3);
6153 MachineOperand &Src2 = MI.getOperand(4);
6154 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6155 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6156 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6157 .addReg(Src0.getReg());
6158 Src0.setReg(RegOp0);
6159 }
6160 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6161 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6162 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6163 .addReg(Src1.getReg());
6164 Src1.setReg(RegOp1);
6165 }
6166 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6167 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6168 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6169 .addReg(Src2.getReg());
6170 Src2.setReg(RegOp2);
6171 }
6172
6173 if (ST.isWave64()) {
6174 if (ST.hasScalarCompareEq64()) {
6175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6176 .addReg(Src2.getReg())
6177 .addImm(0);
6178 } else {
6179 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6180 const TargetRegisterClass *SubRC =
6181 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6182 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6183 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6184 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6185 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6186 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6187
6188 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6189 .add(Src2Sub0)
6190 .add(Src2Sub1);
6191
6192 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6193 .addReg(Src2_32, RegState::Kill)
6194 .addImm(0);
6195 }
6196 } else {
6197 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6198 .addReg(Src2.getReg())
6199 .addImm(0);
6200 }
6201
6202 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6203 ? AMDGPU::S_ADDC_U32
6204 : AMDGPU::S_SUBB_U32;
6205
6206 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6207
6208 unsigned SelOpc =
6209 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6210
6211 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6212 .addImm(-1)
6213 .addImm(0);
6214
6215 MI.eraseFromParent();
6216 return BB;
6217 }
6218 case AMDGPU::SI_INIT_M0: {
6219 MachineOperand &M0Init = MI.getOperand(0);
6220 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6221 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6222 AMDGPU::M0)
6223 .add(M0Init);
6224 MI.eraseFromParent();
6225 return BB;
6226 }
6227 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6228 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6229 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6230 TII->get(AMDGPU::S_CMP_EQ_U32))
6231 .addImm(0)
6232 .addImm(0);
6233 return BB;
6234 }
6235 case AMDGPU::GET_GROUPSTATICSIZE: {
6236 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6237 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6238 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6239 .add(MI.getOperand(0))
6240 .addImm(MFI->getLDSSize());
6241 MI.eraseFromParent();
6242 return BB;
6243 }
6244 case AMDGPU::GET_SHADERCYCLESHILO: {
6246 // The algorithm is:
6247 //
6248 // hi1 = getreg(SHADER_CYCLES_HI)
6249 // lo1 = getreg(SHADER_CYCLES_LO)
6250 // hi2 = getreg(SHADER_CYCLES_HI)
6251 //
6252 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6253 // Otherwise there was overflow and the result is hi2:0. In both cases the
6254 // result should represent the actual time at some point during the sequence
6255 // of three getregs.
6256 using namespace AMDGPU::Hwreg;
6257 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6258 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6259 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6260 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6261 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6262 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6263 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6264 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6265 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6266 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6267 .addReg(RegHi1)
6268 .addReg(RegHi2);
6269 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6270 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6271 .addReg(RegLo1)
6272 .addImm(0);
6273 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6274 .add(MI.getOperand(0))
6275 .addReg(RegLo)
6276 .addImm(AMDGPU::sub0)
6277 .addReg(RegHi2)
6278 .addImm(AMDGPU::sub1);
6279 MI.eraseFromParent();
6280 return BB;
6281 }
6282 case AMDGPU::SI_INDIRECT_SRC_V1:
6283 case AMDGPU::SI_INDIRECT_SRC_V2:
6284 case AMDGPU::SI_INDIRECT_SRC_V3:
6285 case AMDGPU::SI_INDIRECT_SRC_V4:
6286 case AMDGPU::SI_INDIRECT_SRC_V5:
6287 case AMDGPU::SI_INDIRECT_SRC_V6:
6288 case AMDGPU::SI_INDIRECT_SRC_V7:
6289 case AMDGPU::SI_INDIRECT_SRC_V8:
6290 case AMDGPU::SI_INDIRECT_SRC_V9:
6291 case AMDGPU::SI_INDIRECT_SRC_V10:
6292 case AMDGPU::SI_INDIRECT_SRC_V11:
6293 case AMDGPU::SI_INDIRECT_SRC_V12:
6294 case AMDGPU::SI_INDIRECT_SRC_V16:
6295 case AMDGPU::SI_INDIRECT_SRC_V32:
6296 return emitIndirectSrc(MI, *BB, *getSubtarget());
6297 case AMDGPU::SI_INDIRECT_DST_V1:
6298 case AMDGPU::SI_INDIRECT_DST_V2:
6299 case AMDGPU::SI_INDIRECT_DST_V3:
6300 case AMDGPU::SI_INDIRECT_DST_V4:
6301 case AMDGPU::SI_INDIRECT_DST_V5:
6302 case AMDGPU::SI_INDIRECT_DST_V6:
6303 case AMDGPU::SI_INDIRECT_DST_V7:
6304 case AMDGPU::SI_INDIRECT_DST_V8:
6305 case AMDGPU::SI_INDIRECT_DST_V9:
6306 case AMDGPU::SI_INDIRECT_DST_V10:
6307 case AMDGPU::SI_INDIRECT_DST_V11:
6308 case AMDGPU::SI_INDIRECT_DST_V12:
6309 case AMDGPU::SI_INDIRECT_DST_V16:
6310 case AMDGPU::SI_INDIRECT_DST_V32:
6311 return emitIndirectDst(MI, *BB, *getSubtarget());
6312 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6313 case AMDGPU::SI_KILL_I1_PSEUDO:
6314 return splitKillBlock(MI, BB);
6315 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6316 Register Dst = MI.getOperand(0).getReg();
6317 const MachineOperand &Src0 = MI.getOperand(1);
6318 const MachineOperand &Src1 = MI.getOperand(2);
6319 Register SrcCond = MI.getOperand(3).getReg();
6320
6321 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6322 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6323 const auto *CondRC = TRI->getWaveMaskRegClass();
6324 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6325
6326 const TargetRegisterClass *Src0RC = Src0.isReg()
6327 ? MRI.getRegClass(Src0.getReg())
6328 : &AMDGPU::VReg_64RegClass;
6329 const TargetRegisterClass *Src1RC = Src1.isReg()
6330 ? MRI.getRegClass(Src1.getReg())
6331 : &AMDGPU::VReg_64RegClass;
6332
6333 const TargetRegisterClass *Src0SubRC =
6334 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6335 const TargetRegisterClass *Src1SubRC =
6336 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6337
6338 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6339 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6340 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6341 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6342
6343 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6344 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6345 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6346 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6347
6348 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6349 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6350 .addImm(0)
6351 .add(Src0Sub0)
6352 .addImm(0)
6353 .add(Src1Sub0)
6354 .addReg(SrcCondCopy);
6355 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6356 .addImm(0)
6357 .add(Src0Sub1)
6358 .addImm(0)
6359 .add(Src1Sub1)
6360 .addReg(SrcCondCopy);
6361
6362 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6363 .addReg(DstLo)
6364 .addImm(AMDGPU::sub0)
6365 .addReg(DstHi)
6366 .addImm(AMDGPU::sub1);
6367 MI.eraseFromParent();
6368 return BB;
6369 }
6370 case AMDGPU::SI_BR_UNDEF: {
6371 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6372 .add(MI.getOperand(0));
6373 Br->getOperand(1).setIsUndef(); // read undef SCC
6374 MI.eraseFromParent();
6375 return BB;
6376 }
6377 case AMDGPU::ADJCALLSTACKUP:
6378 case AMDGPU::ADJCALLSTACKDOWN: {
6380 MachineInstrBuilder MIB(*MF, &MI);
6381 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6382 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6383 return BB;
6384 }
6385 case AMDGPU::SI_CALL_ISEL: {
6386 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6387
6389 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6390
6391 for (const MachineOperand &MO : MI.operands())
6392 MIB.add(MO);
6393
6394 MIB.cloneMemRefs(MI);
6395 MI.eraseFromParent();
6396 return BB;
6397 }
6398 case AMDGPU::V_ADD_CO_U32_e32:
6399 case AMDGPU::V_SUB_CO_U32_e32:
6400 case AMDGPU::V_SUBREV_CO_U32_e32: {
6401 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6402 unsigned Opc = MI.getOpcode();
6403
6404 bool NeedClampOperand = false;
6405 if (TII->pseudoToMCOpcode(Opc) == -1) {
6407 NeedClampOperand = true;
6408 }
6409
6410 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6411 if (TII->isVOP3(*I)) {
6412 I.addReg(TRI->getVCC(), RegState::Define);
6413 }
6414 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6415 if (NeedClampOperand)
6416 I.addImm(0); // clamp bit for e64 encoding
6417
6418 TII->legalizeOperands(*I);
6419
6420 MI.eraseFromParent();
6421 return BB;
6422 }
6423 case AMDGPU::V_ADDC_U32_e32:
6424 case AMDGPU::V_SUBB_U32_e32:
6425 case AMDGPU::V_SUBBREV_U32_e32:
6426 // These instructions have an implicit use of vcc which counts towards the
6427 // constant bus limit.
6428 TII->legalizeOperands(MI);
6429 return BB;
6430 case AMDGPU::DS_GWS_INIT:
6431 case AMDGPU::DS_GWS_SEMA_BR:
6432 case AMDGPU::DS_GWS_BARRIER:
6433 case AMDGPU::DS_GWS_SEMA_V:
6434 case AMDGPU::DS_GWS_SEMA_P:
6435 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6436 // A s_waitcnt 0 is required to be the instruction immediately following.
6437 if (getSubtarget()->hasGWSAutoReplay()) {
6439 return BB;
6440 }
6441
6442 return emitGWSMemViolTestLoop(MI, BB);
6443 case AMDGPU::S_SETREG_B32: {
6444 // Try to optimize cases that only set the denormal mode or rounding mode.
6445 //
6446 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6447 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6448 // instead.
6449 //
6450 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6451 // allow you to have a no side effect instruction in the output of a
6452 // sideeffecting pattern.
6453 auto [ID, Offset, Width] =
6454 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6456 return BB;
6457
6458 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6459 const unsigned SetMask = WidthMask << Offset;
6460
6461 if (getSubtarget()->hasDenormModeInst()) {
6462 unsigned SetDenormOp = 0;
6463 unsigned SetRoundOp = 0;
6464
6465 // The dedicated instructions can only set the whole denorm or round mode
6466 // at once, not a subset of bits in either.
6467 if (SetMask ==
6469 // If this fully sets both the round and denorm mode, emit the two
6470 // dedicated instructions for these.
6471 SetRoundOp = AMDGPU::S_ROUND_MODE;
6472 SetDenormOp = AMDGPU::S_DENORM_MODE;
6473 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6474 SetRoundOp = AMDGPU::S_ROUND_MODE;
6475 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6476 SetDenormOp = AMDGPU::S_DENORM_MODE;
6477 }
6478
6479 if (SetRoundOp || SetDenormOp) {
6480 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6481 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6482 unsigned ImmVal = Def->getOperand(1).getImm();
6483 if (SetRoundOp) {
6484 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6485 .addImm(ImmVal & 0xf);
6486
6487 // If we also have the denorm mode, get just the denorm mode bits.
6488 ImmVal >>= 4;
6489 }
6490
6491 if (SetDenormOp) {
6492 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6493 .addImm(ImmVal & 0xf);
6494 }
6495
6496 MI.eraseFromParent();
6497 return BB;
6498 }
6499 }
6500 }
6501
6502 // If only FP bits are touched, used the no side effects pseudo.
6503 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6504 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6505 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6506
6507 return BB;
6508 }
6509 case AMDGPU::S_INVERSE_BALLOT_U32:
6510 case AMDGPU::S_INVERSE_BALLOT_U64:
6511 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6512 // necessary. After that they are equivalent to a COPY.
6513 MI.setDesc(TII->get(AMDGPU::COPY));
6514 return BB;
6515 case AMDGPU::ENDPGM_TRAP: {
6516 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6517 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6518 MI.addOperand(MachineOperand::CreateImm(0));
6519 return BB;
6520 }
6521
6522 // We need a block split to make the real endpgm a terminator. We also don't
6523 // want to break phis in successor blocks, so we can't just delete to the
6524 // end of the block.
6525
6526 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6528 MF->push_back(TrapBB);
6529 // clang-format off
6530 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6531 .addImm(0);
6532 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6533 .addMBB(TrapBB);
6534 // clang-format on
6535
6536 BB->addSuccessor(TrapBB);
6537 MI.eraseFromParent();
6538 return SplitBB;
6539 }
6540 case AMDGPU::SIMULATED_TRAP: {
6541 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6542 MachineBasicBlock *SplitBB =
6543 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6544 MI.eraseFromParent();
6545 return SplitBB;
6546 }
6547 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6548 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6550
6551 // During ISel, it's difficult to propagate the original EXEC mask to use as
6552 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6553 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6554 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6555 Register OriginalExec = Setup->getOperand(0).getReg();
6556 MF->getRegInfo().clearKillFlags(OriginalExec);
6557 MI.getOperand(0).setReg(OriginalExec);
6558 return BB;
6559 }
6560 default:
6561 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6562 if (!MI.mayStore())
6564 return BB;
6565 }
6567 }
6568}
6569
6571 // This currently forces unfolding various combinations of fsub into fma with
6572 // free fneg'd operands. As long as we have fast FMA (controlled by
6573 // isFMAFasterThanFMulAndFAdd), we should perform these.
6574
6575 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6576 // most of these combines appear to be cycle neutral but save on instruction
6577 // count / code size.
6578 return true;
6579}
6580
6582
6584 EVT VT) const {
6585 if (!VT.isVector()) {
6586 return MVT::i1;
6587 }
6588 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6589}
6590
6592 // TODO: Should i16 be used always if legal? For now it would force VALU
6593 // shifts.
6594 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6595}
6596
6598 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6599 ? Ty.changeElementSize(16)
6600 : Ty.changeElementSize(32);
6601}
6602
6603// Answering this is somewhat tricky and depends on the specific device which
6604// have different rates for fma or all f64 operations.
6605//
6606// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6607// regardless of which device (although the number of cycles differs between
6608// devices), so it is always profitable for f64.
6609//
6610// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6611// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6612// which we can always do even without fused FP ops since it returns the same
6613// result as the separate operations and since it is always full
6614// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6615// however does not support denormals, so we do report fma as faster if we have
6616// a fast fma device and require denormals.
6617//
6619 EVT VT) const {
6620 VT = VT.getScalarType();
6621
6622 switch (VT.getSimpleVT().SimpleTy) {
6623 case MVT::f32: {
6624 // If mad is not available this depends only on if f32 fma is full rate.
6625 if (!Subtarget->hasMadMacF32Insts())
6626 return Subtarget->hasFastFMAF32();
6627
6628 // Otherwise f32 mad is always full rate and returns the same result as
6629 // the separate operations so should be preferred over fma.
6630 // However does not support denormals.
6632 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6633
6634 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6635 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6636 }
6637 case MVT::f64:
6638 return true;
6639 case MVT::f16:
6640 case MVT::bf16:
6641 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6642 default:
6643 break;
6644 }
6645
6646 return false;
6647}
6648
6650 LLT Ty) const {
6651 switch (Ty.getScalarSizeInBits()) {
6652 case 16:
6653 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6654 case 32:
6655 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6656 case 64:
6657 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6658 default:
6659 break;
6660 }
6661
6662 return false;
6663}
6664
  // FMAD is only formed for scalar values.
  if (!Ty.isScalar())
    return false;

  // 16-bit mad requires v_mad_f16 and a denormal mode that flushes f16
  // denormals (mad does not support them — see the SDNode overload below).
  if (Ty.getScalarSizeInBits() == 16)
    return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
  // Likewise 32-bit mad requires v_mad_f32/v_mac_f32 and f32 flush mode.
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
           denormalModeIsFlushAllF32(*MI.getMF());

  return false;
}
6677
                                  const SDNode *N) const {
  // TODO: Check future ftz flag
  // v_mad_f32/v_mac_f32 do not support denormals.
  EVT VT = N->getValueType(0);
  if (VT == MVT::f32)
    // NOTE(review): the denormal-mode conjunct of this condition appears to
    // be missing from this listing — verify against upstream.
    return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    // NOTE(review): same here — the second conjunct line is missing.
    return Subtarget->hasMadF16() &&
  }

  return false;
}
6693
6694//===----------------------------------------------------------------------===//
6695// Custom DAG Lowering Operations
6696//===----------------------------------------------------------------------===//
6697
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  // Only wide vector types whose two halves are themselves legal are
  // expected here.
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);

  // Split the single vector operand into low and high halves.
  auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);

  SDLoc SL(Op);
  // Apply the same opcode to each half, preserving the node flags.
  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());

  // Reassemble the full-width result.
  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
6719
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
// regression whereby extra unnecessary instructions were added to codegen
// for rotr operations, caused by legalising v2i32 or. This resulted in extra
// instructions to extract the result from the vector.
  [[maybe_unused]] EVT VT = Op.getValueType();

  // Only vector-of-i32 rotates are expected here.
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");

  // Scalarize: unroll the vector rotr into per-element operations.
  return DAG.UnrollVectorOp(Op.getNode());
}
6733
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
                                               SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  // Only wide vector types whose two halves are themselves legal are
  // expected here.
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);

  // Split both vector operands into low/high halves.
  auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDLoc SL(Op);

  // Apply the same opcode to each pair of halves, preserving node flags.
  SDValue OpLo =
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());

  // Reassemble the full-width result.
  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
6759
                                                SelectionDAG &DAG) const {
  // Split a three-operand vector op (e.g. FMA — see LowerOperation) into two
  // half-width operations to avoid full scalarization by LegalizeDAG.
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

  // Operand 0 may be a non-vector; in that case reuse it unsplit for both
  // halves instead of splitting it.
  SDValue Op0 = Op.getOperand(0);
  auto [Lo0, Hi0] = Op0.getValueType().isVector()
                        ? DAG.SplitVectorOperand(Op.getNode(), 0)
                        : std::pair(Op0, Op0);

  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);

  SDLoc SL(Op);
  // Result types for the two halves.
  auto ResVT = DAG.GetSplitDestVTs(VT);

  SDValue OpLo =
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
  SDValue OpHi =
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
6789
// NOTE(review): this listing has dropped the function signature line and a
// number of `case ISD::…:` labels (the dangling `return` statements below —
// e.g. before LowerATOMIC_CMP_SWAP, LowerExternalSymbol, the INTRINSIC_*
// handlers, the INSERT/EXTRACT/SHUFFLE vector handlers and
// LowerDYNAMIC_STACKALLOC — belong to those dropped labels). Restore from
// upstream before relying on this text.
  switch (Op.getOpcode()) {
  default:
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }
  case ISD::FSQRT: {
    // Custom sqrt expansions exist only for f32/f64.
    EVT VT = Op.getValueType();
    if (VT == MVT::f32)
      return lowerFSQRTF32(Op, DAG);
    if (VT == MVT::f64)
      return lowerFSQRTF64(Op, DAG);
    return SDValue();
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::FDIV:
    return LowerFDIV(Op, DAG);
  case ISD::FFREXP:
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    return LowerGlobalAddress(MFI, Op, DAG);
  }
    return LowerExternalSymbol(Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  // Wide vector forms of these unary ops are split into legal halves.
  case ISD::ABS:
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCANONICALIZE:
  case ISD::BSWAP:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
  case ISD::FLDEXP:
  case ISD::STRICT_FLDEXP:
    return lowerFLDEXP(Op, DAG);
  case ISD::FMA:
    return splitTernaryVectorOp(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
        Op.getValueType() == MVT::i16 &&
        Op.getOperand(0).getValueType() == MVT::f32) {
      // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
      return Op;
    }
    return LowerFP_TO_INT(Op, DAG);
  // Wide vector forms of these binary ops are split into legal halves.
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::UADDSAT:
  case ISD::USUBSAT:
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    return splitBinaryVectorOp(Op, DAG);
  case ISD::FCOPYSIGN:
    return lowerFCOPYSIGN(Op, DAG);
  case ISD::MUL:
    return lowerMUL(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKSAVE:
    return LowerSTACKSAVE(Op, DAG);
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return lowerSET_ROUNDING(Op, DAG);
  case ISD::PREFETCH:
    return lowerPREFETCH(Op, DAG);
  case ISD::FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);
  case ISD::GET_FPENV:
    return lowerGET_FPENV(Op, DAG);
  case ISD::SET_FPENV:
    return lowerSET_FPENV(Op, DAG);
  case ISD::ROTR:
    return lowerROTR(Op, DAG);
  }
  return SDValue();
}
6938
// Used for D16: Casts the result of an instruction into the right vector,
// packs values if loads return unpacked values.
                                       const SDLoc &DL, SelectionDAG &DAG,
                                       bool Unpacked) {
  // Scalar results need no adjustment.
  if (!LoadVT.isVector())
    return Result;

  // Cast back to the original packed type or to a larger type that is a
  // multiple of 32 bit for D16. Widening the return type is required for
  // legalization.
  EVT FittingLoadVT = LoadVT;
  if ((LoadVT.getVectorNumElements() % 2) == 1) {
    // NOTE(review): the EVT::getVectorVT(...) call line appears to be
    // missing from this listing — only its trailing argument survives.
    FittingLoadVT =
                        LoadVT.getVectorNumElements() + 1);
  }

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    // NOTE(review): the declaration of Elts (a SmallVector<SDValue>) appears
    // to be missing from this listing.
    DAG.ExtractVectorElements(Result, Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);

    // Pad illegal v1i16/v3f16 to v4i16
    if ((LoadVT.getVectorNumElements() % 2) == 1)
      Elts.push_back(DAG.getPOISON(MVT::i16));

    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
6981
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              SelectionDAG &DAG,
                                              bool IsIntrinsic) const {
  // Emit a D16 load as a memory intrinsic node with a widened/unpacked
  // result type, then adjust the result back to the requested type.
  SDLoc DL(M);

  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;
  if (LoadVT.isVector()) {
    if (Unpacked) {
      // Unpacked D16 returns one 32-bit lane per element.
      EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                     LoadVT.getVectorNumElements());
    } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
      // Widen v3f16 to legal type
      // NOTE(review): the EVT::getVectorVT(...) call line appears to be
      // missing from this listing — only its trailing argument survives.
      EquivLoadVT =
                          LoadVT.getVectorNumElements() + 1);
    }
  }

  // Change from v4f16/v2f16 to EquivLoadVT.
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);

  // NOTE(review): the `SDValue Load = DAG.getMemIntrinsicNode(` line appears
  // to be missing from this listing.
      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
      M->getMemoryVT(), M->getMemOperand());

  // Repack/truncate the raw load result back to the requested type.
  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);

  return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
}
7015
7016SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7017 SelectionDAG &DAG,
7018 ArrayRef<SDValue> Ops) const {
7019 SDLoc DL(M);
7020 EVT LoadVT = M->getValueType(0);
7021 EVT EltType = LoadVT.getScalarType();
7022 EVT IntVT = LoadVT.changeTypeToInteger();
7023
7024 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7025
7026 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7027 bool IsTFE = M->getNumValues() == 3;
7028
7029 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7030 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7031 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7032 : AMDGPUISD::BUFFER_LOAD;
7033
7034 if (IsD16) {
7035 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7036 }
7037
7038 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7039 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7040 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7041 IsTFE);
7042
7043 if (isTypeLegal(LoadVT)) {
7044 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7045 M->getMemOperand(), DAG);
7046 }
7047
7048 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7049 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7050 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7051 M->getMemOperand(), DAG);
7052 return DAG.getMergeValues(
7053 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7054 DL);
7055}
7056
                                   SelectionDAG &DAG) {
  // Lower an amdgcn.icmp-style intrinsic: operands are (id, lhs, rhs, pred).
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  // A non-integer predicate makes the result poison.
  if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
    return DAG.getPOISON(VT);

  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);

  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  SDLoc DL(N);

  // Promote illegal i16 comparisons to i32.
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    // NOTE(review): the initializer of PromoteOp (sign- vs zero-extend
    // choice based on the predicate) appears to be missing from this
    // listing — verify against upstream.
    unsigned PromoteOp =
    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
    RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
  }

  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);

  // The comparison produces a lane mask as wide as the wavefront.
  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);

  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
                              DAG.getCondCode(CCOpcode));
  if (VT.bitsEq(CCVT))
    return SetCC;
  // Resize the mask to the requested result type.
  return DAG.getZExtOrTrunc(SetCC, DL, VT);
}
7090
                                   SelectionDAG &DAG) {
  // Lower an amdgcn.fcmp-style intrinsic: operands are (id, src0, src1, pred).
  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);
  // A non-FP predicate makes the result poison.
  if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
    return DAG.getPOISON(VT);

  SDValue Src0 = N->getOperand(1);
  SDValue Src1 = N->getOperand(2);
  EVT CmpVT = Src0.getValueType();
  SDLoc SL(N);

  // Promote illegal f16 comparisons to f32.
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
  }

  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
  // The comparison produces a lane mask as wide as the wavefront.
  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
                              DAG.getCondCode(CCOpcode));
  if (VT.bitsEq(CCVT))
    return SetCC;
  // Resize the mask to the requested result type.
  return DAG.getZExtOrTrunc(SetCC, SL, VT);
}
7119
                                     SelectionDAG &DAG) {
  // Lower amdgcn.ballot: produce a wavefront lane mask from an i1 source.
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(1);
  SDLoc SL(N);

  if (Src.getOpcode() == ISD::SETCC) {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    // Need to expand bfloat to float for comparison (setcc).
    if (Op0.getValueType() == MVT::bf16) {
      Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
      Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
    }
    // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
  }
  if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
    // (ballot 0) -> 0
    if (Arg->isZero())
      return DAG.getConstant(0, SL, VT);

    // (ballot 1) -> EXEC/EXEC_LO
    if (Arg->isOne()) {
      // Pick the EXEC register matching the requested mask width.
      Register Exec;
      if (VT.getScalarSizeInBits() == 32)
        Exec = AMDGPU::EXEC_LO;
      else if (VT.getScalarSizeInBits() == 64)
        Exec = AMDGPU::EXEC;
      else
        return SDValue();

      return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
    }
  }

  // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
  // ISD::SETNE)
  return DAG.getNode(
      AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
      DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}
7162
7164 SelectionDAG &DAG) {
7165 EVT VT = N->getValueType(0);
7166 unsigned ValSize = VT.getSizeInBits();
7167 unsigned IID = N->getConstantOperandVal(0);
7168 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7169 IID == Intrinsic::amdgcn_permlanex16;
7170 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7171 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7172 SDLoc SL(N);
7173 MVT IntVT = MVT::getIntegerVT(ValSize);
7174 const GCNSubtarget *ST = TLI.getSubtarget();
7175 unsigned SplitSize = 32;
7176 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7177 ST->hasDPALU_DPP() &&
7178 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7179 SplitSize = 64;
7180
7181 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7182 SDValue Src2, MVT ValT) -> SDValue {
7183 SmallVector<SDValue, 8> Operands;
7184 switch (IID) {
7185 case Intrinsic::amdgcn_permlane16:
7186 case Intrinsic::amdgcn_permlanex16:
7187 case Intrinsic::amdgcn_update_dpp:
7188 Operands.push_back(N->getOperand(6));
7189 Operands.push_back(N->getOperand(5));
7190 Operands.push_back(N->getOperand(4));
7191 [[fallthrough]];
7192 case Intrinsic::amdgcn_writelane:
7193 Operands.push_back(Src2);
7194 [[fallthrough]];
7195 case Intrinsic::amdgcn_readlane:
7196 case Intrinsic::amdgcn_set_inactive:
7197 case Intrinsic::amdgcn_set_inactive_chain_arg:
7198 case Intrinsic::amdgcn_mov_dpp8:
7199 Operands.push_back(Src1);
7200 [[fallthrough]];
7201 case Intrinsic::amdgcn_readfirstlane:
7202 case Intrinsic::amdgcn_permlane64:
7203 Operands.push_back(Src0);
7204 break;
7205 default:
7206 llvm_unreachable("unhandled lane op");
7207 }
7208
7209 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7210 std::reverse(Operands.begin(), Operands.end());
7211
7212 if (SDNode *GL = N->getGluedNode()) {
7213 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7214 GL = GL->getOperand(0).getNode();
7215 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7216 SDValue(GL, 0)));
7217 }
7218
7219 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7220 };
7221
7222 SDValue Src0 = N->getOperand(1);
7223 SDValue Src1, Src2;
7224 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7225 IID == Intrinsic::amdgcn_mov_dpp8 ||
7226 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7227 Src1 = N->getOperand(2);
7228 if (IID == Intrinsic::amdgcn_writelane ||
7229 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7230 Src2 = N->getOperand(3);
7231 }
7232
7233 if (ValSize == SplitSize) {
7234 // Already legal
7235 return SDValue();
7236 }
7237
7238 if (ValSize < 32) {
7239 bool IsFloat = VT.isFloatingPoint();
7240 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7241 SL, MVT::i32);
7242
7243 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7244 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7245 SL, MVT::i32);
7246 }
7247
7248 if (IID == Intrinsic::amdgcn_writelane) {
7249 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7250 SL, MVT::i32);
7251 }
7252
7253 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7254 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7255 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7256 }
7257
7258 if (ValSize % SplitSize != 0)
7259 return SDValue();
7260
7261 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7262 EVT VT = N->getValueType(0);
7263 unsigned NE = VT.getVectorNumElements();
7264 EVT EltVT = VT.getVectorElementType();
7266 unsigned NumOperands = N->getNumOperands();
7267 SmallVector<SDValue, 4> Operands(NumOperands);
7268 SDNode *GL = N->getGluedNode();
7269
7270 // only handle convergencectrl_glue
7272
7273 for (unsigned i = 0; i != NE; ++i) {
7274 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7275 ++j) {
7276 SDValue Operand = N->getOperand(j);
7277 EVT OperandVT = Operand.getValueType();
7278 if (OperandVT.isVector()) {
7279 // A vector operand; extract a single element.
7280 EVT OperandEltVT = OperandVT.getVectorElementType();
7281 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7282 Operand, DAG.getVectorIdxConstant(i, SL));
7283 } else {
7284 // A scalar operand; just use it as is.
7285 Operands[j] = Operand;
7286 }
7287 }
7288
7289 if (GL)
7290 Operands[NumOperands - 1] =
7291 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7292 SDValue(GL->getOperand(0).getNode(), 0));
7293
7294 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7295 }
7296
7297 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7298 return DAG.getBuildVector(VecVT, SL, Scalars);
7299 };
7300
7301 if (VT.isVector()) {
7302 switch (MVT::SimpleValueType EltTy =
7304 case MVT::i32:
7305 case MVT::f32:
7306 if (SplitSize == 32) {
7307 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7308 return unrollLaneOp(LaneOp.getNode());
7309 }
7310 [[fallthrough]];
7311 case MVT::i16:
7312 case MVT::f16:
7313 case MVT::bf16: {
7314 unsigned SubVecNumElt =
7315 SplitSize / VT.getVectorElementType().getSizeInBits();
7316 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7318 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7319 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7320 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7321 DAG.getConstant(EltIdx, SL, MVT::i32));
7322
7323 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7324 IsPermLane16)
7325 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7326 DAG.getConstant(EltIdx, SL, MVT::i32));
7327
7328 if (IID == Intrinsic::amdgcn_writelane)
7329 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7330 DAG.getConstant(EltIdx, SL, MVT::i32));
7331
7332 Pieces.push_back(
7333 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7334 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7335 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7336 EltIdx += SubVecNumElt;
7337 }
7338 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7339 }
7340 default:
7341 // Handle all other cases by bitcasting to i32 vectors
7342 break;
7343 }
7344 }
7345
7346 MVT VecVT =
7347 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7348 Src0 = DAG.getBitcast(VecVT, Src0);
7349
7350 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7351 Src1 = DAG.getBitcast(VecVT, Src1);
7352
7353 if (IID == Intrinsic::amdgcn_writelane)
7354 Src2 = DAG.getBitcast(VecVT, Src2);
7355
7356 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7357 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7358 return DAG.getBitcast(VT, UnrolledLaneOp);
7359}
7360
                                          SelectionDAG &DAG) {
  // Lower a wave-wide value shuffle via ds_bpermute. On wave64 targets where
  // ds_bpermute only works within a 32-lane half, combine permlane64 with
  // two bpermutes and select per lane. (NOTE(review): the opening signature
  // line is missing from this listing — verify the helper's name upstream.)
  EVT VT = N->getValueType(0);

  // Only 32-bit values are handled here.
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  SDValue Value = N->getOperand(1);
  SDValue Index = N->getOperand(2);

  // ds_bpermute requires index to be multiplied by 4
  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
  SDValue ShiftedIndex =
      DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);

  // Intrinsics will require i32 to operate on
  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);

  // Build an INTRINSIC_WO_CHAIN node: intrinsic id first, then arguments.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue> Operands(1);
    Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
    Operands.append(IntrinArgs);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
  };

  // If we can bpermute across the whole wave, then just do that
  // NOTE(review): the guarding condition line of this branch appears to be
  // missing from this listing — verify against upstream.
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, BPermute);
  }

  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode
  SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));

  // Set inactive lanes to poison
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  // permlane64 swaps the two 32-lane halves of the wave.
  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from
  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
                    {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});

  // Bit 5 of (tid ^ index) tells whether the source lane is in the other
  // 32-lane half.
  SDValue SameOrOtherHalf =
      DAG.getNode(ISD::AND, SL, MVT::i32,
                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
                  DAG.getTargetConstant(32, SL, MVT::i32));
  SDValue UseSameHalf =
      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
                   DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
                                 BPermOtherHalfWWM);
  return DAG.getBitcast(VT, Result);
}
7438
7441 SelectionDAG &DAG) const {
7442 switch (N->getOpcode()) {
7444 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7445 Results.push_back(Res);
7446 return;
7447 }
7449 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7450 Results.push_back(Res);
7451 return;
7452 }
7454 unsigned IID = N->getConstantOperandVal(0);
7455 switch (IID) {
7456 case Intrinsic::amdgcn_make_buffer_rsrc:
7457 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7458 return;
7459 case Intrinsic::amdgcn_cvt_pkrtz: {
7460 SDValue Src0 = N->getOperand(1);
7461 SDValue Src1 = N->getOperand(2);
7462 SDLoc SL(N);
7463 SDValue Cvt =
7464 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7465 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7466 return;
7467 }
7468 case Intrinsic::amdgcn_cvt_pknorm_i16:
7469 case Intrinsic::amdgcn_cvt_pknorm_u16:
7470 case Intrinsic::amdgcn_cvt_pk_i16:
7471 case Intrinsic::amdgcn_cvt_pk_u16: {
7472 SDValue Src0 = N->getOperand(1);
7473 SDValue Src1 = N->getOperand(2);
7474 SDLoc SL(N);
7475 unsigned Opcode;
7476
7477 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7478 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7479 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7480 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7481 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7482 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7483 else
7484 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7485
7486 EVT VT = N->getValueType(0);
7487 if (isTypeLegal(VT))
7488 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7489 else {
7490 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7491 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7492 }
7493 return;
7494 }
7495 case Intrinsic::amdgcn_s_buffer_load: {
7496 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7497 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7498 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7499 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7500 // s_buffer_load_i8.
7501 if (!Subtarget->hasScalarSubwordLoads())
7502 return;
7503 SDValue Op = SDValue(N, 0);
7504 SDValue Rsrc = Op.getOperand(1);
7505 SDValue Offset = Op.getOperand(2);
7506 SDValue CachePolicy = Op.getOperand(3);
7507 EVT VT = Op.getValueType();
7508 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7509 SDLoc DL(Op);
7511 const DataLayout &DataLayout = DAG.getDataLayout();
7512 Align Alignment =
7518 VT.getStoreSize(), Alignment);
7519 SDValue LoadVal;
7520 if (!Offset->isDivergent()) {
7521 SDValue Ops[] = {Rsrc, // source register
7522 Offset, CachePolicy};
7523 SDValue BufferLoad =
7524 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7525 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7526 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7527 } else {
7528 SDValue Ops[] = {
7529 DAG.getEntryNode(), // Chain
7530 Rsrc, // rsrc
7531 DAG.getConstant(0, DL, MVT::i32), // vindex
7532 {}, // voffset
7533 {}, // soffset
7534 {}, // offset
7535 CachePolicy, // cachepolicy
7536 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7537 };
7538 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7539 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7540 }
7541 Results.push_back(LoadVal);
7542 return;
7543 }
7544 case Intrinsic::amdgcn_dead: {
7545 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7546 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7547 return;
7548 }
7549 }
7550 break;
7551 }
7553 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7554 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7555 // FIXME: Hacky
7556 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7557 Results.push_back(Res.getOperand(I));
7558 }
7559 } else {
7560 Results.push_back(Res);
7561 Results.push_back(Res.getValue(1));
7562 }
7563 return;
7564 }
7565
7566 break;
7567 }
7568 case ISD::SELECT: {
7569 SDLoc SL(N);
7570 EVT VT = N->getValueType(0);
7571 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7572 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7573 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7574
7575 EVT SelectVT = NewVT;
7576 if (NewVT.bitsLT(MVT::i32)) {
7577 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7578 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7579 SelectVT = MVT::i32;
7580 }
7581
7582 SDValue NewSelect =
7583 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7584
7585 if (NewVT != SelectVT)
7586 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7587 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7588 return;
7589 }
7590 case ISD::FNEG: {
7591 if (N->getValueType(0) != MVT::v2f16)
7592 break;
7593
7594 SDLoc SL(N);
7595 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7596
7597 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7598 DAG.getConstant(0x80008000, SL, MVT::i32));
7599 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7600 return;
7601 }
7602 case ISD::FABS: {
7603 if (N->getValueType(0) != MVT::v2f16)
7604 break;
7605
7606 SDLoc SL(N);
7607 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7608
7609 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7610 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7611 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7612 return;
7613 }
7614 case ISD::FSQRT: {
7615 if (N->getValueType(0) != MVT::f16)
7616 break;
7617 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7618 break;
7619 }
7620 default:
7622 break;
7623 }
7624}
7625
7626/// Helper function for LowerBRCOND
7627static SDNode *findUser(SDValue Value, unsigned Opcode) {
7628
7629 for (SDUse &U : Value->uses()) {
7630 if (U.get() != Value)
7631 continue;
7632
7633 if (U.getUser()->getOpcode() == Opcode)
7634 return U.getUser();
7635 }
7636 return nullptr;
7637}
7638
7639unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7640 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7641 switch (Intr->getConstantOperandVal(1)) {
7642 case Intrinsic::amdgcn_if:
7643 return AMDGPUISD::IF;
7644 case Intrinsic::amdgcn_else:
7645 return AMDGPUISD::ELSE;
7646 case Intrinsic::amdgcn_loop:
7647 return AMDGPUISD::LOOP;
7648 case Intrinsic::amdgcn_end_cf:
7649 llvm_unreachable("should not occur");
7650 default:
7651 return 0;
7652 }
7653 }
7654
7655 // break, if_break, else_break are all only used as inputs to loop, not
7656 // directly as branch conditions.
7657 return 0;
7658}
7659
7666
7668 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7669 return false;
7670
7671 // FIXME: Either avoid relying on address space here or change the default
7672 // address space for functions to avoid the explicit check.
7673 return (GV->getValueType()->isFunctionTy() ||
7676}
7677
7679 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7680}
7681
7683 if (!GV->hasExternalLinkage())
7684 return true;
7685
7686 const auto OS = getTargetMachine().getTargetTriple().getOS();
7687 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7688}
7689
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  // Peel a setcc or (xor c, nonzero) wrapper off the condition. In those cases
  // the condition is effectively negated, so the branch target stays the
  // BRCOND operand; otherwise the real target comes from the unconditional BR
  // user of this BRCOND.
  switch (Intr->getOpcode()) {
  case ISD::SETCC: {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();
    break;
  }
  case ISD::XOR: {
    // Similar to SETCC, if we have (xor c, -1), we will be fine.
    SDValue LHS = Intr->getOperand(0);
    SDValue RHS = Intr->getOperand(1);
    if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
      Intr = LHS.getNode();
      break;
    }
    [[fallthrough]];
  }
  default: {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||

  // The wrapper, if present, must be an equality-with-1 compare using SETNE,
  // i.e. a plain negation of the i1 condition.
  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  // Copy the intrinsic's operands (past chain/intrinsic-id) and append the
  // branch destination as the new last operand.
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};

  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
  }

  // The chain is always the last result of the new node.
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
                             SDValue(Result, i - 1), SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
                                Intr->getOperand(0));

  return Chain;
}
7785
7786SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7787 MVT VT = Op.getSimpleValueType();
7788 SDLoc DL(Op);
7789 // Checking the depth
7790 if (Op.getConstantOperandVal(0) != 0)
7791 return DAG.getConstant(0, DL, VT);
7792
7793 MachineFunction &MF = DAG.getMachineFunction();
7794 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7795 // Check for kernel and shader functions
7796 if (Info->isEntryFunction())
7797 return DAG.getConstant(0, DL, VT);
7798
7799 MachineFrameInfo &MFI = MF.getFrameInfo();
7800 // There is a call to @llvm.returnaddress in this function
7801 MFI.setReturnAddressIsTaken(true);
7802
7803 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7804 // Get the return address reg and mark it as an implicit live-in
7805 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7806 getRegClassFor(VT, Op.getNode()->isDivergent()));
7807
7808 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7809}
7810
7811SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7812 const SDLoc &DL, EVT VT) const {
7813 return Op.getValueType().bitsLE(VT)
7814 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7815 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7816 DAG.getTargetConstant(0, DL, MVT::i32));
7817}
7818
7819SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7820 SelectionDAG &DAG) const {
7821 EVT DstVT = Op.getValueType();
7822 unsigned NumElts = DstVT.getVectorNumElements();
7823 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7824
7825 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7826
7827 SDLoc DL(Op);
7828 unsigned Opc = Op.getOpcode();
7829 SDValue Flags = Op.getOperand(1);
7830 EVT HalfDstVT =
7831 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7832 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7833 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7834
7835 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7836}
7837
// Custom FP_ROUND lowering: packed f32 -> f16 conversions, f64 -> f16, and
// f64 -> bf16 (via a round-to-odd step through f32).
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // Vector f16 results: only f32 sources are handled here; anything wider
  // than v2f32 is split down to v2f32 pieces, v2f32 itself is left as-is.
  if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
    assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    if (SrcVT.getScalarType() != MVT::f32)
      return SDValue();
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  }

  if (SrcVT.getScalarType() != MVT::f64)
    return Op;

  SDLoc DL(Op);
  if (DstVT == MVT::f16) {
    // TODO: Handle strictfp
    if (Op.getOpcode() != ISD::FP_ROUND)
      return Op;

    // Without 16-bit instructions: produce the fp16 bits as an i32 via
    // FP_TO_FP16, then bitcast the low 16 bits to f16.
    if (!Subtarget->has16BitInsts()) {
      SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    }
    // With approximate-functions set, rounding twice through f32 is allowed.
    if (Op->getFlags().hasApproximateFuncs()) {
      SDValue Flags = Op.getOperand(1);
      SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
      return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
    }
    // Default: use the "safe" f64 -> f16 helper (presumably avoids the
    // double rounding of the f32 path above — see LowerF64ToF16Safe).
    SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
  }

  assert(DstVT.getScalarType() == MVT::bf16 &&
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");

  // Round-inexact-to-odd f64 to f32, then do the final rounding using the
  // hardware f32 -> bf16 instruction.
  EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
  SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
  return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}
7885
7886SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7887 SelectionDAG &DAG) const {
7888 EVT VT = Op.getValueType();
7889 const MachineFunction &MF = DAG.getMachineFunction();
7890 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7891 bool IsIEEEMode = Info->getMode().IEEE;
7892
7893 // FIXME: Assert during selection that this is only selected for
7894 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7895 // mode functions, but this happens to be OK since it's only done in cases
7896 // where there is known no sNaN.
7897 if (IsIEEEMode)
7898 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7899
7900 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7901 VT == MVT::v16bf16)
7902 return splitBinaryVectorOp(Op, DAG);
7903 return Op;
7904}
7905
7906SDValue
7907SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7908 SelectionDAG &DAG) const {
7909 EVT VT = Op.getValueType();
7910 const MachineFunction &MF = DAG.getMachineFunction();
7911 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7912 bool IsIEEEMode = Info->getMode().IEEE;
7913
7914 if (IsIEEEMode)
7915 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7916
7917 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7918 VT == MVT::v16bf16)
7919 return splitBinaryVectorOp(Op, DAG);
7920 return Op;
7921}
7922
7923SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7924 SelectionDAG &DAG) const {
7925 EVT VT = Op.getValueType();
7926 if (VT.isVector())
7927 return splitBinaryVectorOp(Op, DAG);
7928
7929 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7930 !Subtarget->hasMinimum3Maximum3F16() &&
7931 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7932 "should not need to widen f16 minimum/maximum to v2f16");
7933
7934 // Widen f16 operation to v2f16
7935
7936 // fminimum f16:x, f16:y ->
7937 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7938 // (v2f16 (scalar_to_vector y))), 0
7939 SDLoc SL(Op);
7940 SDValue WideSrc0 =
7941 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7942 SDValue WideSrc1 =
7943 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7944
7945 SDValue Widened =
7946 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7947
7948 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7949 DAG.getConstant(0, SL, MVT::i32));
7950}
7951
7952SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7953 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7954 EVT VT = Op.getValueType();
7955 assert(VT == MVT::f16);
7956
7957 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7958 EVT ExpVT = Exp.getValueType();
7959 if (ExpVT == MVT::i16)
7960 return Op;
7961
7962 SDLoc DL(Op);
7963
7964 // Correct the exponent type for f16 to i16.
7965 // Clamp the range of the exponent to the instruction's range.
7966
7967 // TODO: This should be a generic narrowing legalization, and can easily be
7968 // for GlobalISel.
7969
7970 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7971 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7972
7973 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7974 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7975
7976 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7977
7978 if (IsStrict) {
7979 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7980 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7981 }
7982
7983 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7984}
7985
7987 switch (Op->getOpcode()) {
7988 case ISD::SRA:
7989 case ISD::SMIN:
7990 case ISD::SMAX:
7991 return ISD::SIGN_EXTEND;
7992 case ISD::SRL:
7993 case ISD::UMIN:
7994 case ISD::UMAX:
7995 return ISD::ZERO_EXTEND;
7996 case ISD::ADD:
7997 case ISD::SUB:
7998 case ISD::AND:
7999 case ISD::OR:
8000 case ISD::XOR:
8001 case ISD::SHL:
8002 case ISD::SELECT:
8003 case ISD::MUL:
8004 // operation result won't be influenced by garbage high bits.
8005 // TODO: are all of those cases correct, and are there more?
8006 return ISD::ANY_EXTEND;
8007 case ISD::SETCC: {
8008 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8010 }
8011 default:
8012 llvm_unreachable("unexpected opcode!");
8013 }
8014}
8015
// Promote a supported operation on a narrow integer type to i32: extend the
// operands, perform the operation in i32, and truncate the result back.
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);

  // For setcc the promoted type is that of the compared operands, not the
  // i1 result type.
  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
                                 : Op->getOperand(0).getValueType();
  auto &DAG = DCI.DAG;
  auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);

  // Bail out before ops are legalized, or when the target reports that the
  // narrow form is the profitable one.
  if (DCI.isBeforeLegalizeOps() ||
      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
    return SDValue();

  SDLoc DL(Op);
  SDValue LHS;
  SDValue RHS;
  // For select, operand 0 is the condition; the promoted values are the arms.
  if (Opc == ISD::SELECT) {
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
  } else {
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  }

  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});

  // Special case: for shifts, the RHS always needs a zext.
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
  else
    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});

  // setcc always return i1/i1 vec so no need to truncate after.
  if (Opc == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
  }

  // For other ops, we extend the operation's return type as well so we need to
  // truncate back to the original type.
  SDValue NewVal;
  if (Opc == ISD::SELECT)
    NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
  else
    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});

  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
}
8070
8071SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8072 SDValue Mag = Op.getOperand(0);
8073 EVT MagVT = Mag.getValueType();
8074
8075 if (MagVT.getVectorNumElements() > 2)
8076 return splitBinaryVectorOp(Op, DAG);
8077
8078 SDValue Sign = Op.getOperand(1);
8079 EVT SignVT = Sign.getValueType();
8080
8081 if (MagVT == SignVT)
8082 return Op;
8083
8084 // fcopysign v2f16:mag, v2f32:sign ->
8085 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8086
8087 SDLoc SL(Op);
8088 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8089 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8090
8091 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8092
8093 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8094}
8095
// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  // multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  // registers, then we have to split s_mul_u64 in 32-bit multiplications.
  // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  // operands are zero-extended/sign-extended from 32-bits, then we split the
  // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  // possible to check if the operands are zero-extended or sign-extended in
  // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  // If the cost model decides that we have to use vector registers, then
  // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  // SIInstrInfo.cpp .

  // Divergent case (2): leave it to generic expansion.
  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  // 33 sign bits means the value is unchanged by sign-extension from 32 bits.
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
8155
8156SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8157 EVT VT = Op.getValueType();
8158 SDLoc SL(Op);
8159 SDValue LHS = Op.getOperand(0);
8160 SDValue RHS = Op.getOperand(1);
8161 bool isSigned = Op.getOpcode() == ISD::SMULO;
8162
8163 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8164 const APInt &C = RHSC->getAPIntValue();
8165 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8166 if (C.isPowerOf2()) {
8167 // smulo(x, signed_min) is same as umulo(x, signed_min).
8168 bool UseArithShift = isSigned && !C.isMinSignedValue();
8169 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8170 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8171 SDValue Overflow =
8172 DAG.getSetCC(SL, MVT::i1,
8173 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8174 Result, ShiftAmt),
8175 LHS, ISD::SETNE);
8176 return DAG.getMergeValues({Result, Overflow}, SL);
8177 }
8178 }
8179
8180 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8181 SDValue Top =
8182 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8183
8184 SDValue Sign = isSigned
8185 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8186 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8187 SL, MVT::i32))
8188 : DAG.getConstant(0, SL, VT);
8189 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8190
8191 return DAG.getMergeValues({Result, Overflow}, SL);
8192}
8193
8194SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8195 if (Op->isDivergent()) {
8196 // Select to V_MAD_[IU]64_[IU]32.
8197 return Op;
8198 }
8199 if (Subtarget->hasSMulHi()) {
8200 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8201 return SDValue();
8202 }
8203 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8204 // calculate the high part, so we might as well do the whole thing with
8205 // V_MAD_[IU]64_[IU]32.
8206 return Op;
8207}
8208
8209SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8210 if (!Subtarget->isTrapHandlerEnabled() ||
8211 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8212 return lowerTrapEndpgm(Op, DAG);
8213
8214 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8215 : lowerTrapHsaQueuePtr(Op, DAG);
8216}
8217
8218SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8219 SDLoc SL(Op);
8220 SDValue Chain = Op.getOperand(0);
8221 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8222}
8223
// Load one implicit kernel argument of type VT; its position is determined by
// the implicit-parameter offset of `Param` within the kernarg segment.
SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             const SDLoc &DL, Align Alignment,
                                             ImplicitParameter Param) const {
  MachineFunction &MF = DAG.getMachineFunction();
  uint64_t Offset = getImplicitParameterOffset(MF, Param);
  // Base pointer into the kernarg segment; the load is anchored on the entry
  // token rather than any incoming chain.
  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
  MachinePointerInfo PtrInfo =
  return DAG.getLoad(
      VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
}
8237
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    QueuePtr =
        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  } else {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
                                      MVT::i64);
    }
  }

  // Hand the queue pointer to the trap in SGPR0_SGPR1 via an explicit copy.
  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());

  // Emit the trap with the register listed as an operand so the copy is kept
  // live; glue from the copy is threaded in as the last operand.
  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
                   ToReg.getValue(1)};
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
8273
8274SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8275 SDLoc SL(Op);
8276 SDValue Chain = Op.getOperand(0);
8277
8278 // We need to simulate the 's_trap 2' instruction on targets that run in
8279 // PRIV=1 (where it is treated as a nop).
8280 if (Subtarget->hasPrivEnabledTrap2NopBug())
8281 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8282
8283 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8284 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8285 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8286}
8287
8288SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8289 SDLoc SL(Op);
8290 SDValue Chain = Op.getOperand(0);
8291 MachineFunction &MF = DAG.getMachineFunction();
8292
8293 if (!Subtarget->isTrapHandlerEnabled() ||
8294 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8295 LLVMContext &Ctx = MF.getFunction().getContext();
8296 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8297 "debugtrap handler not supported",
8298 Op.getDebugLoc(), DS_Warning));
8299 return Chain;
8300 }
8301
8302 uint64_t TrapID =
8303 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8304 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8305 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8306}
8307
// Returns the high 32 bits of the aperture base address for the LDS or
// private address space (used to build 64-bit flat pointers).
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    // s_mov_b64 s[6:7], src_shared_base
    // v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  }

  // Fall back to reading the aperture out of amd_queue_t via the queue ptr.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                     commonAlignment(Align(64), StructOffset),
}
8368
8369/// Return true if the value is a known valid address, such that a null check is
8370/// not necessary.
8372 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8374 return true;
8375
8376 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8377 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8378
8379 // TODO: Search through arithmetic, handle arguments and loads
8380 // marked nonnull.
8381 return false;
8382}
8383
// Lower addrspacecast (and the amdgcn.addrspacecast.nonnull intrinsic):
// flat <-> local/private conversions with null-pointer handling, and
// 32-bit constant-address extensions/truncations.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  // Extract source/destination spaces from either the cast node or the
  // nonnull intrinsic; the intrinsic form also asserts non-nullness.
  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                AMDGPU::S_MOV_B32, SL, MVT::i32,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
            0);
        Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
      }

      // A known-nonnull pointer needs no null check.
      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return Ptr;

      unsigned NullVal = TM.getNullPointerValue(DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);

      // Flat null maps to the segment's null value; everything else to the
      // truncated pointer.
      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
                         SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
        SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
        // Compute the lane ID with mbcnt_lo (+ mbcnt_hi on wave64).
        ThreadID = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
            DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
            AllOnes, ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
              DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
              AllOnes, ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
        SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                AMDGPU::S_MOV_B64, SL, MVT::i64,
                DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
            0};
        CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
      } else {
        // Build the 64-bit flat pointer from the 32-bit segment offset (low
        // half) and the segment aperture base (high half).
        SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
        CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
        return CvtPtr;

      unsigned NullVal = TM.getNullPointerValue(SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      // Segment null maps to the flat null pointer.
      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                         FlatNullPtr);
    }
  }

  // 32-bit constant address -> 64-bit: splice in the known high bits, or just
  // zero-extend when they are zero.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    if (Info->get32BitAddressHighBits() == 0)
      return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);

    SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }

  // 64-bit -> 32-bit constant address: drop the high half.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(Op->getValueType(0));
}
8514
8515// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8516// the small vector and inserting them into the big vector. That is better than
8517// the default expansion of doing it via a stack slot. Even though the use of
8518// the stack slot would be optimized away afterwards, the stack slot itself
8519// remains.
8520SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8521 SelectionDAG &DAG) const {
8522 SDValue Vec = Op.getOperand(0);
8523 SDValue Ins = Op.getOperand(1);
8524 SDValue Idx = Op.getOperand(2);
8525 EVT VecVT = Vec.getValueType();
8526 EVT InsVT = Ins.getValueType();
8527 EVT EltVT = VecVT.getVectorElementType();
8528 unsigned InsNumElts = InsVT.getVectorNumElements();
8529 unsigned IdxVal = Idx->getAsZExtVal();
8530 SDLoc SL(Op);
8531
8532 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8533 // Insert 32-bit registers at a time.
8534 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8535
8536 unsigned VecNumElts = VecVT.getVectorNumElements();
8537 EVT NewVecVT =
8538 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8539 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8541 MVT::i32, InsNumElts / 2);
8542
8543 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8544 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8545
8546 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8547 SDValue Elt;
8548 if (InsNumElts == 2) {
8549 Elt = Ins;
8550 } else {
8551 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8552 DAG.getConstant(I, SL, MVT::i32));
8553 }
8554 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8555 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8556 }
8557
8558 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8559 }
8560
8561 for (unsigned I = 0; I != InsNumElts; ++I) {
8562 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8563 DAG.getConstant(I, SL, MVT::i32));
8564 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8565 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8566 }
8567 return Vec;
8568}
8569
// Custom lowering for INSERT_VECTOR_ELT on small (<= 64-bit) vectors.
// Static indices fall through to the default expansion; dynamic indices are
// lowered to bit-mask arithmetic so no stack slot is ever created.
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDValue InsVal = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // Split the 4 x 16-bit vector into two 32-bit halves, insert into the
    // half that contains the constant index, then reassemble.
    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(0, SL, MVT::i32));
    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
                                 DAG.getConstant(1, SL, MVT::i32));

    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

    // Recombine the untouched half with the updated one.
    SDValue Concat =
        InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
                 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});

    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
  // BFM: a mask of EltSize ones shifted to the selected element's bit range.
  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
                            DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
                               DAG.getSplatBuildVector(VecVT, SL, InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue RHS =
      DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  // The two sides have disjoint set bits by construction, hence Disjoint.
  SDValue BFI =
      DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);

  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
8649
// Custom lowering for EXTRACT_VECTOR_ELT. Large vectors (128/256/512 bit) are
// recursively split in half and the index selects the half; small (<= 64-bit)
// vectors are lowered to a shift of the bitcast scalar, avoiding stack access.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    // Split the vector into two halves via i64 element extracts, then
    // select the half containing the (possibly dynamic) index.
    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    // Index within the selected half; Idx > NElem/2 - 1 picks the high half.
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the selected element down to bit 0 of the bitcast scalar.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    // FP16 results must round-trip through i16; no any-ext to FP types.
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
8751
8752static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8753 assert(Elt % 2 == 0);
8754 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8755}
8756
8757static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8758 assert(Elt % 2 == 0);
8759 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8760 !(Mask[Elt + 1] & 1);
8761}
8762
8763SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8764 SelectionDAG &DAG) const {
8765 SDLoc SL(Op);
8766 EVT ResultVT = Op.getValueType();
8767 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8768 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8769 const int NewSrcNumElts = 2;
8770 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8771 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8772
8773 // Break up the shuffle into registers sized pieces.
8774 //
8775 // We're trying to form sub-shuffles that the register allocation pipeline
8776 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8777 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8778 // pair of copies into a consecutive register copy, so use the ordinary
8779 // extract_vector_elt lowering unless we can use the shuffle.
8780 //
8781 // TODO: This is a bit of hack, and we should probably always use
8782 // extract_subvector for the largest possible subvector we can (or at least
8783 // use it for PackVT aligned pieces). However we have worse support for
8784 // combines on them don't directly treat extract_subvector / insert_subvector
8785 // as legal. The DAG scheduler also ends up doing a worse job with the
8786 // extract_subvectors.
8787 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8788
8789 // vector_shuffle <0,1,6,7> lhs, rhs
8790 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8791 //
8792 // vector_shuffle <6,7,2,3> lhs, rhs
8793 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8794 //
8795 // vector_shuffle <6,7,0,1> lhs, rhs
8796 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8797
8798 // Avoid scalarizing when both halves are reading from consecutive elements.
8799
8800 // If we're treating 2 element shuffles as legal, also create odd-to-even
8801 // shuffles of neighboring pairs.
8802 //
8803 // vector_shuffle <3,2,7,6> lhs, rhs
8804 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8805 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8806
8808 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8809 if (ShouldUseConsecutiveExtract &&
8811 const int Idx = SVN->getMaskElt(I);
8812 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8813 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8814 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8815 SVN->getOperand(VecIdx),
8816 DAG.getConstant(EltIdx, SL, MVT::i32));
8817 Pieces.push_back(SubVec);
8818 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8820 int Idx0 = SVN->getMaskElt(I);
8821 int Idx1 = SVN->getMaskElt(I + 1);
8822
8823 SDValue SrcOp0 = SVN->getOperand(0);
8824 SDValue SrcOp1 = SrcOp0;
8825 if (Idx0 >= SrcNumElts) {
8826 SrcOp0 = SVN->getOperand(1);
8827 Idx0 -= SrcNumElts;
8828 }
8829
8830 if (Idx1 >= SrcNumElts) {
8831 SrcOp1 = SVN->getOperand(1);
8832 Idx1 -= SrcNumElts;
8833 }
8834
8835 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8836 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8837
8838 // Extract nearest even aligned piece.
8839 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8840 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8841 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8842 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8843
8844 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8845 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8846
8847 SDValue Result0 = SubVec0;
8848 SDValue Result1 = SubVec0;
8849
8850 if (SubVec0 != SubVec1) {
8851 NewMaskIdx1 += NewSrcNumElts;
8852 Result1 = SubVec1;
8853 } else {
8854 Result1 = DAG.getPOISON(PackVT);
8855 }
8856
8857 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8858 {NewMaskIdx0, NewMaskIdx1});
8859 Pieces.push_back(Shuf);
8860 } else {
8861 const int Idx0 = SVN->getMaskElt(I);
8862 const int Idx1 = SVN->getMaskElt(I + 1);
8863 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8864 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8865 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8866 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8867
8868 SDValue Vec0 = SVN->getOperand(VecIdx0);
8869 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8870 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8871
8872 SDValue Vec1 = SVN->getOperand(VecIdx1);
8873 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8874 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8875 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8876 }
8877 }
8878
8879 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8880}
8881
8882SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8883 SelectionDAG &DAG) const {
8884 SDValue SVal = Op.getOperand(0);
8885 EVT ResultVT = Op.getValueType();
8886 EVT SValVT = SVal.getValueType();
8887 SDValue UndefVal = DAG.getPOISON(SValVT);
8888 SDLoc SL(Op);
8889
8891 VElts.push_back(SVal);
8892 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8893 VElts.push_back(UndefVal);
8894
8895 return DAG.getBuildVector(ResultVT, SL, VElts);
8896}
8897
8898SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8899 SelectionDAG &DAG) const {
8900 SDLoc SL(Op);
8901 EVT VT = Op.getValueType();
8902
8903 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8904 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8905
8906 SDValue Lo = Op.getOperand(0);
8907 SDValue Hi = Op.getOperand(1);
8908
8909 // Avoid adding defined bits with the zero_extend.
8910 if (Hi.isUndef()) {
8911 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8912 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8913 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8914 }
8915
8916 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8917 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8918
8919 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8920 DAG.getConstant(16, SL, MVT::i32));
8921 if (Lo.isUndef())
8922 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8923
8924 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8925 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8926
8927 SDValue Or =
8928 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8929 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8930 }
8931
8932 // Split into 2-element chunks.
8933 const unsigned NumParts = VT.getVectorNumElements() / 2;
8934 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8935 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8936
8938 for (unsigned P = 0; P < NumParts; ++P) {
8939 SDValue Vec = DAG.getBuildVector(
8940 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8941 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8942 }
8943
8944 SDValue Blend =
8945 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8946 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8947}
8948
8950 const GlobalAddressSDNode *GA) const {
8951 // OSes that use ELF REL relocations (instead of RELA) can only store a
8952 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8953 // which can create arbitrary 64-bit addends. (This is only a problem for
8954 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8955 // the high 32 bits of the addend.)
8956 //
8957 // This should be kept in sync with how HasRelocationAddend is initialized in
8958 // the constructor of ELFAMDGPUAsmBackend.
8959 if (!Subtarget->isAmdHsaOS())
8960 return false;
8961
8962 // We can fold offsets for anything that doesn't require a GOT relocation.
8963 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8967}
8968
8969static SDValue
8971 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8972 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8973 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8974 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8975 // lowered to the following code sequence:
8976 //
8977 // For constant address space:
8978 // s_getpc_b64 s[0:1]
8979 // s_add_u32 s0, s0, $symbol
8980 // s_addc_u32 s1, s1, 0
8981 //
8982 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8983 // a fixup or relocation is emitted to replace $symbol with a literal
8984 // constant, which is a pc-relative offset from the encoding of the $symbol
8985 // operand to the global variable.
8986 //
8987 // For global address space:
8988 // s_getpc_b64 s[0:1]
8989 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8990 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8991 //
8992 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8993 // fixups or relocations are emitted to replace $symbol@*@lo and
8994 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8995 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8996 // operand to the global variable.
8997 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8998 assert(GAFlags != SIInstrInfo::MO_NONE);
8999
9000 SDValue Ptr =
9001 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9002 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9003 }
9004
9005 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9006 SDValue PtrHi;
9007 if (GAFlags == SIInstrInfo::MO_NONE)
9008 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9009 else
9010 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9011 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9012}
9013
9014SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9015 SDValue Op,
9016 SelectionDAG &DAG) const {
9017 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9018 SDLoc DL(GSD);
9019 EVT PtrVT = Op.getValueType();
9020
9021 const GlobalValue *GV = GSD->getGlobal();
9027 GV->hasExternalLinkage()) {
9028 Type *Ty = GV->getValueType();
9029 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9030 // zero-sized type in other languages to declare the dynamic shared
9031 // memory which size is not known at the compile time. They will be
9032 // allocated by the runtime and placed directly after the static
9033 // allocated ones. They all share the same offset.
9034 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
9035 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9036 // Adjust alignment for that dynamic shared memory array.
9039 MFI->setUsesDynamicLDS(true);
9040 return SDValue(
9041 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9042 }
9043 }
9045 }
9046
9048 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9050 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9051 }
9052
9053 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9054 if (Subtarget->has64BitLiterals()) {
9056 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9057 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9058 0);
9059 }
9060
9061 SDValue AddrLo = DAG.getTargetGlobalAddress(
9062 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9063 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9064
9065 SDValue AddrHi = DAG.getTargetGlobalAddress(
9066 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9067 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9068
9069 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9070 }
9071
9072 if (shouldEmitFixup(GV))
9073 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9074
9075 if (shouldEmitPCReloc(GV))
9076 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9078
9079 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9081 PointerType *PtrTy =
9083 const DataLayout &DataLayout = DAG.getDataLayout();
9084 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9085 MachinePointerInfo PtrInfo =
9087
9088 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9091}
9092
9093SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9094 SelectionDAG &DAG) const {
9095 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9096 const Function &Fn = DAG.getMachineFunction().getFunction();
9097 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9098 Fn, "unsupported external symbol", Op.getDebugLoc()));
9099 return DAG.getPOISON(Op.getValueType());
9100}
9101
9103 const SDLoc &DL, SDValue V) const {
9104 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9105 // the destination register.
9106 //
9107 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9108 // so we will end up with redundant moves to m0.
9109 //
9110 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9111
9112 // A Null SDValue creates a glue result.
9113 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9114 V, Chain);
9115 return SDValue(M0, 0);
9116}
9117
9118SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9119 MVT VT,
9120 unsigned Offset) const {
9121 SDLoc SL(Op);
9122 SDValue Param = lowerKernargMemParameter(
9123 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9124 // The local size values will have the hi 16-bits as zero.
9125 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9126 DAG.getValueType(VT));
9127}
9128
9130 EVT VT) {
9133 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9134 return DAG.getPOISON(VT);
9135}
9136
9138 EVT VT) {
9141 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9142 return DAG.getPOISON(VT);
9143}
9144
9146 ArrayRef<SDValue> Elts) {
9147 assert(!Elts.empty());
9148 MVT Type;
9149 unsigned NumElts = Elts.size();
9150
9151 if (NumElts <= 12) {
9152 Type = MVT::getVectorVT(MVT::f32, NumElts);
9153 } else {
9154 assert(Elts.size() <= 16);
9155 Type = MVT::v16f32;
9156 NumElts = 16;
9157 }
9158
9159 SmallVector<SDValue, 16> VecElts(NumElts);
9160 for (unsigned i = 0; i < Elts.size(); ++i) {
9161 SDValue Elt = Elts[i];
9162 if (Elt.getValueType() != MVT::f32)
9163 Elt = DAG.getBitcast(MVT::f32, Elt);
9164 VecElts[i] = Elt;
9165 }
9166 for (unsigned i = Elts.size(); i < NumElts; ++i)
9167 VecElts[i] = DAG.getPOISON(MVT::f32);
9168
9169 if (NumElts == 1)
9170 return VecElts[0];
9171 return DAG.getBuildVector(Type, DL, VecElts);
9172}
9173
9174static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9175 SDValue Src, int ExtraElts) {
9176 EVT SrcVT = Src.getValueType();
9177
9179
9180 if (SrcVT.isVector())
9181 DAG.ExtractVectorElements(Src, Elts);
9182 else
9183 Elts.push_back(Src);
9184
9185 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9186 while (ExtraElts--)
9187 Elts.push_back(Undef);
9188
9189 return DAG.getBuildVector(CastVT, DL, Elts);
9190}
9191
9192// Re-construct the required return value for a image load intrinsic.
9193// This is more complicated due to the optional use TexFailCtrl which means the
9194// required return type is an aggregate
9196 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9197 bool Unpacked, bool IsD16, int DMaskPop,
9198 int NumVDataDwords, bool IsAtomicPacked16Bit,
9199 const SDLoc &DL) {
9200 // Determine the required return type. This is the same regardless of
9201 // IsTexFail flag
9202 EVT ReqRetVT = ResultTypes[0];
9203 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9204 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9205 ? (ReqRetNumElts + 1) / 2
9206 : ReqRetNumElts;
9207
9208 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9209
9210 MVT DataDwordVT =
9211 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9212
9213 MVT MaskPopVT =
9214 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9215
9216 SDValue Data(Result, 0);
9217 SDValue TexFail;
9218
9219 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9220 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9221 if (MaskPopVT.isVector()) {
9222 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9223 SDValue(Result, 0), ZeroIdx);
9224 } else {
9225 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9226 SDValue(Result, 0), ZeroIdx);
9227 }
9228 }
9229
9230 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9231 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9232 NumDataDwords - MaskPopDwords);
9233
9234 if (IsD16)
9235 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9236
9237 EVT LegalReqRetVT = ReqRetVT;
9238 if (!ReqRetVT.isVector()) {
9239 if (!Data.getValueType().isInteger())
9240 Data = DAG.getNode(ISD::BITCAST, DL,
9241 Data.getValueType().changeTypeToInteger(), Data);
9242 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9243 } else {
9244 // We need to widen the return vector to a legal type
9245 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9246 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9247 LegalReqRetVT =
9249 ReqRetVT.getVectorNumElements() + 1);
9250 }
9251 }
9252 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9253
9254 if (IsTexFail) {
9255 TexFail =
9256 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9257 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9258
9259 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9260 }
9261
9262 if (Result->getNumValues() == 1)
9263 return Data;
9264
9265 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9266}
9267
9268static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9269 SDValue *LWE, bool &IsTexFail) {
9270 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9271
9272 uint64_t Value = TexFailCtrlConst->getZExtValue();
9273 if (Value) {
9274 IsTexFail = true;
9275 }
9276
9277 SDLoc DL(TexFailCtrlConst);
9278 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9279 Value &= ~(uint64_t)0x1;
9280 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9281 Value &= ~(uint64_t)0x2;
9282
9283 return Value == 0;
9284}
9285
9287 MVT PackVectorVT,
9288 SmallVectorImpl<SDValue> &PackedAddrs,
9289 unsigned DimIdx, unsigned EndIdx,
9290 unsigned NumGradients) {
9291 SDLoc DL(Op);
9292 for (unsigned I = DimIdx; I < EndIdx; I++) {
9293 SDValue Addr = Op.getOperand(I);
9294
9295 // Gradients are packed with undef for each coordinate.
9296 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9297 // 1D: undef,dx/dh; undef,dx/dv
9298 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9299 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9300 if (((I + 1) >= EndIdx) ||
9301 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9302 I == DimIdx + NumGradients - 1))) {
9303 if (Addr.getValueType() != MVT::i16)
9304 Addr = DAG.getBitcast(MVT::i16, Addr);
9305 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9306 } else {
9307 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9308 I++;
9309 }
9310 Addr = DAG.getBitcast(MVT::f32, Addr);
9311 PackedAddrs.push_back(Addr);
9312 }
9313}
9314
9315SDValue SITargetLowering::lowerImage(SDValue Op,
9317 SelectionDAG &DAG, bool WithChain) const {
9318 SDLoc DL(Op);
9319 MachineFunction &MF = DAG.getMachineFunction();
9320 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9321 unsigned IntrOpcode = Intr->BaseOpcode;
9322 // For image atomic: use no-return opcode if result is unused.
9323 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9324 !Op.getNode()->hasAnyUseOfValue(0))
9325 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9326 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9328 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9329 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9330 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9331 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9332
9333 SmallVector<EVT, 3> ResultTypes(Op->values());
9334 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9335 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9336 ResultTypes.erase(&ResultTypes[0]);
9337
9338 bool IsD16 = false;
9339 bool IsG16 = false;
9340 bool IsA16 = false;
9341 SDValue VData;
9342 int NumVDataDwords = 0;
9343 bool AdjustRetType = false;
9344 bool IsAtomicPacked16Bit = false;
9345
9346 // Offset of intrinsic arguments
9347 const unsigned ArgOffset = WithChain ? 2 : 1;
9348
9349 unsigned DMask;
9350 unsigned DMaskLanes = 0;
9351
9352 if (BaseOpcode->Atomic) {
9353 VData = Op.getOperand(2);
9354
9355 IsAtomicPacked16Bit =
9356 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9357 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9358 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9359 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9360
9361 bool Is64Bit = VData.getValueSizeInBits() == 64;
9362 if (BaseOpcode->AtomicX2) {
9363 SDValue VData2 = Op.getOperand(3);
9364 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9365 {VData, VData2});
9366 if (Is64Bit)
9367 VData = DAG.getBitcast(MVT::v4i32, VData);
9368
9369 if (!BaseOpcode->NoReturn)
9370 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9371
9372 DMask = Is64Bit ? 0xf : 0x3;
9373 NumVDataDwords = Is64Bit ? 4 : 2;
9374 } else {
9375 DMask = Is64Bit ? 0x3 : 0x1;
9376 NumVDataDwords = Is64Bit ? 2 : 1;
9377 }
9378 } else {
9379 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9380 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9381
9382 if (BaseOpcode->Store) {
9383 VData = Op.getOperand(2);
9384
9385 MVT StoreVT = VData.getSimpleValueType();
9386 if (StoreVT.getScalarType() == MVT::f16) {
9387 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9388 return Op; // D16 is unsupported for this instruction
9389
9390 IsD16 = true;
9391 VData = handleD16VData(VData, DAG, true);
9392 }
9393
9394 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9395 } else if (!BaseOpcode->NoReturn) {
9396 // Work out the num dwords based on the dmask popcount and underlying type
9397 // and whether packing is supported.
9398 MVT LoadVT = ResultTypes[0].getSimpleVT();
9399 if (LoadVT.getScalarType() == MVT::f16) {
9400 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9401 return Op; // D16 is unsupported for this instruction
9402
9403 IsD16 = true;
9404 }
9405
9406 // Confirm that the return type is large enough for the dmask specified
9407 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9408 (!LoadVT.isVector() && DMaskLanes > 1))
9409 return Op;
9410
9411 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9412 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9413 // instructions.
9414 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9415 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9416 NumVDataDwords = (DMaskLanes + 1) / 2;
9417 else
9418 NumVDataDwords = DMaskLanes;
9419
9420 AdjustRetType = true;
9421 }
9422 }
9423
9424 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9426
9427 // Check for 16 bit addresses or derivatives and pack if true.
9428 MVT VAddrVT =
9429 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9430 MVT VAddrScalarVT = VAddrVT.getScalarType();
9431 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9432 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9433
9434 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9435 VAddrScalarVT = VAddrVT.getScalarType();
9436 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9437 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9438
9439 // Push back extra arguments.
9440 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9441 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9442 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9443 // Special handling of bias when A16 is on. Bias is of type half but
9444 // occupies full 32-bit.
9445 SDValue Bias = DAG.getBuildVector(
9446 MVT::v2f16, DL,
9447 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9448 VAddrs.push_back(Bias);
9449 } else {
9450 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9451 "Bias needs to be converted to 16 bit in A16 mode");
9452 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9453 }
9454 }
9455
9456 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9457 // 16 bit gradients are supported, but are tied to the A16 control
9458 // so both gradients and addresses must be 16 bit
9459 LLVM_DEBUG(
9460 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9461 "require 16 bit args for both gradients and addresses");
9462 return Op;
9463 }
9464
9465 if (IsA16) {
9466 if (!ST->hasA16()) {
9467 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9468 "support 16 bit addresses\n");
9469 return Op;
9470 }
9471 }
9472
9473 // We've dealt with incorrect input so we know that if IsA16, IsG16
9474 // are set then we have to compress/pack operands (either address,
9475 // gradient or both)
9476 // In the case where a16 and gradients are tied (no G16 support) then we
9477 // have already verified that both IsA16 and IsG16 are true
9478 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9479 // Activate g16
9480 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9482 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9483 }
9484
9485 // Add gradients (packed or unpacked)
9486 if (IsG16) {
9487 // Pack the gradients
9488 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9489 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9490 ArgOffset + Intr->GradientStart,
9491 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9492 } else {
9493 for (unsigned I = ArgOffset + Intr->GradientStart;
9494 I < ArgOffset + Intr->CoordStart; I++)
9495 VAddrs.push_back(Op.getOperand(I));
9496 }
9497
9498 // Add addresses (packed or unpacked)
9499 if (IsA16) {
9500 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9501 ArgOffset + Intr->CoordStart, VAddrEnd,
9502 0 /* No gradients */);
9503 } else {
9504 // Add uncompressed address
9505 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9506 VAddrs.push_back(Op.getOperand(I));
9507 }
9508
9509 // If the register allocator cannot place the address registers contiguously
9510 // without introducing moves, then using the non-sequential address encoding
9511 // is always preferable, since it saves VALU instructions and is usually a
9512 // wash in terms of code size or even better.
9513 //
9514 // However, we currently have no way of hinting to the register allocator that
9515 // MIMG addresses should be placed contiguously when it is possible to do so,
9516 // so force non-NSA for the common 2-address case as a heuristic.
9517 //
9518 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9519 // allocation when possible.
9520 //
9521 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9522 // set of the remaining addresses.
9523 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9524 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9525 const bool UseNSA = ST->hasNSAEncoding() &&
9526 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9527 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9528 const bool UsePartialNSA =
9529 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9530
9531 SDValue VAddr;
9532 if (UsePartialNSA) {
9533 VAddr = getBuildDwordsVector(DAG, DL,
9534 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9535 } else if (!UseNSA) {
9536 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9537 }
9538
9539 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9540 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9541 SDValue Unorm;
9542 if (!BaseOpcode->Sampler) {
9543 Unorm = True;
9544 } else {
9545 uint64_t UnormConst =
9546 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9547
9548 Unorm = UnormConst ? True : False;
9549 }
9550
9551 SDValue TFE;
9552 SDValue LWE;
9553 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9554 bool IsTexFail = false;
9555 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9556 return Op;
9557
9558 if (IsTexFail) {
9559 if (!DMaskLanes) {
9560 // Expecting to get an error flag since TFC is on - and dmask is 0
9561 // Force dmask to be at least 1 otherwise the instruction will fail
9562 DMask = 0x1;
9563 DMaskLanes = 1;
9564 NumVDataDwords = 1;
9565 }
9566 NumVDataDwords += 1;
9567 AdjustRetType = true;
9568 }
9569
9570 // Has something earlier tagged that the return type needs adjusting
9571 // This happens if the instruction is a load or has set TexFailCtrl flags
9572 if (AdjustRetType) {
9573 // NumVDataDwords reflects the true number of dwords required in the return
9574 // type
9575 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9576 // This is a no-op load. This can be eliminated
9577 SDValue Undef = DAG.getPOISON(Op.getValueType());
9578 if (isa<MemSDNode>(Op))
9579 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9580 return Undef;
9581 }
9582
9583 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9584 MVT::i32, NumVDataDwords)
9585 : MVT::i32;
9586
9587 ResultTypes[0] = NewVT;
9588 if (ResultTypes.size() == 3) {
9589 // Original result was aggregate type used for TexFailCtrl results
9590 // The actual instruction returns as a vector type which has now been
9591 // created. Remove the aggregate result.
9592 ResultTypes.erase(&ResultTypes[1]);
9593 }
9594 }
9595
9596 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9597 // Keep GLC only when the atomic's result is actually used.
9598 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9600 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9602 return Op;
9603
9605 if (BaseOpcode->Store || BaseOpcode->Atomic)
9606 Ops.push_back(VData); // vdata
9607 if (UsePartialNSA) {
9608 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9609 Ops.push_back(VAddr);
9610 } else if (UseNSA)
9611 append_range(Ops, VAddrs);
9612 else
9613 Ops.push_back(VAddr);
9614 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9615 EVT RsrcVT = Rsrc.getValueType();
9616 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9617 return Op;
9618 Ops.push_back(Rsrc);
9619 if (BaseOpcode->Sampler) {
9620 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9621 if (Samp.getValueType() != MVT::v4i32)
9622 return Op;
9623 Ops.push_back(Samp);
9624 }
9625 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9626 if (IsGFX10Plus)
9627 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9628 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9629 Ops.push_back(Unorm);
9630 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9631 Ops.push_back(IsA16 && // r128, a16 for gfx9
9632 ST->hasFeature(AMDGPU::FeatureR128A16)
9633 ? True
9634 : False);
9635 if (IsGFX10Plus)
9636 Ops.push_back(IsA16 ? True : False);
9637
9638 if (!Subtarget->hasGFX90AInsts())
9639 Ops.push_back(TFE); // tfe
9640 else if (TFE->getAsZExtVal()) {
9641 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9643 "TFE is not supported on this GPU", DL.getDebugLoc()));
9644 }
9645
9646 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9647 Ops.push_back(LWE); // lwe
9648 if (!IsGFX10Plus)
9649 Ops.push_back(DimInfo->DA ? True : False);
9650 if (BaseOpcode->HasD16)
9651 Ops.push_back(IsD16 ? True : False);
9652 if (isa<MemSDNode>(Op))
9653 Ops.push_back(Op.getOperand(0)); // chain
9654
9655 int NumVAddrDwords =
9656 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9657 int Opcode = -1;
9658
9659 if (IsGFX12Plus) {
9660 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9661 NumVDataDwords, NumVAddrDwords);
9662 } else if (IsGFX11Plus) {
9663 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9664 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9665 : AMDGPU::MIMGEncGfx11Default,
9666 NumVDataDwords, NumVAddrDwords);
9667 } else if (IsGFX10Plus) {
9668 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9669 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9670 : AMDGPU::MIMGEncGfx10Default,
9671 NumVDataDwords, NumVAddrDwords);
9672 } else {
9673 if (Subtarget->hasGFX90AInsts()) {
9674 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9675 NumVDataDwords, NumVAddrDwords);
9676 if (Opcode == -1) {
9677 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9679 "requested image instruction is not supported on this GPU",
9680 DL.getDebugLoc()));
9681
9682 unsigned Idx = 0;
9683 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9684 for (EVT VT : OrigResultTypes) {
9685 if (VT == MVT::Other)
9686 RetValues[Idx++] = Op.getOperand(0); // Chain
9687 else
9688 RetValues[Idx++] = DAG.getPOISON(VT);
9689 }
9690
9691 return DAG.getMergeValues(RetValues, DL);
9692 }
9693 }
9694 if (Opcode == -1 &&
9695 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9696 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9697 NumVDataDwords, NumVAddrDwords);
9698 if (Opcode == -1)
9699 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9700 NumVDataDwords, NumVAddrDwords);
9701 }
9702 if (Opcode == -1)
9703 return Op;
9704
9705 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9706 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9707 MachineMemOperand *MemRef = MemOp->getMemOperand();
9708 DAG.setNodeMemRefs(NewNode, {MemRef});
9709 }
9710
9711 if (BaseOpcode->NoReturn) {
9712 if (BaseOpcode->Atomic)
9713 return DAG.getMergeValues(
9714 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9715
9716 return SDValue(NewNode, 0);
9717 }
9718
9719 if (BaseOpcode->AtomicX2) {
9721 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9722 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9723 }
9724
9725 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9726 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9727 NumVDataDwords, IsAtomicPacked16Bit, DL);
9728}
9729
9730SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9731                                       SDValue Offset, SDValue CachePolicy,
9732                                       SelectionDAG &DAG) const {
  // Lower an s.buffer.load of type VT from buffer resource Rsrc at Offset.
  // A uniform Offset is lowered to scalar SBUFFER_LOAD nodes; a divergent
  // Offset falls back to MUBUF BUFFER_LOAD, splitting wide results into
  // 16-byte pieces.
9733  MachineFunction &MF = DAG.getMachineFunction();
9734
9735  const DataLayout &DataLayout = DAG.getDataLayout();
9736  Align Alignment =
9737      DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9738
  // NOTE(review): the memory-operand flag arguments (between the pointer
  // info and the size) appear to have been dropped from this copy of the
  // file; upstream passes MOLoad | MODereferenceable | MOInvariant here --
  // confirm against the original source.
9739  MachineMemOperand *MMO = MF.getMachineMemOperand(
9740      MachinePointerInfo(),
9743      VT.getStoreSize(), Alignment);
9744
9745  if (!Offset->isDivergent()) {
9746    SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9747
9748    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9749    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9750    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9751    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9752    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9753      SDValue BufferLoad =
9754          DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9755                                  DAG.getVTList(MVT::i32), Ops, VT, MMO);
      // The subword load yields a 32-bit node; truncate back to i16.
9756      return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9757    }
9758
9759    // Widen vec3 load to vec4.
9760    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9761        !Subtarget->hasScalarDwordx3Loads()) {
      // NOTE(review): the initializer of WidenedVT is missing from this copy;
      // upstream widens to a 4-element vector with VT's element type --
      // confirm against the original source.
9762      EVT WidenedVT =
9764      auto WidenedOp = DAG.getMemIntrinsicNode(
9765          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9766          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
      // Extract the original 3 elements from the widened 4-element result.
9767      auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9768                                   DAG.getVectorIdxConstant(0, DL));
9769      return Subvector;
9770    }
9771
9772    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9773                                   DAG.getVTList(VT), Ops, VT, MMO);
9774  }
9775
9776  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9777  // assume that the buffer is unswizzled.
9778  SDValue Ops[] = {
9779      DAG.getEntryNode(),                    // Chain
9780      Rsrc,                                  // rsrc
9781      DAG.getConstant(0, DL, MVT::i32),      // vindex
9782      {},                                    // voffset
9783      {},                                    // soffset
9784      {},                                    // offset
9785      CachePolicy,                           // cachepolicy
9786      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9787  };
9788  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    // setBufferOffsets populates voffset/soffset/offset (Ops[3..5]).
9789    setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9790    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9791  }
9792
  // NOTE(review): a declaration (upstream: SmallVector<SDValue, 4> Loads;)
  // appears to be missing from this copy at this point -- the loop below
  // pushes into `Loads`. Confirm against the original source.
9793
9794  unsigned NumLoads = 1;
9795  MVT LoadVT = VT.getSimpleVT();
9796  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9797  assert((LoadVT.getScalarType() == MVT::i32 ||
9798          LoadVT.getScalarType() == MVT::f32));
9799
  // Results wider than 4 dwords are split into multiple 4-dword loads that
  // are concatenated at the end.
9800  if (NumElts == 8 || NumElts == 16) {
9801    NumLoads = NumElts / 4;
9802    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9803  }
9804
9805  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9806
9807  // Use the alignment to ensure that the required offsets will fit into the
9808  // immediate offsets.
9809  setBufferOffsets(Offset, DAG, &Ops[3],
9810                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9811
9812  uint64_t InstOffset = Ops[5]->getAsZExtVal();
9813  for (unsigned i = 0; i < NumLoads; ++i) {
    // Each split load reads the next 16 bytes.
9814    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9815    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9816                                        LoadVT, MMO, DAG));
9817  }
9818
9819  if (NumElts == 8 || NumElts == 16)
9820    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9821
9822  return Loads[0];
9823}
9824
9825SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9826 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9827 if (!Subtarget->hasArchitectedSGPRs())
9828 return {};
9829 SDLoc SL(Op);
9830 MVT VT = MVT::i32;
9831 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9832 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9833 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9834}
9835
9836SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9837 AMDGPU::Hwreg::Id HwReg,
9838 unsigned LowBit,
9839 unsigned Width) const {
9840 SDLoc SL(Op);
9841 using namespace AMDGPU::Hwreg;
9842 return {DAG.getMachineNode(
9843 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9844 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9845 SL, MVT::i32)),
9846 0};
9847}
9848
9849SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9850 unsigned Dim,
9851 const ArgDescriptor &Arg) const {
9852 SDLoc SL(Op);
9853 MachineFunction &MF = DAG.getMachineFunction();
9854 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9855 if (MaxID == 0)
9856 return DAG.getConstant(0, SL, MVT::i32);
9857
9858 // It's undefined behavior if a function marked with the amdgpu-no-*
9859 // attributes uses the corresponding intrinsic.
9860 if (!Arg)
9861 return DAG.getPOISON(Op->getValueType(0));
9862
9863 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9864 SDLoc(DAG.getEntryNode()), Arg);
9865
9866 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9867 // masking operations anyway.
9868 //
9869 // TODO: We could assert the top bit is 0 for the source copy.
9870 if (Arg.isMasked())
9871 return Val;
9872
9873 // Preserve the known bits after expansion to a copy.
9874 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9875 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9876 DAG.getValueType(SmallVT));
9877}
9878
9879SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9880 SelectionDAG &DAG) const {
9881 MachineFunction &MF = DAG.getMachineFunction();
9882 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9883
9884 EVT VT = Op.getValueType();
9885 SDLoc DL(Op);
9886 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9887
9888 // TODO: Should this propagate fast-math-flags?
9889
9890 switch (IntrinsicID) {
9891 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9892 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9893 return emitNonHSAIntrinsicError(DAG, DL, VT);
9894 return getPreloadedValue(DAG, *MFI, VT,
9896 }
9897 case Intrinsic::amdgcn_dispatch_ptr:
9898 case Intrinsic::amdgcn_queue_ptr: {
9899 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9900 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9901 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9902 DL.getDebugLoc()));
9903 return DAG.getPOISON(VT);
9904 }
9905
9906 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9909 return getPreloadedValue(DAG, *MFI, VT, RegID);
9910 }
9911 case Intrinsic::amdgcn_implicitarg_ptr: {
9912 if (MFI->isEntryFunction())
9913 return getImplicitArgPtr(DAG, DL);
9914 return getPreloadedValue(DAG, *MFI, VT,
9916 }
9917 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9918 if (!AMDGPU::isKernel(MF.getFunction())) {
9919 // This only makes sense to call in a kernel, so just lower to null.
9920 return DAG.getConstant(0, DL, VT);
9921 }
9922
9923 return getPreloadedValue(DAG, *MFI, VT,
9925 }
9926 case Intrinsic::amdgcn_dispatch_id: {
9927 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9928 }
9929 case Intrinsic::amdgcn_rcp:
9930 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9931 case Intrinsic::amdgcn_rsq:
9932 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9933 case Intrinsic::amdgcn_rsq_legacy:
9934 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9935 return emitRemovedIntrinsicError(DAG, DL, VT);
9936 return SDValue();
9937 case Intrinsic::amdgcn_rcp_legacy:
9938 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9939 return emitRemovedIntrinsicError(DAG, DL, VT);
9940 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9941 case Intrinsic::amdgcn_rsq_clamp: {
9942 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9943 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9944
9945 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9946 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9947 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9948
9949 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9950 SDValue Tmp =
9951 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9952 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9953 DAG.getConstantFP(Min, DL, VT));
9954 }
9955 case Intrinsic::r600_read_ngroups_x:
9956 if (Subtarget->isAmdHsaOS())
9957 return emitNonHSAIntrinsicError(DAG, DL, VT);
9958
9959 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9961 false);
9962 case Intrinsic::r600_read_ngroups_y:
9963 if (Subtarget->isAmdHsaOS())
9964 return emitNonHSAIntrinsicError(DAG, DL, VT);
9965
9966 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9968 false);
9969 case Intrinsic::r600_read_ngroups_z:
9970 if (Subtarget->isAmdHsaOS())
9971 return emitNonHSAIntrinsicError(DAG, DL, VT);
9972
9973 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9975 false);
9976 case Intrinsic::r600_read_local_size_x:
9977 if (Subtarget->isAmdHsaOS())
9978 return emitNonHSAIntrinsicError(DAG, DL, VT);
9979
9980 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9982 case Intrinsic::r600_read_local_size_y:
9983 if (Subtarget->isAmdHsaOS())
9984 return emitNonHSAIntrinsicError(DAG, DL, VT);
9985
9986 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9988 case Intrinsic::r600_read_local_size_z:
9989 if (Subtarget->isAmdHsaOS())
9990 return emitNonHSAIntrinsicError(DAG, DL, VT);
9991
9992 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9994 case Intrinsic::amdgcn_workgroup_id_x:
9995 return lowerWorkGroupId(DAG, *MFI, VT,
9999 case Intrinsic::amdgcn_workgroup_id_y:
10000 return lowerWorkGroupId(DAG, *MFI, VT,
10004 case Intrinsic::amdgcn_workgroup_id_z:
10005 return lowerWorkGroupId(DAG, *MFI, VT,
10009 case Intrinsic::amdgcn_cluster_id_x:
10010 return Subtarget->hasClusters()
10011 ? getPreloadedValue(DAG, *MFI, VT,
10013 : DAG.getPOISON(VT);
10014 case Intrinsic::amdgcn_cluster_id_y:
10015 return Subtarget->hasClusters()
10016 ? getPreloadedValue(DAG, *MFI, VT,
10018 : DAG.getPOISON(VT);
10019 case Intrinsic::amdgcn_cluster_id_z:
10020 return Subtarget->hasClusters()
10021 ? getPreloadedValue(DAG, *MFI, VT,
10023 : DAG.getPOISON(VT);
10024 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10025 return Subtarget->hasClusters()
10026 ? getPreloadedValue(
10027 DAG, *MFI, VT,
10029 : DAG.getPOISON(VT);
10030 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10031 return Subtarget->hasClusters()
10032 ? getPreloadedValue(
10033 DAG, *MFI, VT,
10035 : DAG.getPOISON(VT);
10036 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10037 return Subtarget->hasClusters()
10038 ? getPreloadedValue(
10039 DAG, *MFI, VT,
10041 : DAG.getPOISON(VT);
10042 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10043 return Subtarget->hasClusters()
10044 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10045 : SDValue();
10046 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10047 return Subtarget->hasClusters()
10048 ? getPreloadedValue(
10049 DAG, *MFI, VT,
10051 : DAG.getPOISON(VT);
10052 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10053 return Subtarget->hasClusters()
10054 ? getPreloadedValue(
10055 DAG, *MFI, VT,
10057 : DAG.getPOISON(VT);
10058 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10059 return Subtarget->hasClusters()
10060 ? getPreloadedValue(
10061 DAG, *MFI, VT,
10063 : DAG.getPOISON(VT);
10064 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10065 return Subtarget->hasClusters()
10066 ? getPreloadedValue(
10067 DAG, *MFI, VT,
10069 : DAG.getPOISON(VT);
10070 case Intrinsic::amdgcn_wave_id:
10071 return lowerWaveID(DAG, Op);
10072 case Intrinsic::amdgcn_lds_kernel_id: {
10073 if (MFI->isEntryFunction())
10074 return getLDSKernelId(DAG, DL);
10075 return getPreloadedValue(DAG, *MFI, VT,
10077 }
10078 case Intrinsic::amdgcn_workitem_id_x:
10079 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10080 case Intrinsic::amdgcn_workitem_id_y:
10081 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10082 case Intrinsic::amdgcn_workitem_id_z:
10083 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10084 case Intrinsic::amdgcn_wavefrontsize:
10085 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10086 SDLoc(Op), MVT::i32);
10087 case Intrinsic::amdgcn_s_buffer_load: {
10088 unsigned CPol = Op.getConstantOperandVal(3);
10089 // s_buffer_load, because of how it's optimized, can't be volatile
10090 // so reject ones with the volatile bit set.
10091 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10094 return Op;
10095 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10096 Op.getOperand(3), DAG);
10097 }
10098 case Intrinsic::amdgcn_fdiv_fast:
10099 return lowerFDIV_FAST(Op, DAG);
10100 case Intrinsic::amdgcn_sin:
10101 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10102
10103 case Intrinsic::amdgcn_cos:
10104 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10105
10106 case Intrinsic::amdgcn_mul_u24:
10107 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10108 Op.getOperand(2));
10109 case Intrinsic::amdgcn_mul_i24:
10110 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10111 Op.getOperand(2));
10112
10113 case Intrinsic::amdgcn_log_clamp: {
10114 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10115 return SDValue();
10116
10117 return emitRemovedIntrinsicError(DAG, DL, VT);
10118 }
10119 case Intrinsic::amdgcn_fract:
10120 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10121
10122 case Intrinsic::amdgcn_class:
10123 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10124 Op.getOperand(2));
10125 case Intrinsic::amdgcn_div_fmas:
10126 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10127 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10128
10129 case Intrinsic::amdgcn_div_fixup:
10130 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10131 Op.getOperand(2), Op.getOperand(3));
10132
10133 case Intrinsic::amdgcn_div_scale: {
10134 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10135
10136 // Translate to the operands expected by the machine instruction. The
10137 // first parameter must be the same as the first instruction.
10138 SDValue Numerator = Op.getOperand(1);
10139 SDValue Denominator = Op.getOperand(2);
10140
10141 // Note this order is opposite of the machine instruction's operations,
10142 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10143 // intrinsic has the numerator as the first operand to match a normal
10144 // division operation.
10145
10146 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10147
10148 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10149 Denominator, Numerator);
10150 }
10151 case Intrinsic::amdgcn_icmp: {
10152 // There is a Pat that handles this variant, so return it as-is.
10153 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10154 Op.getConstantOperandVal(2) == 0 &&
10155 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10156 return Op;
10157 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10158 }
10159 case Intrinsic::amdgcn_fcmp: {
10160 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10161 }
10162 case Intrinsic::amdgcn_ballot:
10163 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10164 case Intrinsic::amdgcn_fmed3:
10165 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10166 Op.getOperand(2), Op.getOperand(3));
10167 case Intrinsic::amdgcn_fdot2:
10168 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10169 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10170 case Intrinsic::amdgcn_fmul_legacy:
10171 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10172 Op.getOperand(2));
10173 case Intrinsic::amdgcn_sffbh:
10174 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10175 case Intrinsic::amdgcn_sbfe:
10176 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10177 Op.getOperand(2), Op.getOperand(3));
10178 case Intrinsic::amdgcn_ubfe:
10179 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10180 Op.getOperand(2), Op.getOperand(3));
10181 case Intrinsic::amdgcn_cvt_pkrtz:
10182 case Intrinsic::amdgcn_cvt_pknorm_i16:
10183 case Intrinsic::amdgcn_cvt_pknorm_u16:
10184 case Intrinsic::amdgcn_cvt_pk_i16:
10185 case Intrinsic::amdgcn_cvt_pk_u16: {
10186 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10187 EVT VT = Op.getValueType();
10188 unsigned Opcode;
10189
10190 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10191 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10192 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10193 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10194 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10195 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10196 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10197 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10198 else
10199 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10200
10201 if (isTypeLegal(VT))
10202 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10203
10204 SDValue Node =
10205 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10206 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10207 }
10208 case Intrinsic::amdgcn_fmad_ftz:
10209 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10210 Op.getOperand(2), Op.getOperand(3));
10211
10212 case Intrinsic::amdgcn_if_break:
10213 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10214 Op->getOperand(1), Op->getOperand(2)),
10215 0);
10216
10217 case Intrinsic::amdgcn_groupstaticsize: {
10219 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10220 return Op;
10221
10222 const Module *M = MF.getFunction().getParent();
10223 const GlobalValue *GV =
10224 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10225 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10227 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10228 }
10229 case Intrinsic::amdgcn_is_shared:
10230 case Intrinsic::amdgcn_is_private: {
10231 SDLoc SL(Op);
10232 SDValue SrcVec =
10233 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10234 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10235 DAG.getConstant(1, SL, MVT::i32));
10236
10237 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10239 : AMDGPUAS::PRIVATE_ADDRESS;
10240 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10241 Subtarget->hasGloballyAddressableScratch()) {
10242 SDValue FlatScratchBaseHi(
10243 DAG.getMachineNode(
10244 AMDGPU::S_MOV_B32, DL, MVT::i32,
10245 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10246 0);
10247 // Test bits 63..58 against the aperture address.
10248 return DAG.getSetCC(
10249 SL, MVT::i1,
10250 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10251 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10252 }
10253
10254 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10255 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10256 }
10257 case Intrinsic::amdgcn_perm:
10258 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10259 Op.getOperand(2), Op.getOperand(3));
10260 case Intrinsic::amdgcn_reloc_constant: {
10261 Module *M = MF.getFunction().getParent();
10262 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10263 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10264 auto *RelocSymbol = cast<GlobalVariable>(
10265 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10266 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10268 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10269 }
10270 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10271 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10272 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10273 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10274 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10275 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10276 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10277 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10278 if (Op.getOperand(4).getValueType() == MVT::i32)
10279 return SDValue();
10280
10281 SDLoc SL(Op);
10282 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10283 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10284 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10285 Op.getOperand(3), IndexKeyi32);
10286 }
10287 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10288 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10289 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10290 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10291 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10292 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10293 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10294 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10295 if (Op.getOperand(4).getValueType() == MVT::i64)
10296 return SDValue();
10297
10298 SDLoc SL(Op);
10299 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10300 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10301 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10302 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10303 Op.getOperand(6)});
10304 }
10305 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10306 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10307 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10308 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10309 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10310 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10311 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10312 ? MVT::i64
10313 : MVT::i32;
10314 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10315 return SDValue();
10316
10317 SDLoc SL(Op);
10318 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10320 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10321 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10322 IndexKey, Op.getOperand(7), Op.getOperand(8)};
10323 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10324 Args.push_back(Op.getOperand(9));
10325 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
10326 }
10327 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10328 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10329 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10330 if (Op.getOperand(6).getValueType() == MVT::i32)
10331 return SDValue();
10332
10333 SDLoc SL(Op);
10334 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10335 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10336 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10337 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10338 IndexKeyi32, Op.getOperand(7)});
10339 }
10340 case Intrinsic::amdgcn_addrspacecast_nonnull:
10341 return lowerADDRSPACECAST(Op, DAG);
10342 case Intrinsic::amdgcn_readlane:
10343 case Intrinsic::amdgcn_readfirstlane:
10344 case Intrinsic::amdgcn_writelane:
10345 case Intrinsic::amdgcn_permlane16:
10346 case Intrinsic::amdgcn_permlanex16:
10347 case Intrinsic::amdgcn_permlane64:
10348 case Intrinsic::amdgcn_set_inactive:
10349 case Intrinsic::amdgcn_set_inactive_chain_arg:
10350 case Intrinsic::amdgcn_mov_dpp8:
10351 case Intrinsic::amdgcn_update_dpp:
10352 return lowerLaneOp(*this, Op.getNode(), DAG);
10353 case Intrinsic::amdgcn_dead: {
10355 for (const EVT ValTy : Op.getNode()->values())
10356 Poisons.push_back(DAG.getPOISON(ValTy));
10357 return DAG.getMergeValues(Poisons, SDLoc(Op));
10358 }
10359 case Intrinsic::amdgcn_wave_shuffle:
10360 return lowerWaveShuffle(*this, Op.getNode(), DAG);
10361 default:
10362 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10364 return lowerImage(Op, ImageDimIntr, DAG, false);
10365
10366 return Op;
10367 }
10368}
10369
10370// On targets not supporting constant in soffset field, turn zero to
10371// SGPR_NULL to avoid generating an extra s_mov with zero.
10373 const GCNSubtarget *Subtarget) {
10374 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10375 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10376 return SOffset;
10377}
10378
10379SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10380 SelectionDAG &DAG,
10381 unsigned NewOpcode) const {
10382 SDLoc DL(Op);
10383
10384 SDValue VData = Op.getOperand(2);
10385 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10386 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10387 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10388 SDValue Ops[] = {
10389 Op.getOperand(0), // Chain
10390 VData, // vdata
10391 Rsrc, // rsrc
10392 DAG.getConstant(0, DL, MVT::i32), // vindex
10393 VOffset, // voffset
10394 SOffset, // soffset
10395 Offset, // offset
10396 Op.getOperand(6), // cachepolicy
10397 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10398 };
10399
10400 auto *M = cast<MemSDNode>(Op);
10401
10402 EVT MemVT = VData.getValueType();
10403 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10404 M->getMemOperand());
10405}
10406
10407SDValue
10408SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10409 unsigned NewOpcode) const {
10410 SDLoc DL(Op);
10411
10412 SDValue VData = Op.getOperand(2);
10413 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10414 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10415 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10416 SDValue Ops[] = {
10417 Op.getOperand(0), // Chain
10418 VData, // vdata
10419 Rsrc, // rsrc
10420 Op.getOperand(4), // vindex
10421 VOffset, // voffset
10422 SOffset, // soffset
10423 Offset, // offset
10424 Op.getOperand(7), // cachepolicy
10425 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10426 };
10427
10428 auto *M = cast<MemSDNode>(Op);
10429
10430 EVT MemVT = VData.getValueType();
10431 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10432 M->getMemOperand());
10433}
10434
10435SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10436 SelectionDAG &DAG) const {
10437 unsigned IntrID = Op.getConstantOperandVal(1);
10438 SDLoc DL(Op);
10439
10440 switch (IntrID) {
10441 case Intrinsic::amdgcn_ds_ordered_add:
10442 case Intrinsic::amdgcn_ds_ordered_swap: {
10443 MemSDNode *M = cast<MemSDNode>(Op);
10444 SDValue Chain = M->getOperand(0);
10445 SDValue M0 = M->getOperand(2);
10446 SDValue Value = M->getOperand(3);
10447 unsigned IndexOperand = M->getConstantOperandVal(7);
10448 unsigned WaveRelease = M->getConstantOperandVal(8);
10449 unsigned WaveDone = M->getConstantOperandVal(9);
10450
10451 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10452 IndexOperand &= ~0x3f;
10453 unsigned CountDw = 0;
10454
10455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10456 CountDw = (IndexOperand >> 24) & 0xf;
10457 IndexOperand &= ~(0xf << 24);
10458
10459 if (CountDw < 1 || CountDw > 4) {
10460 const Function &Fn = DAG.getMachineFunction().getFunction();
10461 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10462 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10463 DL.getDebugLoc()));
10464 CountDw = 1;
10465 }
10466 }
10467
10468 if (IndexOperand) {
10469 const Function &Fn = DAG.getMachineFunction().getFunction();
10470 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10471 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10472 }
10473
10474 if (WaveDone && !WaveRelease) {
10475 // TODO: Move this to IR verifier
10476 const Function &Fn = DAG.getMachineFunction().getFunction();
10477 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10478 Fn, "ds_ordered_count: wave_done requires wave_release",
10479 DL.getDebugLoc()));
10480 }
10481
10482 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10483 unsigned ShaderType =
10485 unsigned Offset0 = OrderedCountIndex << 2;
10486 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10487
10488 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10489 Offset1 |= (CountDw - 1) << 6;
10490
10491 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10492 Offset1 |= ShaderType << 2;
10493
10494 unsigned Offset = Offset0 | (Offset1 << 8);
10495
10496 SDValue Ops[] = {
10497 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10498 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10499 };
10500 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10501 M->getVTList(), Ops, M->getMemoryVT(),
10502 M->getMemOperand());
10503 }
10504 case Intrinsic::amdgcn_raw_buffer_load:
10505 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10506 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10507 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10508 case Intrinsic::amdgcn_raw_buffer_load_format:
10509 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10510 const bool IsFormat =
10511 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10512 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10513
10514 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10515 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10516 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10517 SDValue Ops[] = {
10518 Op.getOperand(0), // Chain
10519 Rsrc, // rsrc
10520 DAG.getConstant(0, DL, MVT::i32), // vindex
10521 VOffset, // voffset
10522 SOffset, // soffset
10523 Offset, // offset
10524 Op.getOperand(5), // cachepolicy, swizzled buffer
10525 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10526 };
10527
10528 auto *M = cast<MemSDNode>(Op);
10529 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10530 }
10531 case Intrinsic::amdgcn_struct_buffer_load:
10532 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10533 case Intrinsic::amdgcn_struct_buffer_load_format:
10534 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10535 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10536 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10537 const bool IsFormat =
10538 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10539 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10540
10541 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10542 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10543 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10544 SDValue Ops[] = {
10545 Op.getOperand(0), // Chain
10546 Rsrc, // rsrc
10547 Op.getOperand(3), // vindex
10548 VOffset, // voffset
10549 SOffset, // soffset
10550 Offset, // offset
10551 Op.getOperand(6), // cachepolicy, swizzled buffer
10552 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10553 };
10554
10555 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10556 }
10557 case Intrinsic::amdgcn_raw_tbuffer_load:
10558 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10559 MemSDNode *M = cast<MemSDNode>(Op);
10560 EVT LoadVT = Op.getValueType();
10561 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10562 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10563 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10564
10565 SDValue Ops[] = {
10566 Op.getOperand(0), // Chain
10567 Rsrc, // rsrc
10568 DAG.getConstant(0, DL, MVT::i32), // vindex
10569 VOffset, // voffset
10570 SOffset, // soffset
10571 Offset, // offset
10572 Op.getOperand(5), // format
10573 Op.getOperand(6), // cachepolicy, swizzled buffer
10574 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10575 };
10576
10577 if (LoadVT.getScalarType() == MVT::f16)
10578 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10579 Ops);
10580 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10581 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10582 DAG);
10583 }
10584 case Intrinsic::amdgcn_struct_tbuffer_load:
10585 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10586 MemSDNode *M = cast<MemSDNode>(Op);
10587 EVT LoadVT = Op.getValueType();
10588 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10589 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10590 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10591
10592 SDValue Ops[] = {
10593 Op.getOperand(0), // Chain
10594 Rsrc, // rsrc
10595 Op.getOperand(3), // vindex
10596 VOffset, // voffset
10597 SOffset, // soffset
10598 Offset, // offset
10599 Op.getOperand(6), // format
10600 Op.getOperand(7), // cachepolicy, swizzled buffer
10601 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10602 };
10603
10604 if (LoadVT.getScalarType() == MVT::f16)
10605 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10606 Ops);
10607 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10608 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10609 DAG);
10610 }
10611 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10613 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10614 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10615 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10616 return lowerStructBufferAtomicIntrin(Op, DAG,
10617 AMDGPUISD::BUFFER_ATOMIC_FADD);
10618 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10619 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10620 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10621 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10622 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10623 return lowerStructBufferAtomicIntrin(Op, DAG,
10624 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10625 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10626 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10627 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10628 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10629 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10630 return lowerStructBufferAtomicIntrin(Op, DAG,
10631 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10632 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10633 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10634 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10635 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10636 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10637 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10638 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10639 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10640 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10641 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10643 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10644 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10645 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10646 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10647 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10648 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10649 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10650 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10651 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10652 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10653 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10654 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10655 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10656 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10657 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10658 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10659 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10660 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10661 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10662 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10663 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10664 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10665 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10666 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10667 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10668 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10669 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10670 return lowerStructBufferAtomicIntrin(Op, DAG,
10671 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10672 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10673 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10674 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10675 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10676 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10677 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10678 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10679 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10680 return lowerStructBufferAtomicIntrin(Op, DAG,
10681 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10682 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10683 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10684 return lowerStructBufferAtomicIntrin(Op, DAG,
10685 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10686 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10687 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10688 return lowerStructBufferAtomicIntrin(Op, DAG,
10689 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10690 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10691 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10692 return lowerStructBufferAtomicIntrin(Op, DAG,
10693 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10694 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10695 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10696 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10697 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10698 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10699 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10700 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10701 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10702 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10703 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10704 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10705 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10706 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10707 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10708 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10709 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10710 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10711 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10712 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10713 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10714 return lowerStructBufferAtomicIntrin(Op, DAG,
10715 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10716 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10717 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10718 return lowerRawBufferAtomicIntrin(Op, DAG,
10719 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10720 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10721 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10722 return lowerStructBufferAtomicIntrin(Op, DAG,
10723 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10724 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10725 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10726 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10727 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10728 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10729 SDValue Ops[] = {
10730 Op.getOperand(0), // Chain
10731 Op.getOperand(2), // src
10732 Op.getOperand(3), // cmp
10733 Rsrc, // rsrc
10734 DAG.getConstant(0, DL, MVT::i32), // vindex
10735 VOffset, // voffset
10736 SOffset, // soffset
10737 Offset, // offset
10738 Op.getOperand(7), // cachepolicy
10739 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10740 };
10741 EVT VT = Op.getValueType();
10742 auto *M = cast<MemSDNode>(Op);
10743
10744 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10745 Op->getVTList(), Ops, VT,
10746 M->getMemOperand());
10747 }
10748 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10749 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10750 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10751 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10752 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10753 SDValue Ops[] = {
10754 Op.getOperand(0), // Chain
10755 Op.getOperand(2), // src
10756 Op.getOperand(3), // cmp
10757 Rsrc, // rsrc
10758 Op.getOperand(5), // vindex
10759 VOffset, // voffset
10760 SOffset, // soffset
10761 Offset, // offset
10762 Op.getOperand(8), // cachepolicy
10763 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10764 };
10765 EVT VT = Op.getValueType();
10766 auto *M = cast<MemSDNode>(Op);
10767
10768 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10769 Op->getVTList(), Ops, VT,
10770 M->getMemOperand());
10771 }
10772 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10773 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10774 MemSDNode *M = cast<MemSDNode>(Op);
10775 SDValue NodePtr = M->getOperand(2);
10776 SDValue RayExtent = M->getOperand(3);
10777 SDValue InstanceMask = M->getOperand(4);
10778 SDValue RayOrigin = M->getOperand(5);
10779 SDValue RayDir = M->getOperand(6);
10780 SDValue Offsets = M->getOperand(7);
10781 SDValue TDescr = M->getOperand(8);
10782
10783 assert(NodePtr.getValueType() == MVT::i64);
10784 assert(RayDir.getValueType() == MVT::v3f32);
10785
10786 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10787 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10788 return SDValue();
10789 }
10790
10791 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10792 const unsigned NumVDataDwords = 10;
10793 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10794 int Opcode = AMDGPU::getMIMGOpcode(
10795 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10796 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10797 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10798 assert(Opcode != -1);
10799
10801 Ops.push_back(NodePtr);
10802 Ops.push_back(DAG.getBuildVector(
10803 MVT::v2i32, DL,
10804 {DAG.getBitcast(MVT::i32, RayExtent),
10805 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10806 Ops.push_back(RayOrigin);
10807 Ops.push_back(RayDir);
10808 Ops.push_back(Offsets);
10809 Ops.push_back(TDescr);
10810 Ops.push_back(M->getChain());
10811
10812 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10813 MachineMemOperand *MemRef = M->getMemOperand();
10814 DAG.setNodeMemRefs(NewNode, {MemRef});
10815 return SDValue(NewNode, 0);
10816 }
10817 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10818 MemSDNode *M = cast<MemSDNode>(Op);
10819 SDValue NodePtr = M->getOperand(2);
10820 SDValue RayExtent = M->getOperand(3);
10821 SDValue RayOrigin = M->getOperand(4);
10822 SDValue RayDir = M->getOperand(5);
10823 SDValue RayInvDir = M->getOperand(6);
10824 SDValue TDescr = M->getOperand(7);
10825
10826 assert(NodePtr.getValueType() == MVT::i32 ||
10827 NodePtr.getValueType() == MVT::i64);
10828 assert(RayDir.getValueType() == MVT::v3f16 ||
10829 RayDir.getValueType() == MVT::v3f32);
10830
10831 if (!Subtarget->hasGFX10_AEncoding()) {
10832 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10833 return SDValue();
10834 }
10835
10836 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10837 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10838 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10839 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10840 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10841 const unsigned NumVDataDwords = 4;
10842 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10843 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10844 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10845 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10846 IsGFX12Plus;
10847 const unsigned BaseOpcodes[2][2] = {
10848 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10849 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10850 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10851 int Opcode;
10852 if (UseNSA) {
10853 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10854 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10855 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10856 : AMDGPU::MIMGEncGfx10NSA,
10857 NumVDataDwords, NumVAddrDwords);
10858 } else {
10859 assert(!IsGFX12Plus);
10860 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10861 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10862 : AMDGPU::MIMGEncGfx10Default,
10863 NumVDataDwords, NumVAddrDwords);
10864 }
10865 assert(Opcode != -1);
10866
10868
10869 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10871 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10872 if (Lanes[0].getValueSizeInBits() == 32) {
10873 for (unsigned I = 0; I < 3; ++I)
10874 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10875 } else {
10876 if (IsAligned) {
10877 Ops.push_back(DAG.getBitcast(
10878 MVT::i32,
10879 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10880 Ops.push_back(Lanes[2]);
10881 } else {
10882 SDValue Elt0 = Ops.pop_back_val();
10883 Ops.push_back(DAG.getBitcast(
10884 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10885 Ops.push_back(DAG.getBitcast(
10886 MVT::i32,
10887 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10888 }
10889 }
10890 };
10891
10892 if (UseNSA && IsGFX11Plus) {
10893 Ops.push_back(NodePtr);
10894 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10895 Ops.push_back(RayOrigin);
10896 if (IsA16) {
10897 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10898 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10899 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10900 for (unsigned I = 0; I < 3; ++I) {
10901 MergedLanes.push_back(DAG.getBitcast(
10902 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10903 {DirLanes[I], InvDirLanes[I]})));
10904 }
10905 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10906 } else {
10907 Ops.push_back(RayDir);
10908 Ops.push_back(RayInvDir);
10909 }
10910 } else {
10911 if (Is64)
10912 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10913 2);
10914 else
10915 Ops.push_back(NodePtr);
10916
10917 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10918 packLanes(RayOrigin, true);
10919 packLanes(RayDir, true);
10920 packLanes(RayInvDir, false);
10921 }
10922
10923 if (!UseNSA) {
10924 // Build a single vector containing all the operands so far prepared.
10925 if (NumVAddrDwords > 12) {
10926 SDValue Undef = DAG.getPOISON(MVT::i32);
10927 Ops.append(16 - Ops.size(), Undef);
10928 }
10929 assert(Ops.size() >= 8 && Ops.size() <= 12);
10930 SDValue MergedOps =
10931 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10932 Ops.clear();
10933 Ops.push_back(MergedOps);
10934 }
10935
10936 Ops.push_back(TDescr);
10937 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10938 Ops.push_back(M->getChain());
10939
10940 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10941 MachineMemOperand *MemRef = M->getMemOperand();
10942 DAG.setNodeMemRefs(NewNode, {MemRef});
10943 return SDValue(NewNode, 0);
10944 }
10945 case Intrinsic::amdgcn_global_atomic_fmin_num:
10946 case Intrinsic::amdgcn_global_atomic_fmax_num:
10947 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10948 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10949 MemSDNode *M = cast<MemSDNode>(Op);
10950 SDValue Ops[] = {
10951 M->getOperand(0), // Chain
10952 M->getOperand(2), // Ptr
10953 M->getOperand(3) // Value
10954 };
10955 unsigned Opcode = 0;
10956 switch (IntrID) {
10957 case Intrinsic::amdgcn_global_atomic_fmin_num:
10958 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10959 Opcode = ISD::ATOMIC_LOAD_FMIN;
10960 break;
10961 }
10962 case Intrinsic::amdgcn_global_atomic_fmax_num:
10963 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10964 Opcode = ISD::ATOMIC_LOAD_FMAX;
10965 break;
10966 }
10967 default:
10968 llvm_unreachable("unhandled atomic opcode");
10969 }
10970 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10971 Ops, M->getMemOperand());
10972 }
10973 case Intrinsic::amdgcn_s_get_barrier_state:
10974 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10975 SDValue Chain = Op->getOperand(0);
10977 unsigned Opc;
10978
10979 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10980 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10981 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10982 BarID = (BarID >> 4) & 0x3F;
10983 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10984 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10985 Ops.push_back(K);
10986 Ops.push_back(Chain);
10987 } else {
10988 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10989 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10990 SDValue M0Val;
10991 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10992 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10993 M0Val = SDValue(
10994 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10995 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10996 0);
10997 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10998 } else
10999 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11000 }
11001
11002 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11003 return SDValue(NewMI, 0);
11004 }
11005 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11006 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11007 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11008 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11009 SDValue Chain = Op->getOperand(0);
11010 SDValue Ptr = Op->getOperand(2);
11011 EVT VT = Op->getValueType(0);
11012 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11013 Chain, Ptr, MII->getMemOperand());
11014 }
11015 default:
11016
11017 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11019 return lowerImage(Op, ImageDimIntr, DAG, true);
11020
11021 return SDValue();
11022 }
11023}
11024
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI and handle TFE loads.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = VTList.VTs[0];

  // Two result VTs: (value, chain). Three: (value, TFE status, chain).
  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // TFE loads return the data plus one extra status dword. Load everything
    // as an i32 vector with one extra element, then peel the status off the
    // end and bitcast the leading dwords back to the requested type.
    unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
    // Recurse so the dwordx3 widening below still applies to the payload.
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getVectorIdxConstant(NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
                          ZeroIdx);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  // SI lacks dwordx3 loads/stores: widen v3 loads to v4 and extract the
  // first three elements of the result.
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
    EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                         WidenedMemVT, WidenedMMO);
    SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
                                DAG.getVectorIdxConstant(0, DL));
    return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
  }

  // Common case: no widening or TFE handling needed.
  return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
}
11075
// Prepare a D16 (16-bit channel) data operand for a buffer/image store,
// rewriting it into the layout the selected subtarget expects.
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store: one 16-bit element per
    // 32-bit lane, zero-extended.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  // The sq block of gfx8.1 does not estimate register use correctly for d16
  // image store instructions. The data operand is computed as if it were not a
  // d16 image instruction.
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    // Bitcast to i16
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    // Decompose into scalars
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(IntVData, Elts);

    // Group pairs of i16 into v2i16 and bitcast to i32
    SmallVector<SDValue, 4> PackedElts;
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      SDValue Pair =
          DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(IntPair);
    }
    if ((NumElements % 2) == 1) {
      // Handle v3i16: the odd trailing element is padded with poison.
      unsigned I = Elts.size() / 2;
      SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
                                        {Elts[I * 2], DAG.getPOISON(MVT::i16)});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(IntPair);
    }

    // Pad back out to the unpacked element count using poison dwords.
    PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));

    // Build final vector
    EVT VecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
    return DAG.getBuildVector(VecVT, DL, PackedElts);
  }

  if (NumElements == 3) {
    // v3f16/v3i16 is not legal: widen to 4 elements via an integer
    // zero-extend of the packed bits.
    EVT IntStoreVT =
        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT WidenedStoreVT = EVT::getVectorVT(
        *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
    EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
                                         WidenedStoreVT.getStoreSizeInBits());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}
11152
11153SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11154 SelectionDAG &DAG) const {
11155 SDLoc DL(Op);
11156 SDValue Chain = Op.getOperand(0);
11157 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11158 MachineFunction &MF = DAG.getMachineFunction();
11159
11160 switch (IntrinsicID) {
11161 case Intrinsic::amdgcn_exp_compr: {
11162 if (!Subtarget->hasCompressedExport()) {
11163 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11165 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11166 }
11167 SDValue Src0 = Op.getOperand(4);
11168 SDValue Src1 = Op.getOperand(5);
11169 // Hack around illegal type on SI by directly selecting it.
11170 if (isTypeLegal(Src0.getValueType()))
11171 return SDValue();
11172
11173 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11174 SDValue Undef = DAG.getPOISON(MVT::f32);
11175 const SDValue Ops[] = {
11176 Op.getOperand(2), // tgt
11177 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11178 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11179 Undef, // src2
11180 Undef, // src3
11181 Op.getOperand(7), // vm
11182 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11183 Op.getOperand(3), // en
11184 Op.getOperand(0) // Chain
11185 };
11186
11187 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11188 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11189 }
11190
11191 case Intrinsic::amdgcn_struct_tbuffer_store:
11192 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11193 SDValue VData = Op.getOperand(2);
11194 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11195 if (IsD16)
11196 VData = handleD16VData(VData, DAG);
11197 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11198 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11199 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11200 SDValue Ops[] = {
11201 Chain,
11202 VData, // vdata
11203 Rsrc, // rsrc
11204 Op.getOperand(4), // vindex
11205 VOffset, // voffset
11206 SOffset, // soffset
11207 Offset, // offset
11208 Op.getOperand(7), // format
11209 Op.getOperand(8), // cachepolicy, swizzled buffer
11210 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11211 };
11212 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11213 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11214 MemSDNode *M = cast<MemSDNode>(Op);
11215 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11216 M->getMemoryVT(), M->getMemOperand());
11217 }
11218
11219 case Intrinsic::amdgcn_raw_tbuffer_store:
11220 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11221 SDValue VData = Op.getOperand(2);
11222 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11223 if (IsD16)
11224 VData = handleD16VData(VData, DAG);
11225 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11226 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11227 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11228 SDValue Ops[] = {
11229 Chain,
11230 VData, // vdata
11231 Rsrc, // rsrc
11232 DAG.getConstant(0, DL, MVT::i32), // vindex
11233 VOffset, // voffset
11234 SOffset, // soffset
11235 Offset, // offset
11236 Op.getOperand(6), // format
11237 Op.getOperand(7), // cachepolicy, swizzled buffer
11238 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11239 };
11240 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11241 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11242 MemSDNode *M = cast<MemSDNode>(Op);
11243 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11244 M->getMemoryVT(), M->getMemOperand());
11245 }
11246
11247 case Intrinsic::amdgcn_raw_buffer_store:
11248 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11249 case Intrinsic::amdgcn_raw_buffer_store_format:
11250 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11251 const bool IsFormat =
11252 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11253 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11254
11255 SDValue VData = Op.getOperand(2);
11256 EVT VDataVT = VData.getValueType();
11257 EVT EltType = VDataVT.getScalarType();
11258 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11259 if (IsD16) {
11260 VData = handleD16VData(VData, DAG);
11261 VDataVT = VData.getValueType();
11262 }
11263
11264 if (!isTypeLegal(VDataVT)) {
11265 VData =
11266 DAG.getNode(ISD::BITCAST, DL,
11267 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11268 }
11269
11270 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11271 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11272 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11273 SDValue Ops[] = {
11274 Chain,
11275 VData,
11276 Rsrc,
11277 DAG.getConstant(0, DL, MVT::i32), // vindex
11278 VOffset, // voffset
11279 SOffset, // soffset
11280 Offset, // offset
11281 Op.getOperand(6), // cachepolicy, swizzled buffer
11282 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11283 };
11284 unsigned Opc =
11285 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11286 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11287 MemSDNode *M = cast<MemSDNode>(Op);
11288
11289 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11290 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11291 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11292
11293 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11294 M->getMemoryVT(), M->getMemOperand());
11295 }
11296
11297 case Intrinsic::amdgcn_struct_buffer_store:
11298 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11299 case Intrinsic::amdgcn_struct_buffer_store_format:
11300 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11301 const bool IsFormat =
11302 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11303 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11304
11305 SDValue VData = Op.getOperand(2);
11306 EVT VDataVT = VData.getValueType();
11307 EVT EltType = VDataVT.getScalarType();
11308 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11309
11310 if (IsD16) {
11311 VData = handleD16VData(VData, DAG);
11312 VDataVT = VData.getValueType();
11313 }
11314
11315 if (!isTypeLegal(VDataVT)) {
11316 VData =
11317 DAG.getNode(ISD::BITCAST, DL,
11318 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11319 }
11320
11321 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11322 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11323 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11324 SDValue Ops[] = {
11325 Chain,
11326 VData,
11327 Rsrc,
11328 Op.getOperand(4), // vindex
11329 VOffset, // voffset
11330 SOffset, // soffset
11331 Offset, // offset
11332 Op.getOperand(7), // cachepolicy, swizzled buffer
11333 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11334 };
11335 unsigned Opc =
11336 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11337 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11338 MemSDNode *M = cast<MemSDNode>(Op);
11339
11340 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11341 EVT VDataType = VData.getValueType().getScalarType();
11342 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11343 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11344
11345 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11346 M->getMemoryVT(), M->getMemOperand());
11347 }
11348 case Intrinsic::amdgcn_raw_buffer_load_lds:
11349 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11350 case Intrinsic::amdgcn_struct_buffer_load_lds:
11351 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11352 if (!Subtarget->hasVMemToLDSLoad())
11353 return SDValue();
11354 unsigned Opc;
11355 bool HasVIndex =
11356 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11357 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11358 unsigned OpOffset = HasVIndex ? 1 : 0;
11359 SDValue VOffset = Op.getOperand(5 + OpOffset);
11360 bool HasVOffset = !isNullConstant(VOffset);
11361 unsigned Size = Op->getConstantOperandVal(4);
11362
11363 switch (Size) {
11364 default:
11365 return SDValue();
11366 case 1:
11367 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11368 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11369 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11370 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11371 break;
11372 case 2:
11373 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11374 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11375 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11376 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11377 break;
11378 case 4:
11379 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11380 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11381 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11382 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11383 break;
11384 case 12:
11385 if (!Subtarget->hasLDSLoadB96_B128())
11386 return SDValue();
11387 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11388 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11389 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11390 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11391 break;
11392 case 16:
11393 if (!Subtarget->hasLDSLoadB96_B128())
11394 return SDValue();
11395 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11396 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11397 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11398 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11399 break;
11400 }
11401
11402 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11403
11405
11406 if (HasVIndex && HasVOffset)
11407 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11408 {Op.getOperand(5), // VIndex
11409 VOffset}));
11410 else if (HasVIndex)
11411 Ops.push_back(Op.getOperand(5));
11412 else if (HasVOffset)
11413 Ops.push_back(VOffset);
11414
11415 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11416 Ops.push_back(Rsrc);
11417 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11418 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11419 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11420 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11421 Ops.push_back(DAG.getTargetConstant(
11422 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11423 DL, MVT::i8)); // cpol
11424 Ops.push_back(DAG.getTargetConstant(
11425 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11426 ? 1
11427 : 0,
11428 DL, MVT::i8)); // swz
11429 Ops.push_back(M0Val.getValue(0)); // Chain
11430 Ops.push_back(M0Val.getValue(1)); // Glue
11431
11432 auto *M = cast<MemSDNode>(Op);
11433 MachineMemOperand *LoadMMO = M->getMemOperand();
11434 // Don't set the offset value here because the pointer points to the base of
11435 // the buffer.
11436 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11437
11438 MachinePointerInfo StorePtrI = LoadPtrI;
11439 LoadPtrI.V = PoisonValue::get(
11443
11444 auto F = LoadMMO->getFlags() &
11446 LoadMMO =
11448 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11449
11450 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11451 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11452 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11453
11454 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11455 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11456
11457 return SDValue(Load, 0);
11458 }
11459 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11460 // for "trust me" that the remaining cases are global pointers until
11461 // such time as we can put two mem operands on an intrinsic.
11462 case Intrinsic::amdgcn_load_to_lds:
11463 case Intrinsic::amdgcn_global_load_lds: {
11464 if (!Subtarget->hasVMemToLDSLoad())
11465 return SDValue();
11466
11467 unsigned Opc;
11468 unsigned Size = Op->getConstantOperandVal(4);
11469 switch (Size) {
11470 default:
11471 return SDValue();
11472 case 1:
11473 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11474 break;
11475 case 2:
11476 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11477 break;
11478 case 4:
11479 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11480 break;
11481 case 12:
11482 if (!Subtarget->hasLDSLoadB96_B128())
11483 return SDValue();
11484 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11485 break;
11486 case 16:
11487 if (!Subtarget->hasLDSLoadB96_B128())
11488 return SDValue();
11489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11490 break;
11491 }
11492
11493 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11494
11496
11497 SDValue Addr = Op.getOperand(2); // Global ptr
11498 SDValue VOffset;
11499 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11500 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11501 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11502 SDValue LHS = Addr.getOperand(0);
11503 SDValue RHS = Addr.getOperand(1);
11504
11505 if (LHS->isDivergent())
11506 std::swap(LHS, RHS);
11507
11508 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11509 RHS.getOperand(0).getValueType() == MVT::i32) {
11510 // add (i64 sgpr), (zero_extend (i32 vgpr))
11511 Addr = LHS;
11512 VOffset = RHS.getOperand(0);
11513 }
11514 }
11515
11516 Ops.push_back(Addr);
11517 if (!Addr->isDivergent()) {
11519 if (!VOffset)
11520 VOffset =
11521 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11522 DAG.getTargetConstant(0, DL, MVT::i32)),
11523 0);
11524 Ops.push_back(VOffset);
11525 }
11526
11527 Ops.push_back(Op.getOperand(5)); // Offset
11528
11529 unsigned Aux = Op.getConstantOperandVal(6);
11530 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11531 MVT::i32)); // CPol
11532
11533 Ops.push_back(M0Val.getValue(0)); // Chain
11534 Ops.push_back(M0Val.getValue(1)); // Glue
11535
11536 auto *M = cast<MemSDNode>(Op);
11537 MachineMemOperand *LoadMMO = M->getMemOperand();
11538 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11539 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11540 MachinePointerInfo StorePtrI = LoadPtrI;
11541 LoadPtrI.V = PoisonValue::get(
11545 auto F = LoadMMO->getFlags() &
11547 LoadMMO =
11549 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11550 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11551 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11552 LoadMMO->getAAInfo());
11553
11554 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11555 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11556
11557 return SDValue(Load, 0);
11558 }
11559 case Intrinsic::amdgcn_end_cf:
11560 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11561 Op->getOperand(2), Chain),
11562 0);
11563 case Intrinsic::amdgcn_s_barrier_init:
11564 case Intrinsic::amdgcn_s_barrier_signal_var: {
11565 // these two intrinsics have two operands: barrier pointer and member count
11566 SDValue Chain = Op->getOperand(0);
11568 SDValue BarOp = Op->getOperand(2);
11569 SDValue CntOp = Op->getOperand(3);
11570 SDValue M0Val;
11571 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11572 ? AMDGPU::S_BARRIER_INIT_M0
11573 : AMDGPU::S_BARRIER_SIGNAL_M0;
11574 // extract the BarrierID from bits 4-9 of BarOp
11575 SDValue BarID;
11576 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11577 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11578 BarID =
11579 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11580 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11581 0);
11582 // Member count should be put into M0[ShAmt:+6]
11583 // Barrier ID should be put into M0[5:0]
11584 M0Val =
11585 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11586 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11587 0);
11588 constexpr unsigned ShAmt = 16;
11589 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11590 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11591
11592 M0Val = SDValue(
11593 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11594
11595 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11596
11597 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11598 return SDValue(NewMI, 0);
11599 }
11600 case Intrinsic::amdgcn_s_wakeup_barrier: {
11601 if (!Subtarget->hasSWakeupBarrier())
11602 return SDValue();
11603 [[fallthrough]];
11604 }
11605 case Intrinsic::amdgcn_s_barrier_join: {
11606 // these three intrinsics have one operand: barrier pointer
11607 SDValue Chain = Op->getOperand(0);
11609 SDValue BarOp = Op->getOperand(2);
11610 unsigned Opc;
11611
11612 if (isa<ConstantSDNode>(BarOp)) {
11613 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11614 switch (IntrinsicID) {
11615 default:
11616 return SDValue();
11617 case Intrinsic::amdgcn_s_barrier_join:
11618 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11619 break;
11620 case Intrinsic::amdgcn_s_wakeup_barrier:
11621 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11622 break;
11623 }
11624 // extract the BarrierID from bits 4-9 of the immediate
11625 unsigned BarID = (BarVal >> 4) & 0x3F;
11626 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11627 Ops.push_back(K);
11628 Ops.push_back(Chain);
11629 } else {
11630 switch (IntrinsicID) {
11631 default:
11632 return SDValue();
11633 case Intrinsic::amdgcn_s_barrier_join:
11634 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11635 break;
11636 case Intrinsic::amdgcn_s_wakeup_barrier:
11637 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11638 break;
11639 }
11640 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11641 SDValue M0Val;
11642 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11643 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11644 M0Val =
11645 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11646 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11647 0);
11648 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11649 }
11650
11651 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11652 return SDValue(NewMI, 0);
11653 }
11654 case Intrinsic::amdgcn_s_prefetch_data: {
11655 // For non-global address space preserve the chain and remove the call.
11657 return Op.getOperand(0);
11658 return Op;
11659 }
11660 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11661 SDValue Ops[] = {
11662 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11663 Op.getOperand(3), // offset
11664 Op.getOperand(4), // length
11665 };
11666
11667 MemSDNode *M = cast<MemSDNode>(Op);
11668 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11669 Op->getVTList(), Ops, M->getMemoryVT(),
11670 M->getMemOperand());
11671 }
11672 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11673 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11674 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11675 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11676 SDValue Chain = Op->getOperand(0);
11677 SDValue Ptr = Op->getOperand(2);
11678 SDValue Val = Op->getOperand(3);
11679 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11680 Ptr, MII->getMemOperand());
11681 }
11682 default: {
11683 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11685 return lowerImage(Op, ImageDimIntr, DAG, true);
11686
11687 return Op;
11688 }
11689 }
11690}
11691
11692// Return whether the operation has NoUnsignedWrap property.
11693static bool isNoUnsignedWrap(SDValue Addr) {
11694 return (Addr.getOpcode() == ISD::ADD &&
11695 Addr->getFlags().hasNoUnsignedWrap()) ||
11696 Addr->getOpcode() == ISD::OR;
11697}

// NOTE(review): the signature line was lost in extraction; reconstructed from
// upstream. Keep pointer arithmetic (ptradd) intact only for 64-bit pointers.
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
                                              EVT PtrVT) const {
  return PtrVT == MVT::i64;
}

// NOTE(review): the signature line was lost in extraction; reconstructed from
// upstream. Out-of-bounds pointer-arithmetic transforms are always permitted.
bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
                                                       EVT PtrVT) const {
  return true;
}
11708
11709// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11710// offset (the offset that is included in bounds checking and swizzling, to be
11711// split between the instruction's voffset and immoffset fields) and soffset
11712// (the offset that is excluded from bounds checking and swizzling, to go in
11713// the instruction's soffset field). This function takes the first kind of
11714// offset and figures out how to split it between voffset and immoffset.
11715std::pair<SDValue, SDValue>
11716SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11717 SDLoc DL(Offset);
11718 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11719 SDValue N0 = Offset;
11720 ConstantSDNode *C1 = nullptr;
11721
11722 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11723 N0 = SDValue();
11724 else if (DAG.isBaseWithConstantOffset(N0)) {
11725 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11726 // being added, so we can only safely match a 32-bit addition with no
11727 // unsigned overflow.
11728 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11729 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11730 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11731 N0 = N0.getOperand(0);
11732 }
11733 }
11734
11735 if (C1) {
11736 unsigned ImmOffset = C1->getZExtValue();
11737 // If the immediate value is too big for the immoffset field, put only bits
11738 // that would normally fit in the immoffset field. The remaining value that
11739 // is copied/added for the voffset field is a large power of 2, and it
11740 // stands more chance of being CSEd with the copy/add for another similar
11741 // load/store.
11742 // However, do not do that rounding down if that is a negative
11743 // number, as it appears to be illegal to have a negative offset in the
11744 // vgpr, even if adding the immediate offset makes it positive.
11745 unsigned Overflow = ImmOffset & ~MaxImm;
11746 ImmOffset -= Overflow;
11747 if ((int32_t)Overflow < 0) {
11748 Overflow += ImmOffset;
11749 ImmOffset = 0;
11750 }
11751 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11752 if (Overflow) {
11753 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11754 if (!N0)
11755 N0 = OverflowVal;
11756 else {
11757 SDValue Ops[] = {N0, OverflowVal};
11758 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11759 }
11760 }
11761 }
11762 if (!N0)
11763 N0 = DAG.getConstant(0, DL, MVT::i32);
11764 if (!C1)
11765 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11766 return {N0, SDValue(C1, 0)};
11767}
11768
11769// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11770// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11771// pointed to by Offsets.
11772void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11773 SelectionDAG &DAG, SDValue *Offsets,
11774 Align Alignment) const {
11775 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11776 SDLoc DL(CombinedOffset);
11777 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11778 uint32_t Imm = C->getZExtValue();
11779 uint32_t SOffset, ImmOffset;
11780 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11781 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11782 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11783 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11784 return;
11785 }
11786 }
11787 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11788 SDValue N0 = CombinedOffset.getOperand(0);
11789 SDValue N1 = CombinedOffset.getOperand(1);
11790 uint32_t SOffset, ImmOffset;
11791 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11792 if (Offset >= 0 &&
11793 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11794 Offsets[0] = N0;
11795 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11796 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11797 return;
11798 }
11799 }
11800
11801 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11802 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11803 : DAG.getConstant(0, DL, MVT::i32);
11804
11805 Offsets[0] = CombinedOffset;
11806 Offsets[1] = SOffsetZero;
11807 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11808}
11809
11810SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11811 SelectionDAG &DAG) const {
11812 if (!MaybePointer.getValueType().isScalarInteger())
11813 return MaybePointer;
11814
11815 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11816 return Rsrc;
11817}
11818
// Wrap a global or flat pointer into a buffer resource descriptor using the
// stride, num_records and flags operands of the intrinsic. Returns the
// descriptor bitcast to i128.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  SDValue Pointer = Op->getOperand(1);
  SDValue Stride = Op->getOperand(2);
  SDValue NumRecords = Op->getOperand(3);
  SDValue Flags = Op->getOperand(4);

  SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
  SDValue Rsrc;

  if (Subtarget->has45BitNumRecordsBufferResource()) {
    // 45-bit num_records layout: it straddles the two 64-bit halves of the
    // descriptor.
    SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
    SDValue NumRecordsLHS =
        DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
                    DAG.getShiftAmountConstant(57, MVT::i32, Loc));
    SDValue LowHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    // The 32-bit stride/flags fields are placed in the high dword of an i64
    // by building a v2i32 {0, value} and bitcasting.
    SDValue NumRecordsRHS =
        DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
                    DAG.getShiftAmountConstant(7, MVT::i32, Loc));
    SDValue ShiftedStride =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                    DAG.getShiftAmountConstant(12, MVT::i32, Loc));
    SDValue ExtShiftedStrideVec =
        DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
    SDValue ExtShiftedStride =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
    SDValue ShiftedFlags =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
                    DAG.getShiftAmountConstant(28, MVT::i32, Loc));
    SDValue ExtShiftedFlagsVec =
        DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
    SDValue ExtShiftedFlags =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
    SDValue CombinedFields =
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
    SDValue HighHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);

    Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
  } else {
    // Legacy layout: {base_lo, base_hi[15:0] | stride<<16, num_records,
    // flags} as v4i32.
    NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
    SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
    SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
    SDValue ShiftedStride =
        DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
                    DAG.getShiftAmountConstant(16, MVT::i32, Loc));
    SDValue NewHighHalf =
        DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);

    Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
                       NumRecords, Flags);
  }

  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
  return RsrcPtr;
}
11888
11889// Handle 8 bit and 16 bit buffer loads
11890SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11891 EVT LoadVT, SDLoc DL,
11893 MachineMemOperand *MMO,
11894 bool IsTFE) const {
11895 EVT IntVT = LoadVT.changeTypeToInteger();
11896
11897 if (IsTFE) {
11898 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11899 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11900 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11901 MachineFunction &MF = DAG.getMachineFunction();
11902 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11903 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11904 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11905 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11906 DAG.getConstant(1, DL, MVT::i32));
11907 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11908 DAG.getConstant(0, DL, MVT::i32));
11909 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11910 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11911 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11912 }
11913
11914 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11915 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11916 : AMDGPUISD::BUFFER_LOAD_USHORT;
11917
11918 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11919 SDValue BufferLoad =
11920 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11921 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11922 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11923
11924 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11925}
11926
11927// Handle 8 bit and 16 bit buffer stores
11928SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11929 EVT VDataType, SDLoc DL,
11930 SDValue Ops[],
11931 MemSDNode *M) const {
11932 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11933 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11934
11935 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11936 Ops[1] = BufferStoreExt;
11937 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11938 : AMDGPUISD::BUFFER_STORE_SHORT;
11939 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11940 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11941 M->getMemOperand());
11942}
11943
11945 SDValue Op, const SDLoc &SL, EVT VT) {
11946 if (VT.bitsLT(Op.getValueType()))
11947 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11948
11949 switch (ExtType) {
11950 case ISD::SEXTLOAD:
11951 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11952 case ISD::ZEXTLOAD:
11953 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11954 case ISD::EXTLOAD:
11955 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11956 case ISD::NON_EXTLOAD:
11957 return Op;
11958 }
11959
11960 llvm_unreachable("invalid ext type");
11961}
11962
11963// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11964// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11965SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11966 DAGCombinerInfo &DCI) const {
11967 SelectionDAG &DAG = DCI.DAG;
11968 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11969 return SDValue();
11970
11971 // FIXME: Constant loads should all be marked invariant.
11972 unsigned AS = Ld->getAddressSpace();
11973 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11975 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11976 return SDValue();
11977
11978 // Don't do this early, since it may interfere with adjacent load merging for
11979 // illegal types. We can avoid losing alignment information for exotic types
11980 // pre-legalize.
11981 EVT MemVT = Ld->getMemoryVT();
11982 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11983 MemVT.getSizeInBits() >= 32)
11984 return SDValue();
11985
11986 SDLoc SL(Ld);
11987
11988 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11989 "unexpected vector extload");
11990
11991 // TODO: Drop only high part of range.
11992 SDValue Ptr = Ld->getBasePtr();
11993 SDValue NewLoad = DAG.getLoad(
11994 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11995 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11996 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11997 nullptr); // Drop ranges
11998
11999 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12000 if (MemVT.isFloatingPoint()) {
12002 "unexpected fp extload");
12003 TruncVT = MemVT.changeTypeToInteger();
12004 }
12005
12006 SDValue Cvt = NewLoad;
12007 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12008 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12009 DAG.getValueType(TruncVT));
12010 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12012 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12013 } else {
12015 }
12016
12017 EVT VT = Ld->getValueType(0);
12018 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12019
12020 DCI.AddToWorklist(Cvt.getNode());
12021
12022 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12023 // the appropriate extension from the 32-bit load.
12024 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12025 DCI.AddToWorklist(Cvt.getNode());
12026
12027 // Handle conversion back to floating point if necessary.
12028 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12029
12030 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12031}
12032
12034 const SIMachineFunctionInfo &Info) {
12035 // TODO: Should check if the address can definitely not access stack.
12036 if (Info.isEntryFunction())
12037 return Info.getUserSGPRInfo().hasFlatScratchInit();
12038 return true;
12039}
12040
12041SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12042 SDLoc DL(Op);
12043 LoadSDNode *Load = cast<LoadSDNode>(Op);
12044 ISD::LoadExtType ExtType = Load->getExtensionType();
12045 EVT MemVT = Load->getMemoryVT();
12046 MachineMemOperand *MMO = Load->getMemOperand();
12047
12048 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12049 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12050 return SDValue();
12051
12052 // FIXME: Copied from PPC
12053 // First, load into 32 bits, then truncate to 1 bit.
12054
12055 SDValue Chain = Load->getChain();
12056 SDValue BasePtr = Load->getBasePtr();
12057
12058 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12059
12060 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12061 RealMemVT, MMO);
12062
12063 if (!MemVT.isVector()) {
12064 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12065 NewLD.getValue(1)};
12066
12067 return DAG.getMergeValues(Ops, DL);
12068 }
12069
12071 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12072 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12073 DAG.getConstant(I, DL, MVT::i32));
12074
12075 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12076 }
12077
12078 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12079
12080 return DAG.getMergeValues(Ops, DL);
12081 }
12082
12083 if (!MemVT.isVector())
12084 return SDValue();
12085
12086 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12087 "Custom lowering for non-i32 vectors hasn't been implemented.");
12088
12089 Align Alignment = Load->getAlign();
12090 unsigned AS = Load->getAddressSpace();
12091 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12092 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12093 return SplitVectorLoad(Op, DAG);
12094 }
12095
12096 MachineFunction &MF = DAG.getMachineFunction();
12097 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12098 // If there is a possibility that flat instruction access scratch memory
12099 // then we need to use the same legalization rules we use for private.
12100 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12101 !Subtarget->hasMultiDwordFlatScratchAddressing())
12102 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12105
12106 unsigned NumElements = MemVT.getVectorNumElements();
12107
12108 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12110 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12111 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12112 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12113 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12114 Alignment >= Align(4) && NumElements < 32) {
12115 if (MemVT.isPow2VectorType() ||
12116 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12117 return SDValue();
12118 return WidenOrSplitVectorLoad(Op, DAG);
12119 }
12120 // Non-uniform loads will be selected to MUBUF instructions, so they
12121 // have the same legalization requirements as global and private
12122 // loads.
12123 //
12124 }
12125 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12128 if (NumElements > 4)
12129 return SplitVectorLoad(Op, DAG);
12130 // v3 loads not supported on SI.
12131 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12132 return WidenOrSplitVectorLoad(Op, DAG);
12133
12134 // v3 and v4 loads are supported for private and global memory.
12135 return SDValue();
12136 }
12137 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12138 // Depending on the setting of the private_element_size field in the
12139 // resource descriptor, we can only make private accesses up to a certain
12140 // size.
12141 switch (Subtarget->getMaxPrivateElementSize()) {
12142 case 4: {
12143 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12144 return DAG.getMergeValues({Op0, Op1}, DL);
12145 }
12146 case 8:
12147 if (NumElements > 2)
12148 return SplitVectorLoad(Op, DAG);
12149 return SDValue();
12150 case 16:
12151 // Same as global/flat
12152 if (NumElements > 4)
12153 return SplitVectorLoad(Op, DAG);
12154 // v3 loads not supported on SI.
12155 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12156 return WidenOrSplitVectorLoad(Op, DAG);
12157
12158 return SDValue();
12159 default:
12160 llvm_unreachable("unsupported private_element_size");
12161 }
12162 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12163 unsigned Fast = 0;
12164 auto Flags = Load->getMemOperand()->getFlags();
12166 Load->getAlign(), Flags, &Fast) &&
12167 Fast > 1)
12168 return SDValue();
12169
12170 if (MemVT.isVector())
12171 return SplitVectorLoad(Op, DAG);
12172 }
12173
12175 MemVT, *Load->getMemOperand())) {
12176 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12177 return DAG.getMergeValues({Op0, Op1}, DL);
12178 }
12179
12180 return SDValue();
12181}
12182
12183SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12184 EVT VT = Op.getValueType();
12185 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12186 VT.getSizeInBits() == 512)
12187 return splitTernaryVectorOp(Op, DAG);
12188
12189 assert(VT.getSizeInBits() == 64);
12190
12191 SDLoc DL(Op);
12192 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12193
12194 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12195 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12196
12197 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12198 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12199
12200 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12201 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12202
12203 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12204
12205 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12206 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12207
12208 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12209
12210 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12211 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12212}
12213
12214// Catch division cases where we can use shortcuts with rcp and rsq
12215// instructions.
12216SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12217 SelectionDAG &DAG) const {
12218 SDLoc SL(Op);
12219 SDValue LHS = Op.getOperand(0);
12220 SDValue RHS = Op.getOperand(1);
12221 EVT VT = Op.getValueType();
12222 const SDNodeFlags Flags = Op->getFlags();
12223
12224 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12225
12226 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12227 // Without !fpmath accuracy information, we can't do more because we don't
12228 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12229 // f16 is always accurate enough
12230 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12231 return SDValue();
12232
12233 if (CLHS->isExactlyValue(1.0)) {
12234 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12235 // the CI documentation has a worst case error of 1 ulp.
12236 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12237 // use it as long as we aren't trying to use denormals.
12238 //
12239 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12240
12241 // 1.0 / sqrt(x) -> rsq(x)
12242
12243 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12244 // error seems really high at 2^29 ULP.
12245 // 1.0 / x -> rcp(x)
12246 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12247 }
12248
12249 // Same as for 1.0, but expand the sign out of the constant.
12250 if (CLHS->isExactlyValue(-1.0)) {
12251 // -1.0 / x -> rcp (fneg x)
12252 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12253 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12254 }
12255 }
12256
12257 // For f16 and bf16 require afn or arcp.
12258 // For f32 require afn.
12259 if (!AllowInaccurateRcp &&
12260 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12261 return SDValue();
12262
12263 // Turn into multiply by the reciprocal.
12264 // x / y -> x * (1.0 / y)
12265 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12266 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12267}
12268
12269SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12270 SelectionDAG &DAG) const {
12271 SDLoc SL(Op);
12272 SDValue X = Op.getOperand(0);
12273 SDValue Y = Op.getOperand(1);
12274 EVT VT = Op.getValueType();
12275 const SDNodeFlags Flags = Op->getFlags();
12276
12277 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12278 if (!AllowInaccurateDiv)
12279 return SDValue();
12280
12281 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12282 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12283
12284 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12285 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12286
12287 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12288 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12289 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12290 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12291 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12292 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12293}
12294
12295static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12296 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12297 SDNodeFlags Flags) {
12298 if (GlueChain->getNumValues() <= 1) {
12299 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12300 }
12301
12302 assert(GlueChain->getNumValues() == 3);
12303
12304 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12305 switch (Opcode) {
12306 default:
12307 llvm_unreachable("no chain equivalent for opcode");
12308 case ISD::FMUL:
12309 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12310 break;
12311 }
12312
12313 return DAG.getNode(Opcode, SL, VTList,
12314 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12315 Flags);
12316}
12317
12318static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12319 EVT VT, SDValue A, SDValue B, SDValue C,
12320 SDValue GlueChain, SDNodeFlags Flags) {
12321 if (GlueChain->getNumValues() <= 1) {
12322 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12323 }
12324
12325 assert(GlueChain->getNumValues() == 3);
12326
12327 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12328 switch (Opcode) {
12329 default:
12330 llvm_unreachable("no chain equivalent for opcode");
12331 case ISD::FMA:
12332 Opcode = AMDGPUISD::FMA_W_CHAIN;
12333 break;
12334 }
12335
12336 return DAG.getNode(Opcode, SL, VTList,
12337 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12338 Flags);
12339}
12340
12341SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12342 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12343 return FastLowered;
12344
12345 SDLoc SL(Op);
12346 EVT VT = Op.getValueType();
12347 SDValue LHS = Op.getOperand(0);
12348 SDValue RHS = Op.getOperand(1);
12349
12350 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12351 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12352
12353 if (VT == MVT::bf16) {
12354 SDValue ExtDiv =
12355 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12356 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12357 DAG.getTargetConstant(0, SL, MVT::i32));
12358 }
12359
12360 assert(VT == MVT::f16);
12361
12362 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12363 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12364 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12365 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12366 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12367 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12368 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12369 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12370 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12371 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12372 // q16.u = opx(V_CVT_F16_F32, q32.u);
12373 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12374
12375 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12376 unsigned FMADOpCode =
12378 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12379 SDValue Rcp =
12380 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12381 SDValue Quot =
12382 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12383 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12384 Op->getFlags());
12385 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12386 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12387 Op->getFlags());
12388 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12389 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12390 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12391 DAG.getConstant(0xff800000, SL, MVT::i32));
12392 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12393 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12394 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12395 DAG.getTargetConstant(0, SL, MVT::i32));
12396 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12397 Op->getFlags());
12398}
12399
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  // Operands 1 and 2 hold the numerator/denominator; presumably operand 0 is
  // an intrinsic id (this lowers the fdiv.fast intrinsic) — TODO confirm.
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  // TODO: The combiner should probably handle elimination of redundant fabs.
  // NOTE(review): the start of r1's initializer (the condition choosing the
  // raw RHS over its fabs) appears truncated here — confirm against upstream.
                   ? RHS
                   : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  // Scaling constants: if |rhs| exceeds 2^96 the denominator is pre-scaled by
  // 2^-32 so rcp stays in range, and the result is scaled back afterwards.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  // r2 = (|rhs| > 2^96)
  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  // r3 = scale factor: 2^-32 when scaling is needed, otherwise 1.0.
  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  // Undo the scaling: result = scale * (lhs * rcp(rhs * scale)).
  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
12436
12437// Returns immediate value for setting the F32 denorm mode when using the
12438// S_DENORM_MODE instruction.
12441 const GCNSubtarget *ST) {
12442 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12443 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12444 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12445 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12446}
12447
12448SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12449 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12450 return FastLowered;
12451
12452 // The selection matcher assumes anything with a chain selecting to a
12453 // mayRaiseFPException machine instruction. Since we're introducing a chain
12454 // here, we need to explicitly report nofpexcept for the regular fdiv
12455 // lowering.
12456 SDNodeFlags Flags = Op->getFlags();
12457 Flags.setNoFPExcept(true);
12458
12459 SDLoc SL(Op);
12460 SDValue LHS = Op.getOperand(0);
12461 SDValue RHS = Op.getOperand(1);
12462
12463 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12464
12465 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12466
12467 SDValue DenominatorScaled =
12468 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12469 SDValue NumeratorScaled =
12470 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12471
12472 // Denominator is scaled to not be denormal, so using rcp is ok.
12473 SDValue ApproxRcp =
12474 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12475 SDValue NegDivScale0 =
12476 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12477
12478 using namespace AMDGPU::Hwreg;
12479 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12480 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12481
12482 const MachineFunction &MF = DAG.getMachineFunction();
12483 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12484 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12485
12486 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12487 const bool HasDynamicDenormals =
12488 (DenormMode.Input == DenormalMode::Dynamic) ||
12489 (DenormMode.Output == DenormalMode::Dynamic);
12490
12491 SDValue SavedDenormMode;
12492
12493 if (!PreservesDenormals) {
12494 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12495 // lowering. The chain dependence is insufficient, and we need glue. We do
12496 // not need the glue variants in a strictfp function.
12497
12498 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12499
12500 SDValue Glue = DAG.getEntryNode();
12501 if (HasDynamicDenormals) {
12502 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12503 DAG.getVTList(MVT::i32, MVT::Glue),
12504 {BitField, Glue});
12505 SavedDenormMode = SDValue(GetReg, 0);
12506
12507 Glue = DAG.getMergeValues(
12508 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12509 }
12510
12511 SDNode *EnableDenorm;
12512 if (Subtarget->hasDenormModeInst()) {
12513 const SDValue EnableDenormValue =
12515
12516 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12517 EnableDenormValue)
12518 .getNode();
12519 } else {
12520 const SDValue EnableDenormValue =
12521 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12522 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12523 {EnableDenormValue, BitField, Glue});
12524 }
12525
12526 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12527 SDValue(EnableDenorm, 1)};
12528
12529 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12530 }
12531
12532 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12533 ApproxRcp, One, NegDivScale0, Flags);
12534
12535 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12536 ApproxRcp, Fma0, Flags);
12537
12538 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12539 Fma1, Flags);
12540
12541 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12542 NumeratorScaled, Mul, Flags);
12543
12544 SDValue Fma3 =
12545 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12546
12547 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12548 NumeratorScaled, Fma3, Flags);
12549
12550 if (!PreservesDenormals) {
12551 SDNode *DisableDenorm;
12552 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12553 const SDValue DisableDenormValue = getSPDenormModeValue(
12554 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12555
12556 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12557 DisableDenorm =
12558 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12559 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12560 .getNode();
12561 } else {
12562 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12563 const SDValue DisableDenormValue =
12564 HasDynamicDenormals
12565 ? SavedDenormMode
12566 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12567
12568 DisableDenorm = DAG.getMachineNode(
12569 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12570 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12571 }
12572
12573 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12574 SDValue(DisableDenorm, 0), DAG.getRoot());
12575 DAG.setRoot(OutputChain);
12576 }
12577
12578 SDValue Scale = NumeratorScaled.getValue(1);
12579 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12580 {Fma4, Fma1, Fma3, Scale}, Flags);
12581
12582 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12583}
12584
12585SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12586 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12587 return FastLowered;
12588
12589 SDLoc SL(Op);
12590 SDValue X = Op.getOperand(0);
12591 SDValue Y = Op.getOperand(1);
12592
12593 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12594
12595 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12596
12597 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12598
12599 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12600
12601 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12602
12603 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12604
12605 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12606
12607 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12608
12609 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12610
12611 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12612 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12613
12614 SDValue Fma4 =
12615 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12616
12617 SDValue Scale;
12618
12619 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12620 // Workaround a hardware bug on SI where the condition output from div_scale
12621 // is not usable.
12622
12623 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12624
12625 // Figure out if the scale to use for div_fmas.
12626 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12627 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12628 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12629 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12630
12631 SDValue NumHi =
12632 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12633 SDValue DenHi =
12634 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12635
12636 SDValue Scale0Hi =
12637 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12638 SDValue Scale1Hi =
12639 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12640
12641 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12642 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12643 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12644 } else {
12645 Scale = DivScale1.getValue(1);
12646 }
12647
12648 SDValue Fmas =
12649 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12650
12651 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12652}
12653
12654SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12655 EVT VT = Op.getValueType();
12656
12657 if (VT == MVT::f32)
12658 return LowerFDIV32(Op, DAG);
12659
12660 if (VT == MVT::f64)
12661 return LowerFDIV64(Op, DAG);
12662
12663 if (VT == MVT::f16 || VT == MVT::bf16)
12664 return LowerFDIV16(Op, DAG);
12665
12666 llvm_unreachable("Unexpected type for fdiv");
12667}
12668
12669SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12670 SDLoc dl(Op);
12671 SDValue Val = Op.getOperand(0);
12672 EVT VT = Val.getValueType();
12673 EVT ResultExpVT = Op->getValueType(1);
12674 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12675
12676 SDValue Mant = DAG.getNode(
12678 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12679
12680 SDValue Exp = DAG.getNode(
12681 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12682 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12683
12684 if (Subtarget->hasFractBug()) {
12685 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12686 SDValue Inf =
12688
12689 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12690 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12691 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12692 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12693 }
12694
12695 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12696 return DAG.getMergeValues({Mant, CastExp}, dl);
12697}
12698
// Custom store legalization: i1 stores become i32 truncating stores, and
// vector stores are split/scalarized/expanded per address space limits.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  // Lower i1 stores as an i32 truncating store of the sign-extended value.
  if (VT == MVT::i1) {
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  // Everything else reaching custom lowering should already be a vector of
  // i32 elements.
  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Work around the LDS misaligned-access hardware bug by splitting
  // underaligned multi-dword flat stores.
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)

  unsigned NumElements = VT.getVectorNumElements();
  // No single store instruction is wider than dwordx4.
  if (NumElements > 4)
    return SplitVectorStore(Op, DAG);
  // v3 stores not supported on SI.
  if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    return SplitVectorStore(Op, DAG);

      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);

  return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private (scratch) stores are limited by the configured maximum private
    // element size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
        Store->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    // Misaligned LDS/region accesses: split vectors, expand scalars.
    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
12778
12779// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12780SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12781 SDLoc SL(Op);
12782 assert(!Subtarget->has16BitInsts());
12783 SDNodeFlags Flags = Op->getFlags();
12784 SDValue Ext =
12785 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12786
12787 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12788 SDValue Sqrt =
12789 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12790
12791 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12792 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12793}
12794
// Correctly-rounded f32 sqrt expansion with input scaling for tiny values.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
  }

  // Scale inputs below 2^-96 up by 2^32 and compensate with 2^-16 on the
  // result (sqrt halves the exponent).
  SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, X, Flags)) {
    // Start from the hardware sqrt, then correct by at most one ulp in either
    // direction based on the sign of the residual x - s' * s.
    SDValue SqrtID =
        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
    SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);

    // Next representable value below the estimate (bitwise decrement).
    SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
    SDValue SqrtSNextDownInt =
        DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                    DAG.getAllOnesConstant(DL, MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);

    // Residual of the lower neighbor: x - nextdown(s) * s.
    SDValue SqrtVP =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    // Next representable value above the estimate (bitwise increment).
    SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
                                         DAG.getConstant(1, DL, MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
    // Residual of the upper neighbor: x - nextup(s) * s.
    SDValue SqrtVS =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);

    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
                        Flags);

    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
    SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
                        Flags);
  } else {
    // Denormals flushed: refine the hardware reciprocal sqrt estimate with
    // FMA-based Newton-Raphson steps.
    SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);

    SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
    SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
  }

  SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);

  // Undo the input scaling on the result.
  SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
  // sqrt of +/-0 or +inf must return the input value unchanged.
  SDValue IsZeroOrInf =
      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));

  return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
}
12884
12885SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12886 // For double type, the SQRT and RSQ instructions don't have required
12887 // precision, we apply Goldschmidt's algorithm to improve the result:
12888 //
12889 // y0 = rsq(x)
12890 // g0 = x * y0
12891 // h0 = 0.5 * y0
12892 //
12893 // r0 = 0.5 - h0 * g0
12894 // g1 = g0 * r0 + g0
12895 // h1 = h0 * r0 + h0
12896 //
12897 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12898 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12899 // h2 = h1 * r1 + h1
12900 //
12901 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12902 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12903 //
12904 // sqrt(x) = g3
12905
12906 SDNodeFlags Flags = Op->getFlags();
12907
12908 SDLoc DL(Op);
12909
12910 SDValue X = Op.getOperand(0);
12911 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12912
12913 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12914
12915 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12916
12917 // Scale up input if it is too small.
12918 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12919 SDValue ScaleUp =
12920 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12921 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12922
12923 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12924
12925 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12926
12927 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12928 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12929
12930 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12931 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12932
12933 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12934
12935 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12936
12937 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12938 SDValue SqrtD0 =
12939 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12940
12941 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12942
12943 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12944 SDValue SqrtD1 =
12945 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12946
12947 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12948
12949 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12950 SDValue ScaleDown =
12951 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12952 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12953
12954 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12955 // with finite only or nsz because rsq(+/-0) = +/-inf
12956
12957 // TODO: Check for DAZ and expand to subnormals
12958 SDValue IsZeroOrInf =
12959 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12960 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12961
12962 // If x is +INF, +0, or -0, use its original value
12963 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12964 Flags);
12965}
12966
12967SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12968 SDLoc DL(Op);
12969 EVT VT = Op.getValueType();
12970 SDValue Arg = Op.getOperand(0);
12971 SDValue TrigVal;
12972
12973 // Propagate fast-math flags so that the multiply we introduce can be folded
12974 // if Arg is already the result of a multiply by constant.
12975 auto Flags = Op->getFlags();
12976
12977 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12978
12979 if (Subtarget->hasTrigReducedRange()) {
12980 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12981 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12982 } else {
12983 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12984 }
12985
12986 switch (Op.getOpcode()) {
12987 case ISD::FCOS:
12988 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12989 case ISD::FSIN:
12990 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12991 default:
12992 llvm_unreachable("Wrong trig opcode");
12993 }
12994}
12995
// Lower cmpxchg for non-local address spaces to the target memory intrinsic
// node, packing {new, old} into a single vector operand.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  SDValue Old = Op.getOperand(2);
  SDValue New = Op.getOperand(3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(SimpleVT, 2);

  // Operand order is {new, old}, as the instruction expects.
  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  // Keep the original memory operand so alias analysis still sees the access.
  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
                                 Op->getVTList(), Ops, VT,
                                 AtomicNode->getMemOperand());
}
13024
13025//===----------------------------------------------------------------------===//
13026// Custom DAG optimizations
13027//===----------------------------------------------------------------------===//
13028
13029SDValue
13030SITargetLowering::performUCharToFloatCombine(SDNode *N,
13031 DAGCombinerInfo &DCI) const {
13032 EVT VT = N->getValueType(0);
13033 EVT ScalarVT = VT.getScalarType();
13034 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13035 return SDValue();
13036
13037 SelectionDAG &DAG = DCI.DAG;
13038 SDLoc DL(N);
13039
13040 SDValue Src = N->getOperand(0);
13041 EVT SrcVT = Src.getValueType();
13042
13043 // TODO: We could try to match extracting the higher bytes, which would be
13044 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13045 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13046 // about in practice.
13047 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13048 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13049 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13050 DCI.AddToWorklist(Cvt.getNode());
13051
13052 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13053 if (ScalarVT != MVT::f32) {
13054 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13055 DAG.getTargetConstant(0, DL, MVT::i32));
13056 }
13057 return Cvt;
13058 }
13059 }
13060
13061 return SDValue();
13062}
13063
// Split f64 fcopysign into an f32 copysign on the high half, and shrink wide
// sign operands down to the f32 element carrying the sign bit.
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    // View each f64 element as a pair of f32s; only the odd (high) halves
    // carry the sign bit.
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);

    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I, DL, MVT::i32));
      SDValue MagHi =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
                      DAG.getConstant(2 * I + 1, DL, MVT::i32));

      SDValue SignOpElt =
          MagVT.isVector()
                            SignOp, DAG.getConstant(I, DL, MVT::i32))
              : SignOp;

      // Apply the sign only to the high 32 bits; the low half is unchanged.
      SDValue HiOp =
          DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);

      SDValue Vector =
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

      SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
      NewElts.push_back(NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
                    DAG.getConstant(2 * I + 1, DL, MVT::i32));
    F32Signs.push_back(SignAsF32);
  }

  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
                        EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
                        F32Signs);

  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
                     NewSign);
}
13159
13160// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13161// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13162// bits
13163
13164// This is a variant of
13165// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13166//
13167// The normal DAG combiner will do this, but only if the add has one use since
13168// that would increase the number of instructions.
13169//
13170// This prevents us from seeing a constant offset that can be folded into a
13171// memory instruction's addressing mode. If we know the resulting add offset of
13172// a pointer can be folded into an addressing offset, we can replace the pointer
13173// operand with the add of new constant offset. This eliminates one of the uses,
13174// and may allow the remaining use to also be simplified.
13175//
13176SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13177 EVT MemVT,
13178 DAGCombinerInfo &DCI) const {
13179 SDValue N0 = N->getOperand(0);
13180 SDValue N1 = N->getOperand(1);
13181
13182 // We only do this to handle cases where it's profitable when there are
13183 // multiple uses of the add, so defer to the standard combine.
13184 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13185 return SDValue();
13186
13187 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13188 if (!CN1)
13189 return SDValue();
13190
13191 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13192 if (!CAdd)
13193 return SDValue();
13194
13195 SelectionDAG &DAG = DCI.DAG;
13196
13197 if (N0->getOpcode() == ISD::OR &&
13198 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13199 return SDValue();
13200
13201 // If the resulting offset is too large, we can't fold it into the
13202 // addressing mode offset.
13203 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13204 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13205
13206 AddrMode AM;
13207 AM.HasBaseReg = true;
13208 AM.BaseOffs = Offset.getSExtValue();
13209 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13210 return SDValue();
13211
13212 SDLoc SL(N);
13213 EVT VT = N->getValueType(0);
13214
13215 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13216 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13217
13218 SDNodeFlags Flags;
13219 Flags.setNoUnsignedWrap(
13220 N->getFlags().hasNoUnsignedWrap() &&
13221 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13222
13223 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13224 // be sure that the new left operand is a proper base pointer.
13225 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13226}
13227
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  case ISD::STORE:
    // Pointer follows the chain and the stored value.
    return 2;
  default:
    // Loads and most memory nodes keep the pointer right after the chain.
    return 1;
  }
}
13241
13242SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13243 DAGCombinerInfo &DCI) const {
13244 SelectionDAG &DAG = DCI.DAG;
13245
13246 unsigned PtrIdx = getBasePtrIndex(N);
13247 SDValue Ptr = N->getOperand(PtrIdx);
13248
13249 // TODO: We could also do this for multiplies.
13250 if (Ptr.getOpcode() == ISD::SHL) {
13251 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13252 N->getMemoryVT(), DCI);
13253 if (NewPtr) {
13254 SmallVector<SDValue, 8> NewOps(N->ops());
13255
13256 NewOps[PtrIdx] = NewPtr;
13257 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13258 }
13259 }
13260
13261 return SDValue();
13262}
13263
13264static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13265 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13266 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13267 (Opc == ISD::XOR && Val == 0);
13268}
13269
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Only worthwhile if a 32-bit half trivially folds, or the constant would
  // otherwise need a literal materialization (not an inline immediate).
  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    // We have 64-bit scalar and/or/xor, but do not have vector forms.
    if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
        !CRHS->user_begin()->isDivergent())
      return SDValue();

    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
13299
  // An i1 is a scalar (SGPR) boolean when it is produced by one of the
  // operations below, or a logical combination of such values.
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::SETCC:
  case ISD::IS_FPCLASS:
  case AMDGPUISD::FP_CLASS:
    return true;
  // Logical combinations of bool SGPRs remain bool SGPRs.
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
  // The second result (overflow flag) of these arithmetic ops is an i1.
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return V.getResNo() == 1;
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    // Address-space queries produce a scalar condition.
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
      return true;
    default:
      return false;
    }

    return false;
  }
  }
  return false;
}
13336
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  // Every byte that is not all-zero must be all-ones, otherwise the constant
  // cannot be represented as a per-byte select.
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
13355
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  // Only binary ops with a constant RHS can be expressed as a byte select.
  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    // Fully-kept bytes select themselves (0-3); fully-cleared bytes select
    // zero (0xc).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    // Bytes forced to all-ones select 0xff; the rest pass through.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    // Only whole-byte shift amounts map onto a byte permute.
    if (C % 8)
      return ~0;

    // Shift the identity selector left; vacated low bytes select zero (0xc).
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    // Shift the identity selector right; vacated high bytes select zero.
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
13403
// Custom combines for ISD::AND: 64-bit constant splitting, BFE formation,
// fp_class folding of float comparisons, sext-of-bool selects, and v_perm
// formation.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Split a 64-bit AND with a constant into two 32-bit halves when profitable.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE =
              DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
                          DAG.getConstant(Offset, SL, MVT::i32),
                          DAG.getConstant(Bits, SL, MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
                                    DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
        !isTypeLegal(X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |

        static_assert(
                0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
                           DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  // Canonicalize so a SETCC lands on the LHS and FP_CLASS on the RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(LHS, RHS);
    if (isBoolSGPR(RHS.getOperand(0)))
      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
                           DAG.getConstant(0, SDLoc(N), MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
13588
13589// A key component of v_perm is a mapping between byte position of the src
13590// operands, and the byte position of the dest. To provide such, we need: 1. the
13591// node that provides x byte of the dest of the OR, and 2. the byte of the node
13592// used to provide that x byte. calculateByteProvider finds which node provides
13593// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13594// and finds an ultimate src and byte position For example: The supported
13595// LoadCombine pattern for vector loads is as follows
13596// t1
13597// or
13598// / \
13599// t2 t3
13600// zext shl
13601// | | \
13602// t4 t5 16
13603// or anyext
13604// / \ |
13605// t6 t7 t8
13606// srl shl or
13607// / | / \ / \
13608// t9 t10 t11 t12 t13 t14
13609// trunc* 8 trunc* 8 and and
13610// | | / | | \
13611// t15 t16 t17 t18 t19 t20
13612// trunc* 255 srl -256
13613// | / \
13614// t15 t15 16
13615//
13616// *In this example, the truncs are from i32->i16
13617//
13618// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13619// respectively. calculateSrcByte would find (given node) -> ultimate src &
13620// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13621// After finding the mapping, we can combine the tree into vperm t15, t16,
13622// 0x05000407
13623
13624// Find the source and byte position from a node.
13625// \p DestByte is the byte position of the dest of the or that the src
13626// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13627// dest of the or byte. \p Depth tracks how many recursive iterations we have
13628// performed.
13629static const std::optional<ByteProvider<SDValue>>
13630calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13631 unsigned Depth = 0) {
13632 // We may need to recursively traverse a series of SRLs
13633 if (Depth >= 6)
13634 return std::nullopt;
13635
13636 if (Op.getValueSizeInBits() < 8)
13637 return std::nullopt;
13638
13639 if (Op.getValueType().isVector())
13640 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13641
13642 switch (Op->getOpcode()) {
13643 case ISD::TRUNCATE: {
13644 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13645 }
13646
13647 case ISD::SIGN_EXTEND:
13648 case ISD::ZERO_EXTEND:
13650 SDValue NarrowOp = Op->getOperand(0);
13651 auto NarrowVT = NarrowOp.getValueType();
13652 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13653 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13654 NarrowVT = VTSign->getVT();
13655 }
13656 if (!NarrowVT.isByteSized())
13657 return std::nullopt;
13658 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13659
13660 if (SrcIndex >= NarrowByteWidth)
13661 return std::nullopt;
13662 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13663 }
13664
13665 case ISD::SRA:
13666 case ISD::SRL: {
13667 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13668 if (!ShiftOp)
13669 return std::nullopt;
13670
13671 uint64_t BitShift = ShiftOp->getZExtValue();
13672
13673 if (BitShift % 8 != 0)
13674 return std::nullopt;
13675
13676 SrcIndex += BitShift / 8;
13677
13678 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13679 }
13680
13681 default: {
13682 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13683 }
13684 }
13685 llvm_unreachable("fully handled switch");
13686}
13687
13688// For a byte position in the result of an Or, traverse the tree and find the
13689// node (and the byte of the node) which ultimately provides this {Or,
13690// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13691// the byte position of the Op that corresponds with the originally requested
13692// byte of the Or \p Depth tracks how many recursive iterations we have
13693// performed. \p StartingIndex is the originally requested byte of the Or
13694static const std::optional<ByteProvider<SDValue>>
13695calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13696 unsigned StartingIndex = 0) {
13697 // Finding Src tree of RHS of or typically requires at least 1 additional
13698 // depth
13699 if (Depth > 6)
13700 return std::nullopt;
13701
13702 unsigned BitWidth = Op.getScalarValueSizeInBits();
13703 if (BitWidth % 8 != 0)
13704 return std::nullopt;
13705 if (Index > BitWidth / 8 - 1)
13706 return std::nullopt;
13707
13708 bool IsVec = Op.getValueType().isVector();
13709 switch (Op.getOpcode()) {
13710 case ISD::OR: {
13711 if (IsVec)
13712 return std::nullopt;
13713
13714 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13715 StartingIndex);
13716 if (!RHS)
13717 return std::nullopt;
13718 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13719 StartingIndex);
13720 if (!LHS)
13721 return std::nullopt;
13722 // A well formed Or will have two ByteProviders for each byte, one of which
13723 // is constant zero
13724 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13725 return std::nullopt;
13726 if (!LHS || LHS->isConstantZero())
13727 return RHS;
13728 if (!RHS || RHS->isConstantZero())
13729 return LHS;
13730 return std::nullopt;
13731 }
13732
13733 case ISD::AND: {
13734 if (IsVec)
13735 return std::nullopt;
13736
13737 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13738 if (!BitMaskOp)
13739 return std::nullopt;
13740
13741 uint32_t BitMask = BitMaskOp->getZExtValue();
13742 // Bits we expect for our StartingIndex
13743 uint32_t IndexMask = 0xFF << (Index * 8);
13744
13745 if ((IndexMask & BitMask) != IndexMask) {
13746 // If the result of the and partially provides the byte, then it
13747 // is not well formatted
13748 if (IndexMask & BitMask)
13749 return std::nullopt;
13751 }
13752
13753 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13754 }
13755
13756 case ISD::FSHR: {
13757 if (IsVec)
13758 return std::nullopt;
13759
13760 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13761 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13762 if (!ShiftOp || Op.getValueType().isVector())
13763 return std::nullopt;
13764
13765 uint64_t BitsProvided = Op.getValueSizeInBits();
13766 if (BitsProvided % 8 != 0)
13767 return std::nullopt;
13768
13769 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13770 if (BitShift % 8)
13771 return std::nullopt;
13772
13773 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13774 uint64_t ByteShift = BitShift / 8;
13775
13776 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13777 uint64_t BytesProvided = BitsProvided / 8;
13778 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13779 NewIndex %= BytesProvided;
13780 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13781 }
13782
13783 case ISD::SRA:
13784 case ISD::SRL: {
13785 if (IsVec)
13786 return std::nullopt;
13787
13788 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13789 if (!ShiftOp)
13790 return std::nullopt;
13791
13792 uint64_t BitShift = ShiftOp->getZExtValue();
13793 if (BitShift % 8)
13794 return std::nullopt;
13795
13796 auto BitsProvided = Op.getScalarValueSizeInBits();
13797 if (BitsProvided % 8 != 0)
13798 return std::nullopt;
13799
13800 uint64_t BytesProvided = BitsProvided / 8;
13801 uint64_t ByteShift = BitShift / 8;
13802 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13803 // If the byte we are trying to provide (as tracked by index) falls in this
13804 // range, then the SRL provides the byte. The byte of interest of the src of
13805 // the SRL is Index + ByteShift
13806 return BytesProvided - ByteShift > Index
13807 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13808 Index + ByteShift)
13810 }
13811
13812 case ISD::SHL: {
13813 if (IsVec)
13814 return std::nullopt;
13815
13816 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13817 if (!ShiftOp)
13818 return std::nullopt;
13819
13820 uint64_t BitShift = ShiftOp->getZExtValue();
13821 if (BitShift % 8 != 0)
13822 return std::nullopt;
13823 uint64_t ByteShift = BitShift / 8;
13824
13825 // If we are shifting by an amount greater than (or equal to)
13826 // the index we are trying to provide, then it provides 0s. If not,
13827 // then this bytes are not definitively 0s, and the corresponding byte
13828 // of interest is Index - ByteShift of the src
13829 return Index < ByteShift
13831 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13832 Depth + 1, StartingIndex);
13833 }
13834 case ISD::ANY_EXTEND:
13835 case ISD::SIGN_EXTEND:
13836 case ISD::ZERO_EXTEND:
13838 case ISD::AssertZext:
13839 case ISD::AssertSext: {
13840 if (IsVec)
13841 return std::nullopt;
13842
13843 SDValue NarrowOp = Op->getOperand(0);
13844 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13845 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13846 Op->getOpcode() == ISD::AssertZext ||
13847 Op->getOpcode() == ISD::AssertSext) {
13848 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13849 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13850 }
13851 if (NarrowBitWidth % 8 != 0)
13852 return std::nullopt;
13853 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13854
13855 if (Index >= NarrowByteWidth)
13856 return Op.getOpcode() == ISD::ZERO_EXTEND
13857 ? std::optional<ByteProvider<SDValue>>(
13859 : std::nullopt;
13860 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13861 }
13862
13863 case ISD::TRUNCATE: {
13864 if (IsVec)
13865 return std::nullopt;
13866
13867 uint64_t NarrowByteWidth = BitWidth / 8;
13868
13869 if (NarrowByteWidth >= Index) {
13870 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13871 StartingIndex);
13872 }
13873
13874 return std::nullopt;
13875 }
13876
13877 case ISD::CopyFromReg: {
13878 if (BitWidth / 8 > Index)
13879 return calculateSrcByte(Op, StartingIndex, Index);
13880
13881 return std::nullopt;
13882 }
13883
13884 case ISD::LOAD: {
13885 auto *L = cast<LoadSDNode>(Op.getNode());
13886
13887 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13888 if (NarrowBitWidth % 8 != 0)
13889 return std::nullopt;
13890 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13891
13892 // If the width of the load does not reach byte we are trying to provide for
13893 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13894 // question
13895 if (Index >= NarrowByteWidth) {
13896 return L->getExtensionType() == ISD::ZEXTLOAD
13897 ? std::optional<ByteProvider<SDValue>>(
13899 : std::nullopt;
13900 }
13901
13902 if (NarrowByteWidth > Index) {
13903 return calculateSrcByte(Op, StartingIndex, Index);
13904 }
13905
13906 return std::nullopt;
13907 }
13908
13909 case ISD::BSWAP: {
13910 if (IsVec)
13911 return std::nullopt;
13912
13913 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13914 Depth + 1, StartingIndex);
13915 }
13916
13918 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13919 if (!IdxOp)
13920 return std::nullopt;
13921 auto VecIdx = IdxOp->getZExtValue();
13922 auto ScalarSize = Op.getScalarValueSizeInBits();
13923 if (ScalarSize < 32)
13924 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13925 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13926 StartingIndex, Index);
13927 }
13928
13929 case AMDGPUISD::PERM: {
13930 if (IsVec)
13931 return std::nullopt;
13932
13933 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13934 if (!PermMask)
13935 return std::nullopt;
13936
13937 auto IdxMask =
13938 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13939 if (IdxMask > 0x07 && IdxMask != 0x0c)
13940 return std::nullopt;
13941
13942 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13943 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13944
13945 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13948 }
13949
13950 default: {
13951 return std::nullopt;
13952 }
13953 }
13954
13955 llvm_unreachable("fully handled switch");
13956}
13957
13958// Returns true if the Operand is a scalar and is 16 bits
13959static bool isExtendedFrom16Bits(SDValue &Operand) {
13960
13961 switch (Operand.getOpcode()) {
13962 case ISD::ANY_EXTEND:
13963 case ISD::SIGN_EXTEND:
13964 case ISD::ZERO_EXTEND: {
13965 auto OpVT = Operand.getOperand(0).getValueType();
13966 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13967 }
13968 case ISD::LOAD: {
13969 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13970 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13971 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13972 ExtType == ISD::EXTLOAD) {
13973 auto MemVT = L->getMemoryVT();
13974 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13975 }
13976 return L->getMemoryVT().getSizeInBits() == 16;
13977 }
13978 default:
13979 return false;
13980 }
13981}
13982
// Returns true if the mask selects two consecutive bytes and the first byte
// begins at an even byte offset, i.e. the pair is directly addressable as a
// 16-bit operand.
static bool addresses16Bits(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  // Only real byte selectors (0-7) are expected here; 0x0c/0xff selectors
  // must have been filtered out by the caller.
  assert(Low8 < 8 && Hi8 < 8);
  // Are the bytes contiguous in the order of increasing addresses.
  bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at location that is aligned for 16 bit instructions.
  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we still need code to extract the 16 bit operand, so it
  // is better to use i8 v_perm
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
}
14000
14001// Do not lower into v_perm if the operands are actually 16 bit
14002// and the selected bits (based on PermMask) correspond with two
14003// easily addressable 16 bit operands.
14005 SDValue &OtherOp) {
14006 int Low16 = PermMask & 0xffff;
14007 int Hi16 = (PermMask & 0xffff0000) >> 16;
14008
14009 auto TempOp = peekThroughBitcasts(Op);
14010 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14011
14012 auto OpIs16Bit =
14013 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14014 if (!OpIs16Bit)
14015 return true;
14016
14017 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14018 isExtendedFrom16Bits(TempOtherOp);
14019 if (!OtherOpIs16Bit)
14020 return true;
14021
14022 // Do we cleanly address both
14023 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14024}
14025
14027 unsigned DWordOffset) {
14028 SDValue Ret;
14029
14030 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14031 // ByteProvider must be at least 8 bits
14032 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14033
14034 if (TypeSize <= 32)
14035 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14036
14037 if (Src.getValueType().isVector()) {
14038 auto ScalarTySize = Src.getScalarValueSizeInBits();
14039 auto ScalarTy = Src.getValueType().getScalarType();
14040 if (ScalarTySize == 32) {
14041 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14042 DAG.getConstant(DWordOffset, SL, MVT::i32));
14043 }
14044 if (ScalarTySize > 32) {
14045 Ret = DAG.getNode(
14046 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14047 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14048 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14049 if (ShiftVal)
14050 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14051 DAG.getConstant(ShiftVal, SL, MVT::i32));
14052 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14053 }
14054
14055 assert(ScalarTySize < 32);
14056 auto NumElements = TypeSize / ScalarTySize;
14057 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14058 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14059 auto NumElementsIn32 = 32 / ScalarTySize;
14060 auto NumAvailElements = DWordOffset < Trunc32Elements
14061 ? NumElementsIn32
14062 : NumElements - NormalizedTrunc;
14063
14065 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14066 NumAvailElements);
14067
14068 Ret = DAG.getBuildVector(
14069 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14070 VecSrcs);
14071 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14072 }
14073
14074 /// Scalar Type
14075 auto ShiftVal = 32 * DWordOffset;
14076 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14077 DAG.getConstant(ShiftVal, SL, MVT::i32));
14078 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14079}
14080
14082 SelectionDAG &DAG = DCI.DAG;
14083 [[maybe_unused]] EVT VT = N->getValueType(0);
14085
14086 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14087 assert(VT == MVT::i32);
14088 for (int i = 0; i < 4; i++) {
14089 // Find the ByteProvider that provides the ith byte of the result of OR
14090 std::optional<ByteProvider<SDValue>> P =
14091 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14092 // TODO support constantZero
14093 if (!P || P->isConstantZero())
14094 return SDValue();
14095
14096 PermNodes.push_back(*P);
14097 }
14098 if (PermNodes.size() != 4)
14099 return SDValue();
14100
14101 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14102 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14103 uint64_t PermMask = 0x00000000;
14104 for (size_t i = 0; i < PermNodes.size(); i++) {
14105 auto PermOp = PermNodes[i];
14106 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14107 // by sizeof(Src2) = 4
14108 int SrcByteAdjust = 4;
14109
14110 // If the Src uses a byte from a different DWORD, then it corresponds
14111 // with a difference source
14112 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14113 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14114 if (SecondSrc)
14115 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14116 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14117 return SDValue();
14118
14119 // Set the index of the second distinct Src node
14120 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14121 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14122 SrcByteAdjust = 0;
14123 }
14124 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14126 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14127 }
14128 SDLoc DL(N);
14129 SDValue Op = *PermNodes[FirstSrc.first].Src;
14130 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14131 assert(Op.getValueSizeInBits() == 32);
14132
14133 // Check that we are not just extracting the bytes in order from an op
14134 if (!SecondSrc) {
14135 int Low16 = PermMask & 0xffff;
14136 int Hi16 = (PermMask & 0xffff0000) >> 16;
14137
14138 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14139 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14140
14141 // The perm op would really just produce Op. So combine into Op
14142 if (WellFormedLow && WellFormedHi)
14143 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14144 }
14145
14146 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14147
14148 if (SecondSrc) {
14149 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14150 assert(OtherOp.getValueSizeInBits() == 32);
14151 }
14152
14153 // Check that we haven't just recreated the same FSHR node.
14154 if (N->getOpcode() == ISD::FSHR &&
14155 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14156 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14157 return SDValue();
14158
14159 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14160
14161 assert(Op.getValueType().isByteSized() &&
14162 OtherOp.getValueType().isByteSized());
14163
14164 // If the ultimate src is less than 32 bits, then we will only be
14165 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14166 // CalculateByteProvider would not have returned Op as source if we
14167 // used a byte that is outside its ValueType. Thus, we are free to
14168 // ANY_EXTEND as the extended bits are dont-cares.
14169 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14170 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14171
14172 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14173 DAG.getConstant(PermMask, DL, MVT::i32));
14174 }
14175 return SDValue();
14176}
14177
14178SDValue SITargetLowering::performOrCombine(SDNode *N,
14179 DAGCombinerInfo &DCI) const {
14180 SelectionDAG &DAG = DCI.DAG;
14181 SDValue LHS = N->getOperand(0);
14182 SDValue RHS = N->getOperand(1);
14183
14184 EVT VT = N->getValueType(0);
14185 if (VT == MVT::i1) {
14186 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14187 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14188 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14189 SDValue Src = LHS.getOperand(0);
14190 if (Src != RHS.getOperand(0))
14191 return SDValue();
14192
14193 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14194 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14195 if (!CLHS || !CRHS)
14196 return SDValue();
14197
14198 // Only 10 bits are used.
14199 static const uint32_t MaxMask = 0x3ff;
14200
14201 uint32_t NewMask =
14202 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14203 SDLoc DL(N);
14204 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14205 DAG.getConstant(NewMask, DL, MVT::i32));
14206 }
14207
14208 return SDValue();
14209 }
14210
14211 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14213 LHS.getOpcode() == AMDGPUISD::PERM &&
14214 isa<ConstantSDNode>(LHS.getOperand(2))) {
14215 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14216 if (!Sel)
14217 return SDValue();
14218
14219 Sel |= LHS.getConstantOperandVal(2);
14220 SDLoc DL(N);
14221 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14222 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14223 }
14224
14225 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14226 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14227 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14228 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14229
14230 // If all the uses of an or need to extract the individual elements, do not
14231 // attempt to lower into v_perm
14232 auto usesCombinedOperand = [](SDNode *OrUse) {
14233 // If we have any non-vectorized use, then it is a candidate for v_perm
14234 if (OrUse->getOpcode() != ISD::BITCAST ||
14235 !OrUse->getValueType(0).isVector())
14236 return true;
14237
14238 // If we have any non-vectorized use, then it is a candidate for v_perm
14239 for (auto *VUser : OrUse->users()) {
14240 if (!VUser->getValueType(0).isVector())
14241 return true;
14242
14243 // If the use of a vector is a store, then combining via a v_perm
14244 // is beneficial.
14245 // TODO -- whitelist more uses
14246 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14247 if (VUser->getOpcode() == VectorwiseOp)
14248 return true;
14249 }
14250 return false;
14251 };
14252
14253 if (!any_of(N->users(), usesCombinedOperand))
14254 return SDValue();
14255
14256 uint32_t LHSMask = getPermuteMask(LHS);
14257 uint32_t RHSMask = getPermuteMask(RHS);
14258
14259 if (LHSMask != ~0u && RHSMask != ~0u) {
14260 // Canonicalize the expression in an attempt to have fewer unique masks
14261 // and therefore fewer registers used to hold the masks.
14262 if (LHSMask > RHSMask) {
14263 std::swap(LHSMask, RHSMask);
14264 std::swap(LHS, RHS);
14265 }
14266
14267 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14268 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14269 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14270 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14271
14272 // Check of we need to combine values from two sources within a byte.
14273 if (!(LHSUsedLanes & RHSUsedLanes) &&
14274 // If we select high and lower word keep it for SDWA.
14275 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14276 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14277 // Kill zero bytes selected by other mask. Zero value is 0xc.
14278 LHSMask &= ~RHSUsedLanes;
14279 RHSMask &= ~LHSUsedLanes;
14280 // Add 4 to each active LHS lane
14281 LHSMask |= LHSUsedLanes & 0x04040404;
14282 // Combine masks
14283 uint32_t Sel = LHSMask | RHSMask;
14284 SDLoc DL(N);
14285
14286 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14287 RHS.getOperand(0),
14288 DAG.getConstant(Sel, DL, MVT::i32));
14289 }
14290 }
14291 if (LHSMask == ~0u || RHSMask == ~0u) {
14292 if (SDValue Perm = matchPERM(N, DCI))
14293 return Perm;
14294 }
14295 }
14296
14297 // Detect identity v2i32 OR and replace with identity source node.
14298 // Specifically an Or that has operands constructed from the same source node
14299 // via extract_vector_elt and build_vector. I.E.
14300 // v2i32 or(
14301 // v2i32 build_vector(
14302 // i32 extract_elt(%IdentitySrc, 0),
14303 // i32 0
14304 // ),
14305 // v2i32 build_vector(
14306 // i32 0,
14307 // i32 extract_elt(%IdentitySrc, 1)
14308 // ) )
14309 // =>
14310 // v2i32 %IdentitySrc
14311
14312 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14313 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14314
14315 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14316 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14317
14318 // Test for and normalise build vectors.
14319 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14320
14321 // Get the extract_vector_element operands.
14322 SDValue LEVE = LHS->getOperand(0);
14323 SDValue REVE = RHS->getOperand(1);
14324
14325 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14327 // Check that different elements from the same vector are
14328 // extracted.
14329 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14330 LEVE->getOperand(1) != REVE->getOperand(1)) {
14331 SDValue IdentitySrc = LEVE.getOperand(0);
14332 return IdentitySrc;
14333 }
14334 }
14335 }
14336 }
14337
14338 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14339 return SDValue();
14340
14341 // TODO: This could be a generic combine with a predicate for extracting the
14342 // high half of an integer being free.
14343
14344 // (or i64:x, (zero_extend i32:y)) ->
14345 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14346 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14347 RHS.getOpcode() != ISD::ZERO_EXTEND)
14348 std::swap(LHS, RHS);
14349
14350 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14351 SDValue ExtSrc = RHS.getOperand(0);
14352 EVT SrcVT = ExtSrc.getValueType();
14353 if (SrcVT == MVT::i32) {
14354 SDLoc SL(N);
14355 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14356 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14357
14358 DCI.AddToWorklist(LowOr.getNode());
14359 DCI.AddToWorklist(HiBits.getNode());
14360
14361 SDValue Vec =
14362 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14363 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14364 }
14365 }
14366
14367 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14368 if (CRHS) {
14369 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14370 N->getOperand(0), CRHS))
14371 return Split;
14372 }
14373
14374 return SDValue();
14375}
14376
14377SDValue SITargetLowering::performXorCombine(SDNode *N,
14378 DAGCombinerInfo &DCI) const {
14379 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14380 return RV;
14381
14382 SDValue LHS = N->getOperand(0);
14383 SDValue RHS = N->getOperand(1);
14384
14385 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14386 SelectionDAG &DAG = DCI.DAG;
14387
14388 EVT VT = N->getValueType(0);
14389 if (CRHS && VT == MVT::i64) {
14390 if (SDValue Split =
14391 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14392 return Split;
14393 }
14394
14395 // v2i32 (xor (vselect cc, x, y), K) ->
14396 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14397 // replaced with source modifiers when the select is lowered to CNDMASK.
14398 unsigned Opc = LHS.getOpcode();
14399 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14400 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14401 CRHS && CRHS->getAPIntValue().isSignMask()) {
14402 SDValue CC = LHS->getOperand(0);
14403 SDValue TRUE = LHS->getOperand(1);
14404 SDValue FALSE = LHS->getOperand(2);
14405 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14406 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14407 SDValue XSelect =
14408 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14409 return XSelect;
14410 }
14411
14412 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14413 // fneg-like xors into 64-bit select.
14414 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14415 // This looks like an fneg, try to fold as a source modifier.
14416 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14418 // xor (select c, a, b), 0x80000000 ->
14419 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14420 SDLoc DL(N);
14421 SDValue CastLHS =
14422 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14423 SDValue CastRHS =
14424 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14425 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14426 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14427 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14428 LHS->getOperand(0), FNegLHS, FNegRHS);
14429 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14430 }
14431 }
14432
14433 return SDValue();
14434}
14435
14436SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14437 DAGCombinerInfo &DCI) const {
14438 if (!Subtarget->has16BitInsts() ||
14439 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14440 return SDValue();
14441
14442 EVT VT = N->getValueType(0);
14443 if (VT != MVT::i32)
14444 return SDValue();
14445
14446 SDValue Src = N->getOperand(0);
14447 if (Src.getValueType() != MVT::i16)
14448 return SDValue();
14449
14450 return SDValue();
14451}
14452
// Fold sign_extend_inreg of unsigned buffer loads into the corresponding
// signed buffer-load node, avoiding a separate extension instruction.
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    // The signed load always produces i32; truncate back to the original
    // (possibly narrower) result type afterwards.
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue Ops[] = {
        Src.getOperand(0), // source register
        Src.getOperand(1), // offset
        Src.getOperand(2)  // cachePolicy
    };
    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    return LoadVal;
  }
  // Same idea for the VMEM buffer loads: buffer_load_ubyte/ushort plus an
  // inreg sign extension becomes buffer_load_byte/short. Only done when the
  // load has a single use so we do not duplicate the memory access.
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    SDValue Ops[] = {Src.getOperand(0), // Chain
                     Src.getOperand(1), // rsrc
                     Src.getOperand(2), // vindex
                     Src.getOperand(3), // voffset
                     Src.getOperand(4), // soffset
                     Src.getOperand(5), // offset
                     Src.getOperand(6), Src.getOperand(7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    // Return both the loaded value and the chain of the new load.
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
  return SDValue();
}
14511
14512SDValue SITargetLowering::performClassCombine(SDNode *N,
14513 DAGCombinerInfo &DCI) const {
14514 SelectionDAG &DAG = DCI.DAG;
14515 SDValue Mask = N->getOperand(1);
14516
14517 // fp_class x, 0 -> false
14518 if (isNullConstant(Mask))
14519 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14520
14521 if (N->getOperand(0).isUndef())
14522 return DAG.getUNDEF(MVT::i1);
14523
14524 return SDValue();
14525}
14526
14527SDValue SITargetLowering::performRcpCombine(SDNode *N,
14528 DAGCombinerInfo &DCI) const {
14529 EVT VT = N->getValueType(0);
14530 SDValue N0 = N->getOperand(0);
14531
14532 if (N0.isUndef()) {
14533 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14534 SDLoc(N), VT);
14535 }
14536
14537 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14538 N0.getOpcode() == ISD::SINT_TO_FP)) {
14539 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14540 N->getFlags());
14541 }
14542
14543 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14544 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14545 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14546 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14547 N->getFlags());
14548 }
14549
14551}
14552
                                       unsigned MaxDepth) const {
  // NOTE(review): the opening line of this signature (the function name and
  // first parameters) is missing from this listing; verify against upstream.
  unsigned Opcode = Op.getOpcode();
  // An explicit canonicalize is canonical by definition.
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  // FP constants: signaling NaNs are never canonical; denormal constants are
  // only canonical when the denormal mode for their semantics is IEEE.
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
  case ISD::FLDEXP:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::AND:
    if (Op.getValueType() == MVT::i32) {
      // Be careful as we only know it is a bitcast floating point type. It
      // could be f32, v2f16, we have no way of knowing. Luckily the constant
      // value that we optimize for, which comes up in fp32 to bf16 conversions,
      // is valid to optimize for all types.
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
        }
      }
    }
    break;

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations different based these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum..

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(DAG, Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    // A select is canonical iff both of its value operands are.
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    // A vector is canonical iff every element is.
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  // NOTE(review): one or more case labels are missing from this listing ahead
  // of this single-operand recursive check; verify against upstream.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  // NOTE(review): a case label is missing from this listing ahead of this
  // two-operand recursive check; verify against upstream.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST:
    // TODO: This is incorrect as it loses track of the operand's type. We may
    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
    // same bits that are canonicalized in one type need not be in the other.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  case ISD::TRUNCATE: {
    // Hack round the mess we make when legalizing extract_vector_elt
    if (Op.getValueType() == MVT::i16) {
      SDValue TruncSrc = Op.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }
    return false;
  }
  // NOTE(review): a case label (presumably ISD::INTRINSIC_WO_CHAIN per the
  // getConstantOperandVal(0) use below) is missing from this listing; verify
  // against upstream.
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  // FIXME: denormalsEnabledForType is broken for dynamic
  return denormalsEnabledForType(DAG, Op.getValueType()) &&
         DAG.isKnownNeverSNaN(Op);
}
14755
                                       unsigned MaxDepth) const {
  // NOTE(review): the opening line of this signature (the function name and
  // first parameters of the GlobalISel overload) is missing from this listing;
  // verify against upstream.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  // An explicit canonicalize is canonical by definition.
  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  // NOTE(review): the matcher guard populating FCR for this block is missing
  // from this listing; verify against upstream.
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    // Denormal constants are canonical only under the IEEE denormal mode.
    DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // Standard FP operations flush denormals as required, so their results are
  // canonical.
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // Sign-bit operations preserve the payload; recurse into the source.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    // Min/max results are canonical if the subtarget honors the denormal mode
    // for these operations; otherwise fall through and check every operand.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(MRI.getType(Reg), MF))
      return true;

    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    // Canonical iff every source operand (skipping the def) is canonical.
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
      if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    // These AMDGPU intrinsics are known to produce canonical results.
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    return false;
  }

  llvm_unreachable("invalid operation");
}
14874
14875// Constant fold canonicalize.
14876SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14877 const SDLoc &SL, EVT VT,
14878 const APFloat &C) const {
14879 // Flush denormals to 0 if not enabled.
14880 if (C.isDenormal()) {
14881 DenormalMode Mode =
14882 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14883 if (Mode == DenormalMode::getPreserveSign()) {
14884 return DAG.getConstantFP(
14885 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14886 }
14887
14888 if (Mode != DenormalMode::getIEEE())
14889 return SDValue();
14890 }
14891
14892 if (C.isNaN()) {
14893 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14894 if (C.isSignaling()) {
14895 // Quiet a signaling NaN.
14896 // FIXME: Is this supposed to preserve payload bits?
14897 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14898 }
14899
14900 // Make sure it is the canonical NaN bitpattern.
14901 //
14902 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14903 // immediate?
14904 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14905 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14906 }
14907
14908 // Already canonical.
14909 return DAG.getConstantFP(C, SL, VT);
14910}
14911
  // NOTE(review): the signature line of this small helper is missing from
  // this listing; verify against upstream.
  // An element needs no explicit canonicalize if it is undef or a constant
  // (constants are folded via getCanonicalConstantFP instead).
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}
14915
// Combine on ISD::FCANONICALIZE: fold undef/constant inputs and distribute
// the canonicalize over two-element build_vectors.
SDValue
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    // NOTE(review): the line materializing the QNaN value used below is
    // missing from this listing; verify against upstream.
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  // Scalar or splat constant: fold directly to its canonical form.
  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  // (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0

  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    // NOTE(review): a guard line opening this brace scope (presumably testing
    // whether Lo/Hi fold away) is missing from this listing; verify upstream.
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] =
              getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef, and one is constant, prefer a splat vector rather
      // than the normal qNaN. If it's a register, prefer 0.0 since that's
      // cheaper to use and may be free with a packed operation.
      // NOTE(review): the inner isa<> guard makes the ternary's else branch
      // unreachable here; appears redundant — verify against upstream.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
                           ? NewElts[1]
                           : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
                         ? NewElts[0]
                         : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }

  return SDValue();
}
14985
14986static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14987 switch (Opc) {
14988 case ISD::FMAXNUM:
14989 case ISD::FMAXNUM_IEEE:
14990 case ISD::FMAXIMUMNUM:
14991 return AMDGPUISD::FMAX3;
14992 case ISD::FMAXIMUM:
14993 return AMDGPUISD::FMAXIMUM3;
14994 case ISD::SMAX:
14995 return AMDGPUISD::SMAX3;
14996 case ISD::UMAX:
14997 return AMDGPUISD::UMAX3;
14998 case ISD::FMINNUM:
14999 case ISD::FMINNUM_IEEE:
15000 case ISD::FMINIMUMNUM:
15001 return AMDGPUISD::FMIN3;
15002 case ISD::FMINIMUM:
15003 return AMDGPUISD::FMINIMUM3;
15004 case ISD::SMIN:
15005 return AMDGPUISD::SMIN3;
15006 case ISD::UMIN:
15007 return AMDGPUISD::UMIN3;
15008 default:
15009 llvm_unreachable("Not a min/max opcode");
15010 }
15011}
15012
15013SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15014 const SDLoc &SL, SDValue Src,
15015 SDValue MinVal,
15016 SDValue MaxVal,
15017 bool Signed) const {
15018
15019 // med3 comes from
15020 // min(max(x, K0), K1), K0 < K1
15021 // max(min(x, K0), K1), K1 < K0
15022 //
15023 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15024 // min/max op.
15025 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15026 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15027
15028 if (!MinK || !MaxK)
15029 return SDValue();
15030
15031 if (Signed) {
15032 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15033 return SDValue();
15034 } else {
15035 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15036 return SDValue();
15037 }
15038
15039 EVT VT = MinK->getValueType(0);
15040 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15041 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15042 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15043
15044 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15045 // not available, but this is unlikely to be profitable as constants
15046 // will often need to be materialized & extended, especially on
15047 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15048 return SDValue();
15049}
15050
    // NOTE(review): this helper's signature and the scalar-constant dyn_cast
    // guarding this return are missing from this listing; verify upstream.
    return C;

  // NOTE(review): the BuildVectorSDNode dyn_cast opening this scope is
  // missing from this listing; verify upstream.
    // A build_vector with a uniform FP constant splat also qualifies.
    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
      return C;
  }

  return nullptr;
}
15062
// Try to fold min(max(x, K0), K1) with splat FP constants K0 <= K1 into
// clamp or fmed3. \p Op0 is the inner max node; \p Op1 is the outer min's
// constant operand.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  // med3 with a nan input acts like
  // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
  //
  // So the result depends on whether the IEEE mode bit is enabled or not with a
  // signaling nan input.
  // ieee=1
  // s0 snan: yields s2
  // s1 snan: yields s2
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)

  // ieee=0
  // s0 snan: min(s1, s2)
  // s1 snan: min(s0, s2)
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
  // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
  // can only form if op0 is fmaxnum_ieee if IEEE=1.
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    // Only profitable if each multi-use constant can be encoded as an inline
    // immediate (otherwise literals would need re-materializing).
    if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
                         SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
15136
15137/// \return true if the subtarget supports minimum3 and maximum3 with the given
15138/// base min/max opcode \p Opc for type \p VT.
15139static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15140 EVT VT) {
15141 switch (Opc) {
15142 case ISD::FMINNUM:
15143 case ISD::FMAXNUM:
15144 case ISD::FMINNUM_IEEE:
15145 case ISD::FMAXNUM_IEEE:
15146 case ISD::FMINIMUMNUM:
15147 case ISD::FMAXIMUMNUM:
15148 case AMDGPUISD::FMIN_LEGACY:
15149 case AMDGPUISD::FMAX_LEGACY:
15150 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15151 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15152 case ISD::FMINIMUM:
15153 case ISD::FMAXIMUM:
15154 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15155 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15156 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15157 case ISD::SMAX:
15158 case ISD::SMIN:
15159 case ISD::UMAX:
15160 case ISD::UMIN:
15161 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15162 default:
15163 return false;
15164 }
15165
15166 llvm_unreachable("not a min/max opcode");
15167}
15168
// Combine on min/max nodes: form 3-operand min3/max3, integer/FP med3, and
// downgrade fminimum/fmaximum to the cheaper IEEE forms when NaNs are absent.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  if (supportsMin3Max3(*Subtarget, Opc, VT)) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0.getOperand(0), Op0.getOperand(1), Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
                         Op0, Op1.getOperand(0), Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
      return Med3;
  }

  // if !is_snan(x):
  //   fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       // NOTE(review): two condition lines (the _IEEE and the NUM-variant
       // opcode pairs referenced in the comment above) are missing from this
       // listing; verify against upstream.
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
  // for some types, but at a higher cost since it's implemented with a 3
  // operand form.
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
      !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
    unsigned NewOpc =
        // NOTE(review): the expression selecting the replacement opcode is
        // missing from this listing; verify against upstream.
    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
  }

  return SDValue();
}
15257
      // NOTE(review): this helper's signature and the two ConstantFPSDNode
      // dyn_cast guards binding CA/CB are missing from this listing; verify
      // against upstream.
      // FIXME: Should this be allowing -0.0?
      // True iff the pair of constants is (0.0, 1.0) in either order — the
      // bounds pattern recognized as a clamp.
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }

  return false;
}
15269
15270// FIXME: Should only worry about snans for version with chain.
15271SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15272 DAGCombinerInfo &DCI) const {
15273 EVT VT = N->getValueType(0);
15274 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15275 // NaNs. With a NaN input, the order of the operands may change the result.
15276
15277 SelectionDAG &DAG = DCI.DAG;
15278 SDLoc SL(N);
15279
15280 SDValue Src0 = N->getOperand(0);
15281 SDValue Src1 = N->getOperand(1);
15282 SDValue Src2 = N->getOperand(2);
15283
15284 if (isClampZeroToOne(Src0, Src1)) {
15285 // const_a, const_b, x -> clamp is safe in all cases including signaling
15286 // nans.
15287 // FIXME: Should this be allowing -0.0?
15288 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15289 }
15290
15291 const MachineFunction &MF = DAG.getMachineFunction();
15292 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15293
15294 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15295 // handling no dx10-clamp?
15296 if (Info->getMode().DX10Clamp) {
15297 // If NaNs is clamped to 0, we are free to reorder the inputs.
15298
15299 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15300 std::swap(Src0, Src1);
15301
15302 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15303 std::swap(Src1, Src2);
15304
15305 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15306 std::swap(Src0, Src1);
15307
15308 if (isClampZeroToOne(Src1, Src2))
15309 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15310 }
15311
15312 return SDValue();
15313}
15314
15315SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15316 DAGCombinerInfo &DCI) const {
15317 SDValue Src0 = N->getOperand(0);
15318 SDValue Src1 = N->getOperand(1);
15319 if (Src0.isUndef() && Src1.isUndef())
15320 return DCI.DAG.getUNDEF(N->getValueType(0));
15321 return SDValue();
15322}
15323
// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
// NOTE(review): the first line of this signature is missing from this
// listing; verify against upstream.
                                                  unsigned NumElem,
                                                  bool IsDivergentIdx,
                                                  const GCNSubtarget *Subtarget) {
  // NOTE(review): the guard opening this early return (presumably testing the
  // divergent-register-indexing option declared at the top of this file) is
  // missing from this listing; verify against upstream.
    return false;

  unsigned VecSize = EltSize * NumElem;

  // Sub-dword vectors of size 2 dword or less have better implementation.
  if (VecSize <= 64 && EltSize < 32)
    return false;

  // Always expand the rest of sub-dword instructions, otherwise it will be
  // lowered via memory.
  if (EltSize < 32)
    return true;

  // Always do this if var-idx is divergent, otherwise it will become a loop.
  if (IsDivergentIdx)
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
  unsigned NumInsts = NumElem /* Number of compares */ +
                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;

  // On some architectures (GFX9) movrel is not available and it's better
  // to expand.
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;

  // If movrel is available, use it instead of expanding for vector of 8
  // elements.
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;

  return true;
}
15364
  // NOTE(review): the signature line of this SDNode overload is missing from
  // this listing; verify against upstream.
  // The index is the last operand for both extract and insert element nodes.
  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
  // A constant index selects a fixed lane and never needs expansion.
  if (isa<ConstantSDNode>(Idx))
    return false;

  SDValue Vec = N->getOperand(0);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltSize = EltVT.getSizeInBits();
  unsigned NumElem = VecVT.getVectorNumElements();

  // NOTE(review): the call line forwarding to the unsigned-parameter overload
  // is missing from this listing; verify against upstream.
      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
}
15379
/// DAG combine for EXTRACT_VECTOR_ELT: hoists fneg/fabs and vector binops
/// past the extract, scalarizes variable-index extracts into select chains,
/// and widens sub-dword element extracts of loaded vectors to 32-bit
/// extracts.
SDValue
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(0);

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned VecEltSize = VecEltVT.getSizeInBits();

  // Move fneg/fabs below the extract: apply the unary op to the extracted
  // scalar instead of the whole vector.
  if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue Elt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
  }

  // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
  // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
  // There are optimisations to transform 64-bit shifts into 32-bit shifts
  // depending on the shift operand. See e.g. performSraCombine().
  // This combine ensures that the optimisation is compatible with v2i32
  // legalised AND.
  if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
      Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {

    // NOTE(review): a non-0x1f mask returns SDValue() here, which also skips
    // every combine below for this node — confirm the early-out is intended.
    if (!C || C->getZExtValue() != 0x1f)
      return SDValue();

    SDLoc SL(N);
    SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
    SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
                              Vec->getOperand(0), N->getOperand(1));
    SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
    // No value is returned here; RAUW rewires users and control falls through
    // to the remaining combines.
    DAG.ReplaceAllUsesWith(N, A.getNode());
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  // =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
    // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      // Keep the original node's flags (e.g. fast-math) on the scalar op.
      return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue V;
    // Build a chain of selects comparing the variable index against each
    // constant lane index; element 0 seeds the chain.
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getVectorIdxConstant(I, SL);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    // Shift the requested element down to bit 0 of the 32-bit word, then
    // truncate to the element's integer width.
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
    DCI.AddToWorklist(Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
    }

    assert(ResVT.isScalarInteger());
    return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
  }

  return SDValue();
}
15519
/// DAG combine for INSERT_VECTOR_ELT: expands a variable-index insert into a
/// BUILD_VECTOR of per-lane selects between the inserted value and the
/// original element.
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  // Each lane keeps its old element unless the runtime index selects it.
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}
15548
/// Return the source of an fp_extend from f16 to f32, or a converted FP
/// constant.
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
  }

  // A constant may stand in for an extended f16 value if the conversion to
  // half is exact (no precision loss).
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
    if (!LosesInfo)
      return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
  }

  // Not an extension from f16 and not an exactly-representable constant.
  return SDValue();
}
15567
15568SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15569 DAGCombinerInfo &DCI) const {
15570 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15571 "combine only useful on gfx8");
15572
15573 SDValue TruncSrc = N->getOperand(0);
15574 EVT VT = N->getValueType(0);
15575 if (VT != MVT::f16)
15576 return SDValue();
15577
15578 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15579 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15580 return SDValue();
15581
15582 SelectionDAG &DAG = DCI.DAG;
15583 SDLoc SL(N);
15584
15585 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15586 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15587 // casting back.
15588
15589 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15590 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15591 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15592 if (!A)
15593 return SDValue();
15594
15595 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15596 if (!B)
15597 return SDValue();
15598
15599 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15600 if (!C)
15601 return SDValue();
15602
15603 // This changes signaling nan behavior. If an input is a signaling nan, it
15604 // would have been quieted by the fpext originally. We don't care because
15605 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15606 // we would be worse off than just doing the promotion.
15607 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15608 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15609 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15610 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15611}
15612
/// Pick the fused multiply-add opcode to combine nodes N0 and N1 into, or 0
/// if no fusion should be performed.
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 &&
      (VT == MVT::f16 && Subtarget->hasMadF16() &&
    return ISD::FMAD;

  // Otherwise fall back to FMA when fusion is globally enabled or both nodes
  // carry the allow-contract fast-math flag.
  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
    return ISD::FMA;
  }

  // 0 means "do not fuse".
  return 0;
}
15637
15638// For a reassociatable opcode perform:
15639// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15640SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15641 SelectionDAG &DAG) const {
15642 EVT VT = N->getValueType(0);
15643 if (VT != MVT::i32 && VT != MVT::i64)
15644 return SDValue();
15645
15646 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15647 return SDValue();
15648
15649 unsigned Opc = N->getOpcode();
15650 SDValue Op0 = N->getOperand(0);
15651 SDValue Op1 = N->getOperand(1);
15652
15653 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15654 return SDValue();
15655
15656 if (Op0->isDivergent())
15657 std::swap(Op0, Op1);
15658
15659 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15660 return SDValue();
15661
15662 SDValue Op2 = Op1.getOperand(1);
15663 Op1 = Op1.getOperand(0);
15664 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15665 return SDValue();
15666
15667 if (Op1->isDivergent())
15668 std::swap(Op1, Op2);
15669
15670 SDLoc SL(N);
15671 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15672 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15673}
15674
/// Build a MAD_[IU]64_[IU]32 computing N0 * N1 + N2. The node produces a
/// 64-bit result plus an i1 carry-out; only the 64-bit value is used here,
/// truncated to the requested type VT.
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                           SDValue N0, SDValue N1, SDValue N2, bool Signed) {
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
15682
// Fold
// y = lshr i64 x, 32
// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
// with Const.hi == -1
// To
// res = mad_u64_u32 y.lo ,Const.lo, x.lo
                               SDValue MulLHS, SDValue MulRHS,
                               SDValue AddRHS) {
  // Canonicalize the shift to the LHS of the multiply.
  if (MulRHS.getOpcode() == ISD::SRL)
    std::swap(MulLHS, MulRHS);

  if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
    return SDValue();

  // The shift must be exactly `x >> 32`, and the addend must be that same x.
  ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
  if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
      MulLHS.getOperand(0) != AddRHS)
    return SDValue();

  // The multiplier constant must have all-ones in its high 32 bits.
  if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
    return SDValue();

  // mad_u64_u32 of the low halves; the addend's low dword is zero-extended.
  SDValue ConstMul =
      DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
  return getMad64_32(DAG, SL, MVT::i64,
                     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
                     DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
}
15713
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->isAnyAdd());

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  // Only scalar types wider than 32 and at most 64 bits are handled.
  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // Canonicalize the multiply to the LHS; the caller guarantees one side is
  // a MUL.
  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(LHS, RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (!User->isAnyAdd())
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(0);
  SDValue MulRHS = LHS.getOperand(1);
  SDValue AddRHS = RHS;

  // Special-case (add (mul (srl x, 32), C), x) first.
  if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
    return FoldedMAD;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo =
        numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
    MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
    AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  // accum = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
    Accum = DAG.getBitcast(MVT::i64, Accum);
  }

  // Narrow back down if the original add was narrower than 64 bits.
  if (VT != MVT::i64)
    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
  return Accum;
}
15841
15842SDValue
15843SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15844 DAGCombinerInfo &DCI) const {
15845 SDValue RHS = N->getOperand(1);
15846 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15847 if (!CRHS)
15848 return SDValue();
15849
15850 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15851 // common.
15852 uint64_t Val = CRHS->getZExtValue();
15853 if (countr_zero(Val) >= 32) {
15854 SelectionDAG &DAG = DCI.DAG;
15855 SDLoc SL(N);
15856 SDValue LHS = N->getOperand(0);
15857
15858 // Avoid carry machinery if we know the low half of the add does not
15859 // contribute to the final result.
15860 //
15861 // add i64:x, K if computeTrailingZeros(K) >= 32
15862 // => build_pair (add x.hi, K.hi), x.lo
15863
15864 // Breaking the 64-bit add here with this strange constant is unlikely
15865 // to interfere with addressing mode patterns.
15866
15867 SDValue Hi = getHiHalf64(LHS, DAG);
15868 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15869 unsigned Opcode = N->getOpcode();
15870 if (Opcode == ISD::PTRADD)
15871 Opcode = ISD::ADD;
15872 SDValue AddHi =
15873 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15874
15875 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15876 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15877 }
15878
15879 return SDValue();
15880}
15881
15896
// Merge two v_perm byte-select masks into one. The byte 0x0c selects a
// constant-zero lane; each of the four byte lanes must be a zero-select in at
// least one of the inputs, so the merged lane takes the real selector when
// exactly one side is zero-selecting and stays 0x0c only when both are.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  unsigned Merged = 0;
  for (unsigned Shift = 0; Shift != 32; Shift += 8) {
    unsigned A = (First >> Shift) & 0xFF;
    unsigned B = (Second >> Shift) & 0xFF;
    // At least one side must carry the zero-select (0x0c) bits in this lane.
    assert((A & 0x0c) | (B & 0x0c));
    unsigned Lane = (A & ~0x0cu) | (B & ~0x0cu) | (A & B & 0x0c);
    Merged |= Lane << Shift;
  }
  return Merged;
}
15910
// One dword-sized source feeding a dot-product operand, together with the
// v_perm byte-select mask accumulated for it (0x0c lanes select constant
// zero).
struct DotSrc {
  int64_t PermMask;
};
15916
// Record the byte sources of one mul-operand pair at chain step `Step` in the
// running Src0s/Src1s dword lists, merging the new v_perm mask into an
// existing entry when the byte comes from a dword that is already tracked.
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {

  assert(Src0.Src.has_value() && Src1.Src.has_value());
  // Src0s and Src1s are empty, just place arbitrarily.
  if (Step == 0) {
    Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src0.SrcOffset / 4});
    Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src1.SrcOffset / 4});
    return;
  }

  // Try both pairings (Src0-first, then Src1-first) against the tracked
  // sources.
  for (int BPI = 0; BPI < 2; BPI++) {
    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
    if (BPI == 1) {
      BPP = {Src1, Src0};
    }
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find Src vector which contains our SDValue, if so, add our
    // perm mask to the existing one. If we are unable to find a match for the
    // first SDValue, attempt to find match for the second.
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };

      auto *Match = llvm::find_if(Srcs, MatchesFirst);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
        FirstGroup = I;
        break;
      }
    }
    if (FirstGroup != -1) {
      // The first byte landed in group `FirstGroup`, so the second byte
      // belongs to the opposite group (merged or appended there).
      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      auto *Match = llvm::find_if(Srcs, MatchesSecond);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
      } else
        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
      return;
    }
  }

  // If we have made it here, then we could not find a match in Src0s or Src1s
  // for either Src0 or Src1, so just place them arbitrarily.

  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      {*Src0.Src,
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src0.SrcOffset / 4});
  Src1s.push_back(
      {*Src1.Src,
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src1.SrcOffset / 4});
}
15992
// Materialize the collected DotSrc entries as a single i32 dot-product
// operand, emitting AMDGPUISD::PERM nodes with the accumulated byte-select
// masks and combining multiple sources with OR.
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
  }

  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(FirstElt);

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    // Redirect FirstElt's selected bytes to the second perm operand (byte
    // indices 4-7) while preserving its zero-select (0x0c) lanes.
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
    // original 0x0C.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(FirstMask, SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);

    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
                                SecondVal,
                                DAG.getConstant(PermMask, SL, MVT::i32)));

    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

      Perms.push_back(
          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
      break;
    }
  }

  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
             : Perms[0];
}
16060
16061static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16062 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16063 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16064 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16065 EntryMask += ZeroMask;
16066 }
16067}
16068
16069static bool isMul(const SDValue Op) {
16070 auto Opcode = Op.getOpcode();
16071
16072 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16073 Opcode == AMDGPUISD::MUL_I24);
16074}
16075
// Decide whether a dot4 built from this multiply should use signed (sdot4,
// true) or unsigned (udot4, false) semantics, based on the known sign bits
// of the two operands. std::nullopt means the operands' signedness is
// provably incompatible and the match should be abandoned.
static std::optional<bool>
                      ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                      const SDValue &S1Op, const SelectionDAG &DAG) {
  // If we both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 is irrelevant.
  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
    return false;

  // Classify each op by its known MSB: a known leading zero means unsigned,
  // a known leading one means (negative) signed.
  auto Known0 = DAG.computeKnownBits(S0Op, 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(S1Op, 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // There are 9 possible permutations of
  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}

  // In two permutations, the sign bits are known to be the same for both Ops,
  // so simply return Signed / Unsigned corresponding to the MSB

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // In another two permutations, the sign bits are known to be opposite. In
  // this case return std::nullopt to indicate a bad match.

  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown is if it was sign extended from unknown value, or if it
  // was any extended. In either case, it is correct to use the signed
  // version of the signedness semantics of dot4

  // In two of such permutations, we known the sign bit is set for
  // one op, and the other is unknown. It is okay to used signed version of
  // dot4.
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
    return true;

  // In one such permutation, we don't know either of the sign bits. It is okay
  // to used the signed version of dot4.
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
    return true;

  // In two of such permutations, we known the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;

  llvm_unreachable("Fully covered condition");
}
16138
/// DAG combine for ADD: tries mad64_32 formation, scalar reassociation,
/// splitting 64-bit adds with zero low constant bits, v_dot4 formation, and
/// carry-op folds with setcc/uaddo_carry operands.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;

    // Match the v_dot4 tree, while collecting src nodes.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      // Each mul operand must reduce to a single byte source.
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
      if (!Src1)
        break;

      // All muls in the chain must agree on signed vs. unsigned semantics.
      auto IterIsSigned = checkDot4MulSignedness(
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(*Src0, *Src1, Src0s, Src1s, I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
        auto Src0 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
        // The accumulator for a fully matched chain is zero.
        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
        ChainLength = I + 2;
        break;
      }

      // Descend into the addend for the next chain step.
      TempNode = TempNode->getOperand(AddIdx);
      Src2s.push_back(TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    }

    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Src0s, ChainLength);
      fixMasks(Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutation).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      // The mask must select each source byte exactly once.
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(SrcBytes, NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto *FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

        auto *SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
                                              SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
                                             MVT::getIntegerVT(32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
                                             MVT::getIntegerVT(32));
      }
    }

    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Src0s, false, true);
      Src1 = resolveSources(DAG, SL, Src1s, false, true);
    }

    assert(IsSigned);
    SDValue Src2 =
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);

    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
                                                  : Intrinsic::amdgcn_udot4,
                                        SL, MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));

    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(RHS.getOperand(1)))
      break;
    SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
    return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
  }
  }
  return SDValue();
}
16342
16343SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16344 DAGCombinerInfo &DCI) const {
16345 SelectionDAG &DAG = DCI.DAG;
16346 SDLoc DL(N);
16347 EVT VT = N->getValueType(0);
16348 SDValue N0 = N->getOperand(0);
16349 SDValue N1 = N->getOperand(1);
16350
16351 // The following folds transform PTRADDs into regular arithmetic in cases
16352 // where the PTRADD wouldn't be folded as an immediate offset into memory
16353 // instructions anyway. They are target-specific in that other targets might
16354 // prefer to not lose information about the pointer arithmetic.
16355
16356 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16357 // Adapted from DAGCombiner::visitADDLikeCommutative.
16358 SDValue V, K;
16359 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16360 SDNodeFlags ShlFlags = N1->getFlags();
16361 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16362 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16363 // preserved.
16364 SDNodeFlags NewShlFlags =
16365 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16367 : SDNodeFlags();
16368 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16369 DCI.AddToWorklist(Inner.getNode());
16370 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16371 }
16372
16373 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16374 // performAddCombine.
16375 if (N1.getOpcode() == ISD::MUL) {
16376 if (Subtarget->hasMad64_32()) {
16377 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16378 return Folded;
16379 }
16380 }
16381
16382 // If the 32 low bits of the constant are all zero, there is nothing to fold
16383 // into an immediate offset, so it's better to eliminate the unnecessary
16384 // addition for the lower 32 bits than to preserve the PTRADD.
16385 // Analogous to a fold in performAddCombine.
16386 if (VT == MVT::i64) {
16387 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16388 return Folded;
16389 }
16390
16391 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16392 return SDValue();
16393
16394 SDValue X = N0;
16395 SDValue Y = N1.getOperand(0);
16396 SDValue Z = N1.getOperand(1);
16397 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16398 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16399
16400 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16401 Y->isDivergent() != Z->isDivergent()) {
16402 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16403 // y are uniform and z isn't.
16404 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16405 // z are uniform and y isn't.
16406 // The goal is to push uniform operands up in the computation, so that they
16407 // can be handled with scalar operations. We can't use reassociateScalarOps
16408 // for this since it requires two identical commutative operations to
16409 // reassociate.
16410 if (Y->isDivergent())
16411 std::swap(Y, Z);
16412 // If both additions in the original were NUW, reassociation preserves that.
16413 SDNodeFlags ReassocFlags =
16414 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16415 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16416 DCI.AddToWorklist(UniformInner.getNode());
16417 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16418 }
16419
16420 return SDValue();
16421}
16422
16423SDValue SITargetLowering::performSubCombine(SDNode *N,
16424 DAGCombinerInfo &DCI) const {
16425 SelectionDAG &DAG = DCI.DAG;
16426 EVT VT = N->getValueType(0);
16427
16428 if (VT == MVT::i64) {
16429 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16430 return Folded;
16431 }
16432
16433 if (VT != MVT::i32)
16434 return SDValue();
16435
16436 SDLoc SL(N);
16437 SDValue LHS = N->getOperand(0);
16438 SDValue RHS = N->getOperand(1);
16439
16440 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16441 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16442 unsigned Opc = RHS.getOpcode();
16443 switch (Opc) {
16444 default:
16445 break;
16446 case ISD::ZERO_EXTEND:
16447 case ISD::SIGN_EXTEND:
16448 case ISD::ANY_EXTEND: {
16449 auto Cond = RHS.getOperand(0);
16450 // If this won't be a real VOPC output, we would still need to insert an
16451 // extra instruction anyway.
16452 if (!isBoolSGPR(Cond))
16453 break;
16454 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16455 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16457 return DAG.getNode(Opc, SL, VTList, Args);
16458 }
16459 }
16460
16461 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16462 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16463 if (!isNullConstant(LHS.getOperand(1)))
16464 return SDValue();
16465 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16466 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16467 }
16468 return SDValue();
16469}
16470
16471SDValue
16472SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16473 DAGCombinerInfo &DCI) const {
16474
16475 if (N->getValueType(0) != MVT::i32)
16476 return SDValue();
16477
16478 if (!isNullConstant(N->getOperand(1)))
16479 return SDValue();
16480
16481 SelectionDAG &DAG = DCI.DAG;
16482 SDValue LHS = N->getOperand(0);
16483
16484 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16485 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16486 unsigned LHSOpc = LHS.getOpcode();
16487 unsigned Opc = N->getOpcode();
16488 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16489 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16490 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16491 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16492 }
16493 return SDValue();
16494}
16495
16496SDValue SITargetLowering::performFAddCombine(SDNode *N,
16497 DAGCombinerInfo &DCI) const {
16498 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16499 return SDValue();
16500
16501 SelectionDAG &DAG = DCI.DAG;
16502 EVT VT = N->getValueType(0);
16503
16504 SDLoc SL(N);
16505 SDValue LHS = N->getOperand(0);
16506 SDValue RHS = N->getOperand(1);
16507
16508 // These should really be instruction patterns, but writing patterns with
16509 // source modifiers is a pain.
16510
16511 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16512 if (LHS.getOpcode() == ISD::FADD) {
16513 SDValue A = LHS.getOperand(0);
16514 if (A == LHS.getOperand(1)) {
16515 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16516 if (FusedOp != 0) {
16517 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16518 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16519 }
16520 }
16521 }
16522
16523 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16524 if (RHS.getOpcode() == ISD::FADD) {
16525 SDValue A = RHS.getOperand(0);
16526 if (A == RHS.getOperand(1)) {
16527 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16528 if (FusedOp != 0) {
16529 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16530 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16531 }
16532 }
16533 }
16534
16535 return SDValue();
16536}
16537
16538SDValue SITargetLowering::performFSubCombine(SDNode *N,
16539 DAGCombinerInfo &DCI) const {
16540 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16541 return SDValue();
16542
16543 SelectionDAG &DAG = DCI.DAG;
16544 SDLoc SL(N);
16545 EVT VT = N->getValueType(0);
16546 assert(!VT.isVector());
16547
16548 // Try to get the fneg to fold into the source modifier. This undoes generic
16549 // DAG combines and folds them into the mad.
16550 //
16551 // Only do this if we are not trying to support denormals. v_mad_f32 does
16552 // not support denormals ever.
16553 SDValue LHS = N->getOperand(0);
16554 SDValue RHS = N->getOperand(1);
16555 if (LHS.getOpcode() == ISD::FADD) {
16556 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16557 SDValue A = LHS.getOperand(0);
16558 if (A == LHS.getOperand(1)) {
16559 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16560 if (FusedOp != 0) {
16561 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16562 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16563
16564 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16565 }
16566 }
16567 }
16568
16569 if (RHS.getOpcode() == ISD::FADD) {
16570 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16571
16572 SDValue A = RHS.getOperand(0);
16573 if (A == RHS.getOperand(1)) {
16574 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16575 if (FusedOp != 0) {
16576 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16577 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16578 }
16579 }
16580 }
16581
16582 return SDValue();
16583}
16584
16585SDValue SITargetLowering::performFDivCombine(SDNode *N,
16586 DAGCombinerInfo &DCI) const {
16587 SelectionDAG &DAG = DCI.DAG;
16588 SDLoc SL(N);
16589 EVT VT = N->getValueType(0);
16590
16591 // fsqrt legality correlates to rsq availability.
16592 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16593 return SDValue();
16594
16595 SDValue LHS = N->getOperand(0);
16596 SDValue RHS = N->getOperand(1);
16597
16598 SDNodeFlags Flags = N->getFlags();
16599 SDNodeFlags RHSFlags = RHS->getFlags();
16600 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16601 !RHS->hasOneUse())
16602 return SDValue();
16603
16604 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16605 bool IsNegative = false;
16606 if (CLHS->isExactlyValue(1.0) ||
16607 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16608 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16609 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16610 if (RHS.getOpcode() == ISD::FSQRT) {
16611 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16612 SDValue Rsq =
16613 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16614 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16615 }
16616 }
16617 }
16618
16619 return SDValue();
16620}
16621
16622SDValue SITargetLowering::performFMulCombine(SDNode *N,
16623 DAGCombinerInfo &DCI) const {
16624 SelectionDAG &DAG = DCI.DAG;
16625 EVT VT = N->getValueType(0);
16626 EVT ScalarVT = VT.getScalarType();
16627 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
16628
16629 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16630 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16631 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16632 return SDValue();
16633 }
16634
16635 SDValue LHS = N->getOperand(0);
16636 SDValue RHS = N->getOperand(1);
16637
16638 // It is cheaper to realize i32 inline constants as compared against
16639 // materializing f16 or f64 (or even non-inline f32) values,
16640 // possible via ldexp usage, as shown below :
16641 //
16642 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16643 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16644 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
16645 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16646 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16647 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16648 if (!TrueNode)
16649 return SDValue();
16650 const ConstantFPSDNode *FalseNode =
16651 isConstOrConstSplatFP(RHS.getOperand(2));
16652 if (!FalseNode)
16653 return SDValue();
16654
16655 if (TrueNode->isNegative() != FalseNode->isNegative())
16656 return SDValue();
16657
16658 // For f32, only non-inline constants should be transformed.
16659 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16660 if (ScalarVT == MVT::f32 &&
16661 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16662 TII->isInlineConstant(FalseNode->getValueAPF()))
16663 return SDValue();
16664
16665 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16666 if (TrueNodeExpVal == INT_MIN)
16667 return SDValue();
16668 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16669 if (FalseNodeExpVal == INT_MIN)
16670 return SDValue();
16671
16672 SDLoc SL(N);
16673 SDValue SelectNode =
16674 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16675 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16676 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16677
16678 LHS = TrueNode->isNegative()
16679 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16680 : LHS;
16681
16682 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16683 }
16684
16685 return SDValue();
16686}
16687
16688SDValue SITargetLowering::performFMACombine(SDNode *N,
16689 DAGCombinerInfo &DCI) const {
16690 SelectionDAG &DAG = DCI.DAG;
16691 EVT VT = N->getValueType(0);
16692 SDLoc SL(N);
16693
16694 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16695 return SDValue();
16696
16697 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16698 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16699 SDValue Op1 = N->getOperand(0);
16700 SDValue Op2 = N->getOperand(1);
16701 SDValue FMA = N->getOperand(2);
16702
16703 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16704 Op2.getOpcode() != ISD::FP_EXTEND)
16705 return SDValue();
16706
16707 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16708 // regardless of the denorm mode setting. Therefore,
16709 // fp-contract is sufficient to allow generating fdot2.
16710 const TargetOptions &Options = DAG.getTarget().Options;
16711 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16712 (N->getFlags().hasAllowContract() &&
16713 FMA->getFlags().hasAllowContract())) {
16714 Op1 = Op1.getOperand(0);
16715 Op2 = Op2.getOperand(0);
16716 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16718 return SDValue();
16719
16720 SDValue Vec1 = Op1.getOperand(0);
16721 SDValue Idx1 = Op1.getOperand(1);
16722 SDValue Vec2 = Op2.getOperand(0);
16723
16724 SDValue FMAOp1 = FMA.getOperand(0);
16725 SDValue FMAOp2 = FMA.getOperand(1);
16726 SDValue FMAAcc = FMA.getOperand(2);
16727
16728 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16729 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16730 return SDValue();
16731
16732 FMAOp1 = FMAOp1.getOperand(0);
16733 FMAOp2 = FMAOp2.getOperand(0);
16734 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16736 return SDValue();
16737
16738 SDValue Vec3 = FMAOp1.getOperand(0);
16739 SDValue Vec4 = FMAOp2.getOperand(0);
16740 SDValue Idx2 = FMAOp1.getOperand(1);
16741
16742 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16743 // Idx1 and Idx2 cannot be the same.
16744 Idx1 == Idx2)
16745 return SDValue();
16746
16747 if (Vec1 == Vec2 || Vec3 == Vec4)
16748 return SDValue();
16749
16750 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16751 return SDValue();
16752
16753 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16754 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16755 DAG.getTargetConstant(0, SL, MVT::i1));
16756 }
16757 }
16758 return SDValue();
16759}
16760
16761SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16762 DAGCombinerInfo &DCI) const {
16763 SelectionDAG &DAG = DCI.DAG;
16764 SDLoc SL(N);
16765
16766 SDValue LHS = N->getOperand(0);
16767 SDValue RHS = N->getOperand(1);
16768 EVT VT = LHS.getValueType();
16769 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16770
16771 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16772 if (!CRHS) {
16774 if (CRHS) {
16775 std::swap(LHS, RHS);
16776 CC = getSetCCSwappedOperands(CC);
16777 }
16778 }
16779
16780 if (CRHS) {
16781 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16782 isBoolSGPR(LHS.getOperand(0))) {
16783 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16784 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16785 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16786 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16787 if ((CRHS->isAllOnes() &&
16788 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16789 (CRHS->isZero() &&
16790 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16791 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16792 DAG.getAllOnesConstant(SL, MVT::i1));
16793 if ((CRHS->isAllOnes() &&
16794 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16795 (CRHS->isZero() &&
16796 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16797 return LHS.getOperand(0);
16798 }
16799
16800 const APInt &CRHSVal = CRHS->getAPIntValue();
16801 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16802 LHS.getOpcode() == ISD::SELECT &&
16803 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16804 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16805 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16806 isBoolSGPR(LHS.getOperand(0))) {
16807 // Given CT != FT:
16808 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16809 // setcc (select cc, CT, CF), CF, ne => cc
16810 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16811 // setcc (select cc, CT, CF), CT, eq => cc
16812 const APInt &CT = LHS.getConstantOperandAPInt(1);
16813 const APInt &CF = LHS.getConstantOperandAPInt(2);
16814
16815 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16816 (CT == CRHSVal && CC == ISD::SETNE))
16817 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16818 DAG.getAllOnesConstant(SL, MVT::i1));
16819 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16820 (CT == CRHSVal && CC == ISD::SETEQ))
16821 return LHS.getOperand(0);
16822 }
16823 }
16824
16825 // Eliminate setcc by using carryout from add/sub instruction
16826
16827 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16828 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16829 // similarly for subtraction
16830
16831 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16832 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
16833
16834 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16836 (CC == ISD::SETUGT &&
16838 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16839 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16840 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16841
16842 SDValue Op0 = LHS.getOperand(0);
16843 SDValue Op1 = LHS.getOperand(1);
16844
16845 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16846 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16847
16848 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16849 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16850
16851 SDValue NodeLo =
16852 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16853 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16854
16855 SDValue CarryInHi = NodeLo.getValue(1);
16856 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16857 SL, DAG.getVTList(MVT::i32, MVT::i1),
16858 {Op0Hi, Op1Hi, CarryInHi});
16859
16860 SDValue ResultLo = NodeLo.getValue(0);
16861 SDValue ResultHi = NodeHi.getValue(0);
16862
16863 SDValue JoinedResult =
16864 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16865
16866 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16867 SDValue Overflow = NodeHi.getValue(1);
16868 DCI.CombineTo(LHS.getNode(), Result);
16869 return Overflow;
16870 }
16871
16872 if (VT != MVT::f32 && VT != MVT::f64 &&
16873 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16874 return SDValue();
16875
16876 // Match isinf/isfinite pattern
16877 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16878 // (fcmp one (fabs x), inf) -> (fp_class x,
16879 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16880 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16881 LHS.getOpcode() == ISD::FABS) {
16882 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16883 if (!CRHS)
16884 return SDValue();
16885
16886 const APFloat &APF = CRHS->getValueAPF();
16887 if (APF.isInfinity() && !APF.isNegative()) {
16888 const unsigned IsInfMask =
16890 const unsigned IsFiniteMask =
16894 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16895 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16896 DAG.getConstant(Mask, SL, MVT::i32));
16897 }
16898 }
16899
16900 return SDValue();
16901}
16902
16903SDValue
16904SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16905 DAGCombinerInfo &DCI) const {
16906 SelectionDAG &DAG = DCI.DAG;
16907 SDLoc SL(N);
16908 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16909
16910 SDValue Src = N->getOperand(0);
16911 SDValue Shift = N->getOperand(0);
16912
16913 // TODO: Extend type shouldn't matter (assuming legal types).
16914 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16915 Shift = Shift.getOperand(0);
16916
16917 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16918 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16919 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16920 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16921 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16922 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16923 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16924 SDValue Shifted = DAG.getZExtOrTrunc(
16925 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16926
16927 unsigned ShiftOffset = 8 * Offset;
16928 if (Shift.getOpcode() == ISD::SHL)
16929 ShiftOffset -= C->getZExtValue();
16930 else
16931 ShiftOffset += C->getZExtValue();
16932
16933 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16934 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16935 MVT::f32, Shifted);
16936 }
16937 }
16938 }
16939
16940 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16941 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16942 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16943 // We simplified Src. If this node is not dead, visit it again so it is
16944 // folded properly.
16945 if (N->getOpcode() != ISD::DELETED_NODE)
16946 DCI.AddToWorklist(N);
16947 return SDValue(N, 0);
16948 }
16949
16950 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16951 if (SDValue DemandedSrc =
16952 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16953 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16954
16955 return SDValue();
16956}
16957
16958SDValue SITargetLowering::performClampCombine(SDNode *N,
16959 DAGCombinerInfo &DCI) const {
16960 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16961 if (!CSrc)
16962 return SDValue();
16963
16964 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16965 const APFloat &F = CSrc->getValueAPF();
16966 APFloat Zero = APFloat::getZero(F.getSemantics());
16967 if (F < Zero ||
16968 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16969 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16970 }
16971
16972 APFloat One(F.getSemantics(), "1.0");
16973 if (F > One)
16974 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16975
16976 return SDValue(CSrc, 0);
16977}
16978
16979SDValue SITargetLowering::performSelectCombine(SDNode *N,
16980 DAGCombinerInfo &DCI) const {
16981
16982 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16983 // integer).
16984 // Detect when CMP and SELECT use the same constant and fold them to avoid
16985 // loading the constant twice. Specifically handles patterns like:
16986 // %cmp = icmp eq i32 %val, 4242
16987 // %sel = select i1 %cmp, i32 4242, i32 %other
16988 // It can be optimized to reuse %val instead of 4242 in select.
16989 SDValue Cond = N->getOperand(0);
16990 SDValue TrueVal = N->getOperand(1);
16991 SDValue FalseVal = N->getOperand(2);
16992
16993 // Check if condition is a comparison.
16994 if (Cond.getOpcode() != ISD::SETCC)
16995 return SDValue();
16996
16997 SDValue LHS = Cond.getOperand(0);
16998 SDValue RHS = Cond.getOperand(1);
16999 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17000
17001 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17002 bool isInteger = LHS.getValueType().isInteger();
17003
17004 // Handle simple floating-point and integer types only.
17005 if (!isFloatingPoint && !isInteger)
17006 return SDValue();
17007
17008 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17009 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17010 if (!isEquality && !isNonEquality)
17011 return SDValue();
17012
17013 SDValue ArgVal, ConstVal;
17014 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17015 (isInteger && isa<ConstantSDNode>(RHS))) {
17016 ConstVal = RHS;
17017 ArgVal = LHS;
17018 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17019 (isInteger && isa<ConstantSDNode>(LHS))) {
17020 ConstVal = LHS;
17021 ArgVal = RHS;
17022 } else {
17023 return SDValue();
17024 }
17025
17026 // Skip optimization for inlinable immediates.
17027 if (isFloatingPoint) {
17028 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17029 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17030 return SDValue();
17031 } else {
17033 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17034 return SDValue();
17035 }
17036
17037 // For equality and non-equality comparisons, patterns:
17038 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17039 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17040 if (!(isEquality && TrueVal == ConstVal) &&
17041 !(isNonEquality && FalseVal == ConstVal))
17042 return SDValue();
17043
17044 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17045 SDValue SelectRHS =
17046 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17047 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17048 SelectLHS, SelectRHS);
17049}
17050
17052 DAGCombinerInfo &DCI) const {
17053 switch (N->getOpcode()) {
17054 case ISD::ADD:
17055 case ISD::SUB:
17056 case ISD::SHL:
17057 case ISD::SRL:
17058 case ISD::SRA:
17059 case ISD::AND:
17060 case ISD::OR:
17061 case ISD::XOR:
17062 case ISD::MUL:
17063 case ISD::SETCC:
17064 case ISD::SELECT:
17065 case ISD::SMIN:
17066 case ISD::SMAX:
17067 case ISD::UMIN:
17068 case ISD::UMAX:
17069 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17070 return Res;
17071 break;
17072 default:
17073 break;
17074 }
17075
17076 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17077 return SDValue();
17078
17079 switch (N->getOpcode()) {
17080 case ISD::ADD:
17081 return performAddCombine(N, DCI);
17082 case ISD::PTRADD:
17083 return performPtrAddCombine(N, DCI);
17084 case ISD::SUB:
17085 return performSubCombine(N, DCI);
17086 case ISD::UADDO_CARRY:
17087 case ISD::USUBO_CARRY:
17088 return performAddCarrySubCarryCombine(N, DCI);
17089 case ISD::FADD:
17090 return performFAddCombine(N, DCI);
17091 case ISD::FSUB:
17092 return performFSubCombine(N, DCI);
17093 case ISD::FDIV:
17094 return performFDivCombine(N, DCI);
17095 case ISD::FMUL:
17096 return performFMulCombine(N, DCI);
17097 case ISD::SETCC:
17098 return performSetCCCombine(N, DCI);
17099 case ISD::SELECT:
17100 if (auto Res = performSelectCombine(N, DCI))
17101 return Res;
17102 break;
17103 case ISD::FMAXNUM:
17104 case ISD::FMINNUM:
17105 case ISD::FMAXNUM_IEEE:
17106 case ISD::FMINNUM_IEEE:
17107 case ISD::FMAXIMUM:
17108 case ISD::FMINIMUM:
17109 case ISD::FMAXIMUMNUM:
17110 case ISD::FMINIMUMNUM:
17111 case ISD::SMAX:
17112 case ISD::SMIN:
17113 case ISD::UMAX:
17114 case ISD::UMIN:
17115 case AMDGPUISD::FMIN_LEGACY:
17116 case AMDGPUISD::FMAX_LEGACY:
17117 return performMinMaxCombine(N, DCI);
17118 case ISD::FMA:
17119 return performFMACombine(N, DCI);
17120 case ISD::AND:
17121 return performAndCombine(N, DCI);
17122 case ISD::OR:
17123 return performOrCombine(N, DCI);
17124 case ISD::FSHR: {
17126 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17127 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17128 return matchPERM(N, DCI);
17129 }
17130 break;
17131 }
17132 case ISD::XOR:
17133 return performXorCombine(N, DCI);
17134 case ISD::ZERO_EXTEND:
17135 return performZeroExtendCombine(N, DCI);
17137 return performSignExtendInRegCombine(N, DCI);
17138 case AMDGPUISD::FP_CLASS:
17139 return performClassCombine(N, DCI);
17140 case ISD::FCANONICALIZE:
17141 return performFCanonicalizeCombine(N, DCI);
17142 case AMDGPUISD::RCP:
17143 return performRcpCombine(N, DCI);
17144 case ISD::FLDEXP:
17145 case AMDGPUISD::FRACT:
17146 case AMDGPUISD::RSQ:
17147 case AMDGPUISD::RCP_LEGACY:
17148 case AMDGPUISD::RCP_IFLAG:
17149 case AMDGPUISD::RSQ_CLAMP: {
17150 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17151 SDValue Src = N->getOperand(0);
17152 if (Src.isUndef())
17153 return Src;
17154 break;
17155 }
17156 case ISD::SINT_TO_FP:
17157 case ISD::UINT_TO_FP:
17158 return performUCharToFloatCombine(N, DCI);
17159 case ISD::FCOPYSIGN:
17160 return performFCopySignCombine(N, DCI);
17161 case AMDGPUISD::CVT_F32_UBYTE0:
17162 case AMDGPUISD::CVT_F32_UBYTE1:
17163 case AMDGPUISD::CVT_F32_UBYTE2:
17164 case AMDGPUISD::CVT_F32_UBYTE3:
17165 return performCvtF32UByteNCombine(N, DCI);
17166 case AMDGPUISD::FMED3:
17167 return performFMed3Combine(N, DCI);
17168 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17169 return performCvtPkRTZCombine(N, DCI);
17170 case AMDGPUISD::CLAMP:
17171 return performClampCombine(N, DCI);
17172 case ISD::SCALAR_TO_VECTOR: {
17173 SelectionDAG &DAG = DCI.DAG;
17174 EVT VT = N->getValueType(0);
17175
17176 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17177 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17178 SDLoc SL(N);
17179 SDValue Src = N->getOperand(0);
17180 EVT EltVT = Src.getValueType();
17181 if (EltVT != MVT::i16)
17182 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17183
17184 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17185 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17186 }
17187
17188 break;
17189 }
17191 return performExtractVectorEltCombine(N, DCI);
17193 return performInsertVectorEltCombine(N, DCI);
17194 case ISD::FP_ROUND:
17195 return performFPRoundCombine(N, DCI);
17196 case ISD::LOAD: {
17197 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17198 return Widened;
17199 [[fallthrough]];
17200 }
17201 default: {
17202 if (!DCI.isBeforeLegalize()) {
17203 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17204 return performMemSDNodeCombine(MemNode, DCI);
17205 }
17206
17207 break;
17208 }
17209 }
17210
17212}
17213
17214/// Helper function for adjustWritemask
17215static unsigned SubIdx2Lane(unsigned Idx) {
17216 switch (Idx) {
17217 default:
17218 return ~0u;
17219 case AMDGPU::sub0:
17220 return 0;
17221 case AMDGPU::sub1:
17222 return 1;
17223 case AMDGPU::sub2:
17224 return 2;
17225 case AMDGPU::sub3:
17226 return 3;
17227 case AMDGPU::sub4:
17228 return 4; // Possible with TFE/LWE
17229 }
17230}
17231
17232/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17233SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17234 SelectionDAG &DAG) const {
17235 unsigned Opcode = Node->getMachineOpcode();
17236
17237 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17238 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17239 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17240 return Node; // not implemented for D16
17241
17242 SDNode *Users[5] = {nullptr};
17243 unsigned Lane = 0;
17244 unsigned DmaskIdx =
17245 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17246 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17247 unsigned NewDmask = 0;
17248 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17249 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17250 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17251 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17252 unsigned TFCLane = 0;
17253 bool HasChain = Node->getNumValues() > 1;
17254
17255 if (OldDmask == 0) {
17256 // These are folded out, but on the chance it happens don't assert.
17257 return Node;
17258 }
17259
17260 unsigned OldBitsSet = llvm::popcount(OldDmask);
17261 // Work out which is the TFE/LWE lane if that is enabled.
17262 if (UsesTFC) {
17263 TFCLane = OldBitsSet;
17264 }
17265
17266 // Try to figure out the used register components
17267 for (SDUse &Use : Node->uses()) {
17268
17269 // Don't look at users of the chain.
17270 if (Use.getResNo() != 0)
17271 continue;
17272
17273 SDNode *User = Use.getUser();
17274
17275 // Abort if we can't understand the usage
17276 if (!User->isMachineOpcode() ||
17277 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17278 return Node;
17279
17280 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17281 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17282 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17283 // set, etc.
17284 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17285 if (Lane == ~0u)
17286 return Node;
17287
17288 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17289 if (UsesTFC && Lane == TFCLane) {
17290 Users[Lane] = User;
17291 } else {
17292 // Set which texture component corresponds to the lane.
17293 unsigned Comp;
17294 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17295 Comp = llvm::countr_zero(Dmask);
17296 Dmask &= ~(1 << Comp);
17297 }
17298
17299 // Abort if we have more than one user per component.
17300 if (Users[Lane])
17301 return Node;
17302
17303 Users[Lane] = User;
17304 NewDmask |= 1 << Comp;
17305 }
17306 }
17307
17308 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17309 bool NoChannels = !NewDmask;
17310 if (NoChannels) {
17311 if (!UsesTFC) {
17312 // No uses of the result and not using TFC. Then do nothing.
17313 return Node;
17314 }
17315 // If the original dmask has one channel - then nothing to do
17316 if (OldBitsSet == 1)
17317 return Node;
17318 // Use an arbitrary dmask - required for the instruction to work
17319 NewDmask = 1;
17320 }
17321 // Abort if there's no change
17322 if (NewDmask == OldDmask)
17323 return Node;
17324
17325 unsigned BitsSet = llvm::popcount(NewDmask);
17326
17327 // Check for TFE or LWE - increase the number of channels by one to account
17328 // for the extra return value
17329 // This will need adjustment for D16 if this is also included in
17330 // adjustWriteMask (this function) but at present D16 are excluded.
17331 unsigned NewChannels = BitsSet + UsesTFC;
17332
17333 int NewOpcode =
17334 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17335 assert(NewOpcode != -1 &&
17336 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17337 "failed to find equivalent MIMG op");
17338
17339 // Adjust the writemask in the node
17341 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17342 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17343 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17344
17345 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17346
17347 MVT ResultVT = NewChannels == 1
17348 ? SVT
17349 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17350 : NewChannels == 5 ? 8
17351 : NewChannels);
17352 SDVTList NewVTList =
17353 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17354
17355 MachineSDNode *NewNode =
17356 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17357
17358 if (HasChain) {
17359 // Update chain.
17360 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17361 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17362 }
17363
17364 if (NewChannels == 1) {
17365 assert(Node->hasNUsesOfValue(1, 0));
17366 SDNode *Copy =
17367 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17368 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17369 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17370 return nullptr;
17371 }
17372
17373 // Update the users of the node with the new indices
17374 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17375 SDNode *User = Users[i];
17376 if (!User) {
17377 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17378 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17379 if (i || !NoChannels)
17380 continue;
17381 } else {
17382 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17383 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17384 if (NewUser != User) {
17385 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17386 DAG.RemoveDeadNode(User);
17387 }
17388 }
17389
17390 switch (Idx) {
17391 default:
17392 break;
17393 case AMDGPU::sub0:
17394 Idx = AMDGPU::sub1;
17395 break;
17396 case AMDGPU::sub1:
17397 Idx = AMDGPU::sub2;
17398 break;
17399 case AMDGPU::sub2:
17400 Idx = AMDGPU::sub3;
17401 break;
17402 case AMDGPU::sub3:
17403 Idx = AMDGPU::sub4;
17404 break;
17405 }
17406 }
17407
17408 DAG.RemoveDeadNode(Node);
17409 return nullptr;
17410}
17411
17413 if (Op.getOpcode() == ISD::AssertZext)
17414 Op = Op.getOperand(0);
17415
17416 return isa<FrameIndexSDNode>(Op);
17417}
17418
17419/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17420/// with frame index operands.
17421/// LLVM assumes that inputs are to these instructions are registers.
17422SDNode *
17424 SelectionDAG &DAG) const {
17425 if (Node->getOpcode() == ISD::CopyToReg) {
17426 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17427 SDValue SrcVal = Node->getOperand(2);
17428
17429 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17430 // to try understanding copies to physical registers.
17431 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17432 SDLoc SL(Node);
17434 SDValue VReg = DAG.getRegister(
17435 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17436
17437 SDNode *Glued = Node->getGluedNode();
17438 SDValue ToVReg = DAG.getCopyToReg(
17439 Node->getOperand(0), SL, VReg, SrcVal,
17440 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17441 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17442 VReg, ToVReg.getValue(1));
17443 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17444 DAG.RemoveDeadNode(Node);
17445 return ToResultReg.getNode();
17446 }
17447 }
17448
17450 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17451 if (!isFrameIndexOp(Node->getOperand(i))) {
17452 Ops.push_back(Node->getOperand(i));
17453 continue;
17454 }
17455
17456 SDLoc DL(Node);
17457 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17458 Node->getOperand(i).getValueType(),
17459 Node->getOperand(i)),
17460 0));
17461 }
17462
17463 return DAG.UpdateNodeOperands(Node, Ops);
17464}
17465
17466/// Fold the instructions after selecting them.
17467/// Returns null if users were already updated.
17469 SelectionDAG &DAG) const {
17471 unsigned Opcode = Node->getMachineOpcode();
17472
17473 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17474 !TII->isGather4(Opcode) &&
17475 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17476 return adjustWritemask(Node, DAG);
17477 }
17478
17479 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17481 return Node;
17482 }
17483
17484 switch (Opcode) {
17485 case AMDGPU::V_DIV_SCALE_F32_e64:
17486 case AMDGPU::V_DIV_SCALE_F64_e64: {
17487 // Satisfy the operand register constraint when one of the inputs is
17488 // undefined. Ordinarily each undef value will have its own implicit_def of
17489 // a vreg, so force these to use a single register.
17490 SDValue Src0 = Node->getOperand(1);
17491 SDValue Src1 = Node->getOperand(3);
17492 SDValue Src2 = Node->getOperand(5);
17493
17494 if ((Src0.isMachineOpcode() &&
17495 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17496 (Src0 == Src1 || Src0 == Src2))
17497 break;
17498
17499 MVT VT = Src0.getValueType().getSimpleVT();
17500 const TargetRegisterClass *RC =
17501 getRegClassFor(VT, Src0.getNode()->isDivergent());
17502
17504 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17505
17506 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17507 Src0, SDValue());
17508
17509 // src0 must be the same register as src1 or src2, even if the value is
17510 // undefined, so make sure we don't violate this constraint.
17511 if (Src0.isMachineOpcode() &&
17512 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17513 if (Src1.isMachineOpcode() &&
17514 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17515 Src0 = Src1;
17516 else if (Src2.isMachineOpcode() &&
17517 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17518 Src0 = Src2;
17519 else {
17520 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17521 Src0 = UndefReg;
17522 Src1 = UndefReg;
17523 }
17524 } else
17525 break;
17526
17528 Ops[1] = Src0;
17529 Ops[3] = Src1;
17530 Ops[5] = Src2;
17531 Ops.push_back(ImpDef.getValue(1));
17532 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17533 }
17534 default:
17535 break;
17536 }
17537
17538 return Node;
17539}
17540
17541// Any MIMG instructions that use tfe or lwe require an initialization of the
17542// result register that will be written in the case of a memory access failure.
17543// The required code is also added to tie this init code to the result of the
17544// img instruction.
17547 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17548 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17549 MachineBasicBlock &MBB = *MI.getParent();
17550
17551 int DstIdx =
17552 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17553 unsigned InitIdx = 0;
17554
17555 if (TII->isImage(MI)) {
17556 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17557 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17558 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17559
17560 if (!TFE && !LWE) // intersect_ray
17561 return;
17562
17563 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17564 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17565 unsigned D16Val = D16 ? D16->getImm() : 0;
17566
17567 if (!TFEVal && !LWEVal)
17568 return;
17569
17570 // At least one of TFE or LWE are non-zero
17571 // We have to insert a suitable initialization of the result value and
17572 // tie this to the dest of the image instruction.
17573
17574 // Calculate which dword we have to initialize to 0.
17575 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17576
17577 // check that dmask operand is found.
17578 assert(MO_Dmask && "Expected dmask operand in instruction");
17579
17580 unsigned dmask = MO_Dmask->getImm();
17581 // Determine the number of active lanes taking into account the
17582 // Gather4 special case
17583 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17584
17585 bool Packed = !Subtarget->hasUnpackedD16VMem();
17586
17587 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17588
17589 // Abandon attempt if the dst size isn't large enough
17590 // - this is in fact an error but this is picked up elsewhere and
17591 // reported correctly.
17592 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17593
17594 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17595 if (DstSize < InitIdx)
17596 return;
17597 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17598 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17599 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17600 } else {
17601 return;
17602 }
17603
17604 const DebugLoc &DL = MI.getDebugLoc();
17605
17606 // Create a register for the initialization value.
17607 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17608 unsigned NewDst = 0; // Final initialized value will be in here
17609
17610 // If PRTStrictNull feature is enabled (the default) then initialize
17611 // all the result registers to 0, otherwise just the error indication
17612 // register (VGPRn+1)
17613 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17614 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17615
17616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17617 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17618 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17619 // Initialize dword
17620 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17621 // clang-format off
17622 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17623 .addImm(0);
17624 // clang-format on
17625 // Insert into the super-reg
17626 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17627 .addReg(PrevDst)
17628 .addReg(SubReg)
17630
17631 PrevDst = NewDst;
17632 }
17633
17634 // Add as an implicit operand
17635 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17636
17637 // Tie the just added implicit operand to the dst
17638 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17639}
17640
17641/// Assign the register class depending on the number of
17642/// bits set in the writemask
17644 SDNode *Node) const {
17646
17647 MachineFunction *MF = MI.getMF();
17649
17650 if (TII->isVOP3(MI.getOpcode())) {
17651 // Make sure constant bus requirements are respected.
17652 TII->legalizeOperandsVOP3(MRI, MI);
17653
17654 if (TII->isMAI(MI)) {
17655 // The ordinary src0, src1, src2 were legalized above.
17656 //
17657 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17658 // as a separate instruction.
17659 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17660 AMDGPU::OpName::scale_src0);
17661 if (Src0Idx != -1) {
17662 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17663 AMDGPU::OpName::scale_src1);
17664 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17665 TII->usesConstantBus(MRI, MI, Src1Idx))
17666 TII->legalizeOpWithMove(MI, Src1Idx);
17667 }
17668 }
17669
17670 return;
17671 }
17672
17673 if (TII->isImage(MI))
17674 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17675}
17676
17678 uint64_t Val) {
17679 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17680 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17681}
17682
17684 const SDLoc &DL,
17685 SDValue Ptr) const {
17687
17688 // Build the half of the subregister with the constants before building the
17689 // full 128-bit register. If we are building multiple resource descriptors,
17690 // this will allow CSEing of the 2-component register.
17691 const SDValue Ops0[] = {
17692 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17693 buildSMovImm32(DAG, DL, 0),
17694 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17695 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17696 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17697
17698 SDValue SubRegHi = SDValue(
17699 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17700
17701 // Combine the constants and the pointer.
17702 const SDValue Ops1[] = {
17703 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17704 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17705 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17706
17707 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17708}
17709
17710/// Return a resource descriptor with the 'Add TID' bit enabled
17711/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17712/// of the resource descriptor) to create an offset, which is added to
17713/// the resource pointer.
17715 SDValue Ptr, uint32_t RsrcDword1,
17716 uint64_t RsrcDword2And3) const {
17717 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17718 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17719 if (RsrcDword1) {
17720 PtrHi =
17721 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17722 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17723 0);
17724 }
17725
17726 SDValue DataLo =
17727 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17728 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17729
17730 const SDValue Ops[] = {
17731 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17732 PtrLo,
17733 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17734 PtrHi,
17735 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17736 DataLo,
17737 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17738 DataHi,
17739 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17740
17741 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17742}
17743
17744//===----------------------------------------------------------------------===//
17745// SI Inline Assembly Support
17746//===----------------------------------------------------------------------===//
17747
17748std::pair<unsigned, const TargetRegisterClass *>
17750 StringRef Constraint,
17751 MVT VT) const {
17752 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17753
17754 const TargetRegisterClass *RC = nullptr;
17755 if (Constraint.size() == 1) {
17756 // Check if we cannot determine the bit size of the given value type. This
17757 // can happen, for example, in this situation where we have an empty struct
17758 // (size 0): `call void asm "", "v"({} poison)`-
17759 if (VT == MVT::Other)
17760 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17761 const unsigned BitWidth = VT.getSizeInBits();
17762 switch (Constraint[0]) {
17763 default:
17764 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17765 case 's':
17766 case 'r':
17767 switch (BitWidth) {
17768 case 16:
17769 RC = &AMDGPU::SReg_32RegClass;
17770 break;
17771 case 64:
17772 RC = &AMDGPU::SGPR_64RegClass;
17773 break;
17774 default:
17776 if (!RC)
17777 return std::pair(0U, nullptr);
17778 break;
17779 }
17780 break;
17781 case 'v':
17782 switch (BitWidth) {
17783 case 1:
17784 return std::pair(0U, nullptr);
17785 case 16:
17786 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17787 : &AMDGPU::VGPR_32_Lo256RegClass;
17788 break;
17789 default:
17790 RC = Subtarget->has1024AddressableVGPRs()
17791 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17792 : TRI->getVGPRClassForBitWidth(BitWidth);
17793 if (!RC)
17794 return std::pair(0U, nullptr);
17795 break;
17796 }
17797 break;
17798 case 'a':
17799 if (!Subtarget->hasMAIInsts())
17800 break;
17801 switch (BitWidth) {
17802 case 1:
17803 return std::pair(0U, nullptr);
17804 case 16:
17805 RC = &AMDGPU::AGPR_32RegClass;
17806 break;
17807 default:
17808 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17809 if (!RC)
17810 return std::pair(0U, nullptr);
17811 break;
17812 }
17813 break;
17814 }
17815 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17816 const unsigned BitWidth = VT.getSizeInBits();
17817 switch (BitWidth) {
17818 case 16:
17819 RC = &AMDGPU::AV_32RegClass;
17820 break;
17821 default:
17822 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17823 if (!RC)
17824 return std::pair(0U, nullptr);
17825 break;
17826 }
17827 }
17828
17829 // We actually support i128, i16 and f16 as inline parameters
17830 // even if they are not reported as legal
17831 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17832 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17833 return std::pair(0U, RC);
17834
17835 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17836 if (Kind != '\0') {
17837 if (Kind == 'v') {
17838 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17839 } else if (Kind == 's') {
17840 RC = &AMDGPU::SGPR_32RegClass;
17841 } else if (Kind == 'a') {
17842 RC = &AMDGPU::AGPR_32RegClass;
17843 }
17844
17845 if (RC) {
17846 if (NumRegs > 1) {
17847 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17848 return std::pair(0U, nullptr);
17849
17850 uint32_t Width = NumRegs * 32;
17851 // Prohibit constraints for register ranges with a width that does not
17852 // match the required type.
17853 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17854 return std::pair(0U, nullptr);
17855
17856 MCRegister Reg = RC->getRegister(Idx);
17858 RC = TRI->getVGPRClassForBitWidth(Width);
17859 else if (SIRegisterInfo::isSGPRClass(RC))
17860 RC = TRI->getSGPRClassForBitWidth(Width);
17861 else if (SIRegisterInfo::isAGPRClass(RC))
17862 RC = TRI->getAGPRClassForBitWidth(Width);
17863 if (RC) {
17864 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17865 if (!Reg) {
17866 // The register class does not contain the requested register,
17867 // e.g., because it is an SGPR pair that would violate alignment
17868 // requirements.
17869 return std::pair(0U, nullptr);
17870 }
17871 return std::pair(Reg, RC);
17872 }
17873 }
17874
17875 // Check for lossy scalar/vector conversions.
17876 if (VT.isVector() && VT.getSizeInBits() != 32)
17877 return std::pair(0U, nullptr);
17878 if (Idx < RC->getNumRegs())
17879 return std::pair(RC->getRegister(Idx), RC);
17880 return std::pair(0U, nullptr);
17881 }
17882 }
17883
17884 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17885 if (Ret.first)
17886 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17887
17888 return Ret;
17889}
17890
17891static bool isImmConstraint(StringRef Constraint) {
17892 if (Constraint.size() == 1) {
17893 switch (Constraint[0]) {
17894 default:
17895 break;
17896 case 'I':
17897 case 'J':
17898 case 'A':
17899 case 'B':
17900 case 'C':
17901 return true;
17902 }
17903 } else if (Constraint == "DA" || Constraint == "DB") {
17904 return true;
17905 }
17906 return false;
17907}
17908
17911 if (Constraint.size() == 1) {
17912 switch (Constraint[0]) {
17913 default:
17914 break;
17915 case 's':
17916 case 'v':
17917 case 'a':
17918 return C_RegisterClass;
17919 }
17920 } else if (Constraint.size() == 2) {
17921 if (Constraint == "VA")
17922 return C_RegisterClass;
17923 }
17924 if (isImmConstraint(Constraint)) {
17925 return C_Other;
17926 }
17927 return TargetLowering::getConstraintType(Constraint);
17928}
17929
17930static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17932 Val = Val & maskTrailingOnes<uint64_t>(Size);
17933 }
17934 return Val;
17935}
17936
17938 StringRef Constraint,
17939 std::vector<SDValue> &Ops,
17940 SelectionDAG &DAG) const {
17941 if (isImmConstraint(Constraint)) {
17942 uint64_t Val;
17943 if (getAsmOperandConstVal(Op, Val) &&
17944 checkAsmConstraintVal(Op, Constraint, Val)) {
17945 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17946 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17947 }
17948 } else {
17950 }
17951}
17952
17954 unsigned Size = Op.getScalarValueSizeInBits();
17955 if (Size > 64)
17956 return false;
17957
17958 if (Size == 16 && !Subtarget->has16BitInsts())
17959 return false;
17960
17962 Val = C->getSExtValue();
17963 return true;
17964 }
17966 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17967 return true;
17968 }
17970 if (Size != 16 || Op.getNumOperands() != 2)
17971 return false;
17972 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17973 return false;
17974 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17975 Val = C->getSExtValue();
17976 return true;
17977 }
17978 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17979 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17980 return true;
17981 }
17982 }
17983
17984 return false;
17985}
17986
17988 uint64_t Val) const {
17989 if (Constraint.size() == 1) {
17990 switch (Constraint[0]) {
17991 case 'I':
17993 case 'J':
17994 return isInt<16>(Val);
17995 case 'A':
17996 return checkAsmConstraintValA(Op, Val);
17997 case 'B':
17998 return isInt<32>(Val);
17999 case 'C':
18000 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
18002 default:
18003 break;
18004 }
18005 } else if (Constraint.size() == 2) {
18006 if (Constraint == "DA") {
18007 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18008 int64_t LoBits = static_cast<int32_t>(Val);
18009 return checkAsmConstraintValA(Op, HiBits, 32) &&
18010 checkAsmConstraintValA(Op, LoBits, 32);
18011 }
18012 if (Constraint == "DB") {
18013 return true;
18014 }
18015 }
18016 llvm_unreachable("Invalid asm constraint");
18017}
18018
18020 unsigned MaxSize) const {
18021 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18022 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18023 if (Size == 16) {
18024 MVT VT = Op.getSimpleValueType();
18025 switch (VT.SimpleTy) {
18026 default:
18027 return false;
18028 case MVT::i16:
18029 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
18030 case MVT::f16:
18031 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
18032 case MVT::bf16:
18033 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
18034 case MVT::v2i16:
18035 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
18036 case MVT::v2f16:
18037 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
18038 case MVT::v2bf16:
18039 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
18040 }
18041 }
18042 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
18043 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
18044 return true;
18045 return false;
18046}
18047
18048static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18049 switch (UnalignedClassID) {
18050 case AMDGPU::VReg_64RegClassID:
18051 return AMDGPU::VReg_64_Align2RegClassID;
18052 case AMDGPU::VReg_96RegClassID:
18053 return AMDGPU::VReg_96_Align2RegClassID;
18054 case AMDGPU::VReg_128RegClassID:
18055 return AMDGPU::VReg_128_Align2RegClassID;
18056 case AMDGPU::VReg_160RegClassID:
18057 return AMDGPU::VReg_160_Align2RegClassID;
18058 case AMDGPU::VReg_192RegClassID:
18059 return AMDGPU::VReg_192_Align2RegClassID;
18060 case AMDGPU::VReg_224RegClassID:
18061 return AMDGPU::VReg_224_Align2RegClassID;
18062 case AMDGPU::VReg_256RegClassID:
18063 return AMDGPU::VReg_256_Align2RegClassID;
18064 case AMDGPU::VReg_288RegClassID:
18065 return AMDGPU::VReg_288_Align2RegClassID;
18066 case AMDGPU::VReg_320RegClassID:
18067 return AMDGPU::VReg_320_Align2RegClassID;
18068 case AMDGPU::VReg_352RegClassID:
18069 return AMDGPU::VReg_352_Align2RegClassID;
18070 case AMDGPU::VReg_384RegClassID:
18071 return AMDGPU::VReg_384_Align2RegClassID;
18072 case AMDGPU::VReg_512RegClassID:
18073 return AMDGPU::VReg_512_Align2RegClassID;
18074 case AMDGPU::VReg_1024RegClassID:
18075 return AMDGPU::VReg_1024_Align2RegClassID;
18076 case AMDGPU::AReg_64RegClassID:
18077 return AMDGPU::AReg_64_Align2RegClassID;
18078 case AMDGPU::AReg_96RegClassID:
18079 return AMDGPU::AReg_96_Align2RegClassID;
18080 case AMDGPU::AReg_128RegClassID:
18081 return AMDGPU::AReg_128_Align2RegClassID;
18082 case AMDGPU::AReg_160RegClassID:
18083 return AMDGPU::AReg_160_Align2RegClassID;
18084 case AMDGPU::AReg_192RegClassID:
18085 return AMDGPU::AReg_192_Align2RegClassID;
18086 case AMDGPU::AReg_256RegClassID:
18087 return AMDGPU::AReg_256_Align2RegClassID;
18088 case AMDGPU::AReg_512RegClassID:
18089 return AMDGPU::AReg_512_Align2RegClassID;
18090 case AMDGPU::AReg_1024RegClassID:
18091 return AMDGPU::AReg_1024_Align2RegClassID;
18092 default:
18093 return -1;
18094 }
18095}
18096
18097// Figure out which registers should be reserved for stack access. Only after
18098// the function is legalized do we know all of the non-spill stack objects or if
18099// calls are present.
18103 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18104 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18105 const SIInstrInfo *TII = ST.getInstrInfo();
18106
18107 if (Info->isEntryFunction()) {
18108 // Callable functions have fixed registers used for stack access.
18110 }
18111
18112 // TODO: Move this logic to getReservedRegs()
18113 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18114 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18115 Register SReg = ST.isWave32()
18116 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18117 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18118 &AMDGPU::SGPR_64RegClass);
18119 Info->setSGPRForEXECCopy(SReg);
18120
18121 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18122 Info->getStackPtrOffsetReg()));
18123 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18124 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18125
18126 // We need to worry about replacing the default register with itself in case
18127 // of MIR testcases missing the MFI.
18128 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18129 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18130
18131 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18132 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18133
18134 Info->limitOccupancy(MF);
18135
18136 if (ST.isWave32() && !MF.empty()) {
18137 for (auto &MBB : MF) {
18138 for (auto &MI : MBB) {
18139 TII->fixImplicitOperands(MI);
18140 }
18141 }
18142 }
18143
18144 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18145 // classes if required. Ideally the register class constraints would differ
18146 // per-subtarget, but there's no easy way to achieve that right now. This is
18147 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18148 // from using them as the register class for legal types.
18149 if (ST.needsAlignedVGPRs()) {
18150 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18151 const Register Reg = Register::index2VirtReg(I);
18152 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18153 if (!RC)
18154 continue;
18155 int NewClassID = getAlignedAGPRClassID(RC->getID());
18156 if (NewClassID != -1)
18157 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18158 }
18159 }
18160
18162}
18163
18165 KnownBits &Known,
18166 const APInt &DemandedElts,
18167 const SelectionDAG &DAG,
18168 unsigned Depth) const {
18169 Known.resetAll();
18170 unsigned Opc = Op.getOpcode();
18171 switch (Opc) {
18173 unsigned IID = Op.getConstantOperandVal(0);
18174 switch (IID) {
18175 case Intrinsic::amdgcn_mbcnt_lo:
18176 case Intrinsic::amdgcn_mbcnt_hi: {
18177 const GCNSubtarget &ST =
18179 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18180 // most 31 + src1.
18181 Known.Zero.setBitsFrom(
18182 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18183 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18184 Known = KnownBits::add(Known, Known2);
18185 return;
18186 }
18187 }
18188 break;
18189 }
18190 }
18192 Op, Known, DemandedElts, DAG, Depth);
18193}
18194
18196 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18198
18199 // Set the high bits to zero based on the maximum allowed scratch size per
18200 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18201 // calculation won't overflow, so assume the sign bit is never set.
18202 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18203}
18204
18206 GISelValueTracking &VT, KnownBits &Known,
18207 unsigned Dim) {
18208 unsigned MaxValue =
18209 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18210 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18211}
18212
18214 KnownBits &Known, const APInt &DemandedElts,
18215 unsigned BFEWidth, bool SExt, unsigned Depth) {
18217 const MachineOperand &Src1 = MI.getOperand(2);
18218
18219 unsigned Src1Cst = 0;
18220 if (Src1.isImm()) {
18221 Src1Cst = Src1.getImm();
18222 } else if (Src1.isReg()) {
18223 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18224 if (!Cst)
18225 return;
18226 Src1Cst = Cst->Value.getZExtValue();
18227 } else {
18228 return;
18229 }
18230
18231 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18232 // Width is always [22:16].
18233 const unsigned Offset =
18234 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18235 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18236
18237 if (Width >= BFEWidth) // Ill-formed.
18238 return;
18239
18240 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18241 Depth + 1);
18242
18243 Known = Known.extractBits(Width, Offset);
18244
18245 if (SExt)
18246 Known = Known.sext(BFEWidth);
18247 else
18248 Known = Known.zext(BFEWidth);
18249}
18250
18252 GISelValueTracking &VT, Register R, KnownBits &Known,
18253 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18254 unsigned Depth) const {
18255 Known.resetAll();
18256 const MachineInstr *MI = MRI.getVRegDef(R);
18257 switch (MI->getOpcode()) {
18258 case AMDGPU::S_BFE_I32:
18259 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18260 /*SExt=*/true, Depth);
18261 case AMDGPU::S_BFE_U32:
18262 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18263 /*SExt=*/false, Depth);
18264 case AMDGPU::S_BFE_I64:
18265 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18266 /*SExt=*/true, Depth);
18267 case AMDGPU::S_BFE_U64:
18268 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18269 /*SExt=*/false, Depth);
18270 case AMDGPU::G_INTRINSIC:
18271 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18272 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18273 switch (IID) {
18274 case Intrinsic::amdgcn_workitem_id_x:
18275 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18276 break;
18277 case Intrinsic::amdgcn_workitem_id_y:
18278 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18279 break;
18280 case Intrinsic::amdgcn_workitem_id_z:
18281 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18282 break;
18283 case Intrinsic::amdgcn_mbcnt_lo:
18284 case Intrinsic::amdgcn_mbcnt_hi: {
18285 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18286 // most 31 + src1.
18287 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18288 ? getSubtarget()->getWavefrontSizeLog2()
18289 : 5);
18290 KnownBits Known2;
18291 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18292 Depth + 1);
18293 Known = KnownBits::add(Known, Known2);
18294 break;
18295 }
18296 case Intrinsic::amdgcn_groupstaticsize: {
18297 // We can report everything over the maximum size as 0. We can't report
18298 // based on the actual size because we don't know if it's accurate or not
18299 // at any given point.
18300 Known.Zero.setHighBits(
18301 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18302 break;
18303 }
18304 }
18305 break;
18306 }
18307 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18308 Known.Zero.setHighBits(24);
18309 break;
18310 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18311 Known.Zero.setHighBits(16);
18312 break;
18313 case AMDGPU::G_AMDGPU_SMED3:
18314 case AMDGPU::G_AMDGPU_UMED3: {
18315 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18316
18317 KnownBits Known2;
18318 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18319 if (Known2.isUnknown())
18320 break;
18321
18322 KnownBits Known1;
18323 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18324 if (Known1.isUnknown())
18325 break;
18326
18327 KnownBits Known0;
18328 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18329 if (Known0.isUnknown())
18330 break;
18331
18332 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18333 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18334 Known.One = Known0.One & Known1.One & Known2.One;
18335 break;
18336 }
18337 }
18338}
18339
18342 unsigned Depth) const {
18343 const MachineInstr *MI = MRI.getVRegDef(R);
18344 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18345 // FIXME: Can this move to generic code? What about the case where the call
18346 // site specifies a lower alignment?
18347 Intrinsic::ID IID = GI->getIntrinsicID();
18349 AttributeList Attrs =
18350 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18351 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18352 return *RetAlign;
18353 }
18354 return Align(1);
18355}
18356
18359 const Align CacheLineAlign = Align(64);
18360
18361 // Pre-GFX10 target did not benefit from loop alignment
18362 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18363 getSubtarget()->hasInstFwdPrefetchBug())
18364 return PrefAlign;
18365
18366 // On GFX10 I$ is 4 x 64 bytes cache lines.
18367 // By default prefetcher keeps one cache line behind and reads two ahead.
18368 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18369 // behind and one ahead.
18370   // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
18371 // If loop fits 64 bytes it always spans no more than two cache lines and
18372 // does not need an alignment.
18373 // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
18374 // Else if loop is less or equal 192 bytes we need two lines behind.
18375
18377 const MachineBasicBlock *Header = ML->getHeader();
18378 if (Header->getAlignment() != PrefAlign)
18379 return Header->getAlignment(); // Already processed.
18380
18381 unsigned LoopSize = 0;
18382 for (const MachineBasicBlock *MBB : ML->blocks()) {
18383 // If inner loop block is aligned assume in average half of the alignment
18384 // size to be added as nops.
18385 if (MBB != Header)
18386 LoopSize += MBB->getAlignment().value() / 2;
18387
18388 for (const MachineInstr &MI : *MBB) {
18389 LoopSize += TII->getInstSizeInBytes(MI);
18390 if (LoopSize > 192)
18391 return PrefAlign;
18392 }
18393 }
18394
18395 if (LoopSize <= 64)
18396 return PrefAlign;
18397
18398 if (LoopSize <= 128)
18399 return CacheLineAlign;
18400
18401 // If any of parent loops is surrounded by prefetch instructions do not
18402 // insert new for inner loop, which would reset parent's settings.
18403 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18404 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18405 auto I = Exit->getFirstNonDebugInstr();
18406 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18407 return CacheLineAlign;
18408 }
18409 }
18410
18411 MachineBasicBlock *Pre = ML->getLoopPreheader();
18412 MachineBasicBlock *Exit = ML->getExitBlock();
18413
18414 if (Pre && Exit) {
18415 auto PreTerm = Pre->getFirstTerminator();
18416 if (PreTerm == Pre->begin() ||
18417 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18418 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18419 .addImm(1); // prefetch 2 lines behind PC
18420
18421 auto ExitHead = Exit->getFirstNonDebugInstr();
18422 if (ExitHead == Exit->end() ||
18423 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18424 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18425 .addImm(2); // prefetch 1 line behind PC
18426 }
18427
18428 return CacheLineAlign;
18429}
18430
18431[[maybe_unused]]
18432static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18433 assert(N->getOpcode() == ISD::CopyFromReg);
18434 do {
18435 // Follow the chain until we find an INLINEASM node.
18436 N = N->getOperand(0).getNode();
18437 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18438 return true;
18439 } while (N->getOpcode() == ISD::CopyFromReg);
18440 return false;
18441}
18442
18445 UniformityInfo *UA) const {
18446 switch (N->getOpcode()) {
18447 case ISD::CopyFromReg: {
18448 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18449 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18450 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18451 Register Reg = R->getReg();
18452
18453 // FIXME: Why does this need to consider isLiveIn?
18454 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18455 return !TRI->isSGPRReg(MRI, Reg);
18456
18457 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18458 return UA->isDivergent(V);
18459
18461 return !TRI->isSGPRReg(MRI, Reg);
18462 }
18463 case ISD::LOAD: {
18464 const LoadSDNode *L = cast<LoadSDNode>(N);
18465 unsigned AS = L->getAddressSpace();
18466 // A flat load may access private memory.
18468 }
18469 case ISD::CALLSEQ_END:
18470 return true;
18472 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18474 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18475 case AMDGPUISD::ATOMIC_CMP_SWAP:
18476 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18477 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18478 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18479 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18480 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18481 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18482 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18483 case AMDGPUISD::BUFFER_ATOMIC_AND:
18484 case AMDGPUISD::BUFFER_ATOMIC_OR:
18485 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18486 case AMDGPUISD::BUFFER_ATOMIC_INC:
18487 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18488 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18489 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18490 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18491 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18492 // Target-specific read-modify-write atomics are sources of divergence.
18493 return true;
18494 default:
18495 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18496 // Generic read-modify-write atomics are sources of divergence.
18497 return A->readMem() && A->writeMem();
18498 }
18499 return false;
18500 }
18501}
18502
18504 EVT VT) const {
18505 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18506 case MVT::f32:
18508 case MVT::f64:
18509 case MVT::f16:
18511 default:
18512 return false;
18513 }
18514}
18515
18517 LLT Ty, const MachineFunction &MF) const {
18518 switch (Ty.getScalarSizeInBits()) {
18519 case 32:
18520 return !denormalModeIsFlushAllF32(MF);
18521 case 64:
18522 case 16:
18523 return !denormalModeIsFlushAllF64F16(MF);
18524 default:
18525 return false;
18526 }
18527}
18528
18530 const APInt &DemandedElts,
18531 const SelectionDAG &DAG,
18532 bool SNaN,
18533 unsigned Depth) const {
18534 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18535 const MachineFunction &MF = DAG.getMachineFunction();
18537
18538 if (Info->getMode().DX10Clamp)
18539 return true; // Clamped to 0.
18540 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18541 }
18542
18544 DAG, SNaN, Depth);
18545}
18546
18547// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18548// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18550 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18551 return true;
18552
18553 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18554 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18555 if (DenormMode == DenormalMode::getPreserveSign())
18556 return true;
18557
18558 // TODO: Remove this.
18559 return RMW->getFunction()
18560 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18561 .getValueAsBool();
18562}
18563
18565 LLVMContext &Ctx = RMW->getContext();
18566 StringRef MemScope =
18567 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18568
18569 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18570 << "Hardware instruction generated for atomic "
18571 << RMW->getOperationName(RMW->getOperation())
18572 << " operation at memory scope " << MemScope;
18573}
18574
18575static bool isV2F16OrV2BF16(Type *Ty) {
18576 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18577 Type *EltTy = VT->getElementType();
18578 return VT->getNumElements() == 2 &&
18579 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18580 }
18581
18582 return false;
18583}
18584
/// \return true if \p Ty is a two-element fixed vector of half (f16).
18585 static bool isV2F16(Type *Ty) {
  // NOTE(review): the line defining VT (18586) is elided in this listing;
  // presumably it is dyn_cast<FixedVectorType>(Ty) — confirm against the
  // full source.
18587   return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18588 }
18589
/// \return true if \p Ty is a two-element fixed vector of bfloat (bf16).
18590 static bool isV2BF16(Type *Ty) {
  // NOTE(review): the line defining VT (18591) is elided in this listing;
  // presumably it is dyn_cast<FixedVectorType>(Ty) — confirm against the
  // full source.
18592   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18593 }
18594
18595/// \return true if atomicrmw integer ops work for the type.
18596static bool isAtomicRMWLegalIntTy(Type *Ty) {
18597 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18598 unsigned BW = IT->getBitWidth();
18599 return BW == 32 || BW == 64;
18600 }
18601
18602 return false;
18603}
18604
18605 /// \return true if this atomicrmw xchg type can be selected.
18606 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18607   Type *Ty = RMW->getType();
  // i32 / i64 xchg is always selectable.
18608   if (isAtomicRMWLegalIntTy(Ty))
18609     return true;
18610
  // Pointer xchg is legal when the pointer's in-memory width for its address
  // space is 32 or 64 bits.
18611   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18612     const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18613     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18614     return BW == 32 || BW == 64;
18615   }
18616
  // f32 / f64 can be treated as 32/64-bit xchg.
18617   if (Ty->isFloatTy() || Ty->isDoubleTy())
18618     return true;
  // Two-element vectors with 16-bit elements pack into 32 bits.
  // NOTE(review): the guard introducing VT (presumably a
  // dyn_cast<FixedVectorType>) is elided from this listing — confirm.
18620
18621     return VT->getNumElements() == 2 &&
18622            VT->getElementType()->getPrimitiveSizeInBits() == 16;
18623   }
18624
18625   return false;
18626 }
18627
18628 /// \returns true if it's valid to emit a native instruction for \p RMW, based
18629 /// on the properties of the target memory.
18630 static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18631                                         const AtomicRMWInst *RMW,
18632                                         bool HasSystemScope) {
18633   // The remote/fine-grained access logic is different from the integer
18634   // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18635   // fine-grained access does not work, even for a device local allocation.
18636   //
18637   // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18638   // allocations work.
18639   if (HasSystemScope) {
  // NOTE(review): the first half of this condition (line 18640) is elided in
  // this listing; presumably it tests agent-scope fine-grained remote-memory
  // atomic support on the subtarget — confirm against the full source.
18641         RMW->hasMetadata("amdgpu.no.remote.memory"))
18642       return true;
  // Emulated system-scope atomics are always usable at system scope.
18643     if (Subtarget.hasEmulatedSystemScopeAtomics())
18644       return true;
  // NOTE(review): line 18645 (a guard, plus the close of the HasSystemScope
  // block) is elided in this listing.
18646       return true;
18647
  // Otherwise the instruction is only safe when the memory is known not to
  // be fine-grained.
18648   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18649 }
18650
18651/// \return Action to perform on AtomicRMWInsts for integer operations.
18658
18659/// Return if a flat address space atomicrmw can access private memory.
18661 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18662 return !MD ||
18664}
18665
18673
18676 unsigned AS = RMW->getPointerAddressSpace();
18677 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18679
18680 // 64-bit flat atomics that dynamically reside in private memory will silently
18681 // be dropped.
18682 //
18683 // Note that we will emit a new copy of the original atomic in the expansion,
18684 // which will be incrementally relegalized.
18685 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18686 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18687 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18690
18691 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18693 ORE.emit([=]() {
18694 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18695 });
18696 return Kind;
18697 };
18698
18699 auto SSID = RMW->getSyncScopeID();
18700 bool HasSystemScope =
18701 SSID == SyncScope::System ||
18702 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18703
18704 auto Op = RMW->getOperation();
18705 switch (Op) {
18707 // PCIe supports add and xchg for system atomics.
18708 return isAtomicRMWLegalXChgTy(RMW)
18711 case AtomicRMWInst::Add:
18712 // PCIe supports add and xchg for system atomics.
18714 case AtomicRMWInst::Sub:
18715 case AtomicRMWInst::And:
18716 case AtomicRMWInst::Or:
18717 case AtomicRMWInst::Xor:
18718 case AtomicRMWInst::Max:
18719 case AtomicRMWInst::Min:
18726 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18728 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18731 auto *IT = dyn_cast<IntegerType>(RMW->getType());
18732 if (!IT || IT->getBitWidth() != 32)
18734 }
18735
18738 if (Subtarget->hasEmulatedSystemScopeAtomics())
18740
18741 // On most subtargets, for atomicrmw operations other than add/xchg,
18742 // whether or not the instructions will behave correctly depends on where
18743 // the address physically resides and what interconnect is used in the
18744     // system configuration. On some targets the instruction will nop,
18745 // and in others synchronization will only occur at degraded device scope.
18746 //
18747 // If the allocation is known local to the device, the instructions should
18748 // work correctly.
18749 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18751
18752 // If fine-grained remote memory works at device scope, we don't need to
18753 // do anything.
18754 if (!HasSystemScope &&
18755 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18757
18758 // If we are targeting a remote allocated address, it depends what kind of
18759 // allocation the address belongs to.
18760 //
18761 // If the allocation is fine-grained (in host memory, or in PCIe peer
18762 // device memory), the operation will fail depending on the target.
18763 //
18764 // Note fine-grained host memory access does work on APUs or if XGMI is
18765 // used, but we do not know if we are targeting an APU or the system
18766 // configuration from the ISA version/target-cpu.
18767 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18769
18772 // Atomic sub/or/xor do not work over PCI express, but atomic add
18773 // does. InstCombine transforms these with 0 to or, so undo that.
18774 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18775 ConstVal && ConstVal->isNullValue())
18777 }
18778
18779 // If the allocation could be in remote, fine-grained memory, the rmw
18780 // instructions may fail. cmpxchg should work, so emit that. On some
18781 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18782 // even work, so you're out of luck anyway.
18783
18784 // In summary:
18785 //
18786 // Cases that may fail:
18787 // - fine-grained pinned host memory
18788 // - fine-grained migratable host memory
18789 // - fine-grained PCIe peer device
18790 //
18791 // Cases that should work, but may be treated overly conservatively.
18792 // - fine-grained host memory on an APU
18793 // - fine-grained XGMI peer device
18795 }
18796
18798 }
18799 case AtomicRMWInst::FAdd: {
18800 Type *Ty = RMW->getType();
18801
18802 // TODO: Handle REGION_ADDRESS
18803 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18804 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18805 // is fixed to round-to-nearest-even.
18806 //
18807 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18808 // round-to-nearest-even.
18809 //
18810 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18811 // suggests it is OK if the floating-point mode may not match the calling
18812 // thread.
18813 if (Ty->isFloatTy()) {
18814 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18816 }
18817
18818 if (Ty->isDoubleTy()) {
18819 // Ignores denormal mode, but we don't consider flushing mandatory.
18820 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18822 }
18823
18824 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18826
18828 }
18829
18830 // LDS atomics respect the denormal mode from the mode register.
18831 //
18832 // Traditionally f32 global/buffer memory atomics would unconditionally
18833 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18834 // flush.
18835 //
18836 // On targets with flat atomic fadd, denormals would flush depending on
18837 // whether the target address resides in LDS or global memory. We consider
18838 // this flat-maybe-flush as will-flush.
18839 if (Ty->isFloatTy() &&
18840 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18843
18844 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18845 // safe. The message phrasing also should be better.
18846 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18847 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18848 // gfx942, gfx12
18849 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18850 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18851 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18852 // gfx90a, gfx942, gfx12
18853 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18854 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18855
18856 // gfx942, gfx12
18857 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18858 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18859 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18860 // gfx90a, gfx942, gfx12
18861 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18862 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18863
18864 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18865 // buffer. gfx12 does have the buffer version.
18866 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18867 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18868 }
18869
18870 // global and flat atomic fadd f64: gfx90a, gfx942.
18871 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18872 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18873
18874 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18875 if (Ty->isFloatTy()) {
18876 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18877 // gfx11+.
18878 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18879 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18880 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18881 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18882 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18883 } else {
18884 // gfx908
18885 if (RMW->use_empty() &&
18886 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18887 isV2F16(Ty))
18888 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18889 }
18890 }
18891
18892 // flat atomic fadd f32: gfx942, gfx11+.
18893 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18894 if (Subtarget->hasFlatAtomicFaddF32Inst())
18895 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18896
18897 // If it is in flat address space, and the type is float, we will try to
18898 // expand it, if the target supports global and lds atomic fadd. The
18899 // reason we need that is, in the expansion, we emit the check of
18900 // address space. If it is in global address space, we emit the global
18901 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18902 // fadd.
18903 if (Subtarget->hasLDSFPAtomicAddF32()) {
18904 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18906 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18908 }
18909 }
18910 }
18911
18913 }
18915 case AtomicRMWInst::FMax: {
18916 Type *Ty = RMW->getType();
18917
18918 // LDS float and double fmin/fmax were always supported.
18919 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18920 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18922 }
18923
18924 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18925 // For flat and global cases:
18926 // float, double in gfx7. Manual claims denormal support.
18927 // Removed in gfx8.
18928 // float, double restored in gfx10.
18929 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18930 //
18931 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18932 // no f32.
18933 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18934 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18935 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18936 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18937 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18938 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18940 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18941 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18942 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18943 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18944 }
18945 }
18946
18948 }
18951 default:
18953 }
18954
18955 llvm_unreachable("covered atomicrmw op switch");
18956}
18957
18964
18971
18974 const AtomicCmpXchgInst *CmpX) const {
18975 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18976 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18978
18979 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18981
18982 const DataLayout &DL = CmpX->getDataLayout();
18983
18984 Type *ValTy = CmpX->getNewValOperand()->getType();
18985
18986 // If a 64-bit flat atomic may alias private, we need to avoid using the
18987 // atomic in the private case.
18988 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18990}
18991
18992 const TargetRegisterClass *
18993 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  // NOTE(review): line 18994, which defines RC (presumably the base
  // TargetLoweringBase::getRegClassFor result for VT), is elided in this
  // listing — confirm against the full source.
18995   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  // Uniform lane-mask values (VReg_1) live in a wave-wide scalar register:
  // s64 on wave64, s32 on wave32.
18996   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18997     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18998                                  : &AMDGPU::SReg_32RegClass;
  // Uniform values may be held in SGPRs even if the default class is vector.
18999   if (!TRI->isSGPRClass(RC) && !isDivergent)
19000     return TRI->getEquivalentSGPRClass(RC);
  // Divergent values cannot live in SGPRs; use the VGPR class, or the AV
  // (VGPR-or-AGPR) class on subtargets with GFX90A instructions.
19001   if (TRI->isSGPRClass(RC) && isDivergent) {
19002     if (Subtarget->hasGFX90AInsts())
19003       return TRI->getEquivalentAVClass(RC);
19004     return TRI->getEquivalentVGPRClass(RC);
19005   }
19006
19007   return RC;
19008 }
19009
19010 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
19011 // uniform values (as produced by the mask results of control flow intrinsics)
19012 // used outside of divergent blocks. The phi users need to also be treated as
19013 // always uniform.
19014 //
19015 // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
/// \return true if \p V, or any transitive user of it, feeds a mask operand
/// of an AMDGPU control-flow intrinsic — i.e. it must be kept in a wave-wide
/// (\p WaveSize-bit) scalar register. \p Visited guards against revisiting
/// values reachable through multiple use chains.
19016 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19017                       unsigned WaveSize) {
19018   // FIXME: We assume we never cast the mask results of a control flow
19019   // intrinsic.
19020   // Early exit if the type won't be consistent as a compile time hack.
19021   IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19022   if (!IT || IT->getBitWidth() != WaveSize)
19023     return false;
19024
  // Only instructions can reach control-flow intrinsics through use chains.
19025   if (!isa<Instruction>(V))
19026     return false;
  // Skip values already examined (also breaks cycles through phis).
19027   if (!Visited.insert(V).second)
19028     return false;
19029   bool Result = false;
19030   for (const auto *U : V->users()) {
  // NOTE(review): line 19031 is elided in this listing; it presumably tests
  // whether U is an intrinsic call (binding 'Intrinsic'), with the 'else'
  // branch below handling non-intrinsic users — confirm against full source.
19032     if (V == U->getOperand(1)) {
  // Mask used as the second operand of the structurizer intrinsics.
19033       switch (Intrinsic->getIntrinsicID()) {
19034       default:
19035         Result = false;
19036         break;
19037       case Intrinsic::amdgcn_if_break:
19038       case Intrinsic::amdgcn_if:
19039       case Intrinsic::amdgcn_else:
19040         Result = true;
19041         break;
19042       }
19043     }
  // Mask used as the first operand.
19044     if (V == U->getOperand(0)) {
19045       switch (Intrinsic->getIntrinsicID()) {
19046       default:
19047         Result = false;
19048         break;
19049       case Intrinsic::amdgcn_end_cf:
19050       case Intrinsic::amdgcn_loop:
19051         Result = true;
19052         break;
19053       }
19054     }
  // Non-intrinsic users: recurse to see if the mask escapes into one.
19055     } else {
19056       Result = hasCFUser(U, Visited, WaveSize);
19057     }
19058     if (Result)
19059       break;
19060   }
19061   return Result;
19062 }
19063
19065 const Value *V) const {
19066 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
19067 if (CI->isInlineAsm()) {
19068 // FIXME: This cannot give a correct answer. This should only trigger in
19069 // the case where inline asm returns mixed SGPR and VGPR results, used
19070 // outside the defining block. We don't have a specific result to
19071 // consider, so this assumes if any value is SGPR, the overall register
19072 // also needs to be SGPR.
19073 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19075 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19076 for (auto &TC : TargetConstraints) {
19077 if (TC.Type == InlineAsm::isOutput) {
19079 const TargetRegisterClass *RC =
19080 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19081 TC.ConstraintVT)
19082 .second;
19083 if (RC && SIRI->isSGPRClass(RC))
19084 return true;
19085 }
19086 }
19087 }
19088 }
19090 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19091}
19092
19094 for (SDUse &Use : N->uses()) {
19096 if (getBasePtrIndex(M) == Use.getOperandNo())
19097 return true;
19098 }
19099 }
19100 return false;
19101}
19102
19104 SDValue N1) const {
19105 if (!N0.hasOneUse())
19106 return false;
19107 // Take care of the opportunity to keep N0 uniform
19108 if (N0->isDivergent() || !N1->isDivergent())
19109 return true;
19110 // Check if we have a good chance to form the memory access pattern with the
19111 // base and offset
19112 return (DAG.isBaseWithConstantOffset(N0) &&
19114}
19115
19117 Register N0, Register N1) const {
19118 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19119}
19120
19123 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19125 if (I.getMetadata("amdgpu.noclobber"))
19126 Flags |= MONoClobber;
19127 if (I.getMetadata("amdgpu.last.use"))
19128 Flags |= MOLastUse;
19129 return Flags;
19130}
19131
19133 Instruction *AI) const {
19134 // Given: atomicrmw fadd ptr %addr, float %val ordering
19135 //
19136 // With this expansion we produce the following code:
19137 // [...]
19138 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19139 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19140 //
19141 // atomicrmw.shared:
19142 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19143 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19144 // float %val ordering
19145 // br label %atomicrmw.phi
19146 //
19147 // atomicrmw.check.private:
19148 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19149 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19150 //
19151 // atomicrmw.private:
19152 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19153 // %loaded.private = load float, ptr addrspace(5) %cast.private
19154 // %val.new = fadd float %loaded.private, %val
19155 // store float %val.new, ptr addrspace(5) %cast.private
19156 // br label %atomicrmw.phi
19157 //
19158 // atomicrmw.global:
19159 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19160 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19161 // float %val ordering
19162 // br label %atomicrmw.phi
19163 //
19164 // atomicrmw.phi:
19165 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19166 // [ %loaded.private, %atomicrmw.private ],
19167 // [ %loaded.global, %atomicrmw.global ]
19168 // br label %atomicrmw.end
19169 //
19170 // atomicrmw.end:
19171 // [...]
19172 //
19173 //
19174 // For 64-bit atomics which may reside in private memory, we perform a simpler
19175 // version that only inserts the private check, and uses the flat operation.
19176
19177 IRBuilder<> Builder(AI);
19178 LLVMContext &Ctx = Builder.getContext();
19179
19180 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19181 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19183 Value *Addr = AI->getOperand(PtrOpIdx);
19184
19185 /// TODO: Only need to check private, then emit flat-known-not private (no
19186 /// need for shared block, or cast to global).
19188
19189 Align Alignment;
19190 if (RMW)
19191 Alignment = RMW->getAlign();
19192 else if (CX)
19193 Alignment = CX->getAlign();
19194 else
19195 llvm_unreachable("unhandled atomic operation");
19196
19197 // FullFlatEmulation is true if we need to issue the private, shared, and
19198 // global cases.
19199 //
19200 // If this is false, we are only dealing with the flat-targeting-private case,
19201 // where we only insert a check for private and still use the flat instruction
19202 // for global and shared.
19203
19204 bool FullFlatEmulation =
19205 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19206 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19207 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19208 RMW->getType()->isDoubleTy()));
19209
19210 // If the return value isn't used, do not introduce a false use in the phi.
19211 bool ReturnValueIsUsed = !AI->use_empty();
19212
19213 BasicBlock *BB = Builder.GetInsertBlock();
19214 Function *F = BB->getParent();
19215 BasicBlock *ExitBB =
19216 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19217 BasicBlock *SharedBB = nullptr;
19218
19219 BasicBlock *CheckPrivateBB = BB;
19220 if (FullFlatEmulation) {
19221 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19222 CheckPrivateBB =
19223 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19224 }
19225
19226 BasicBlock *PrivateBB =
19227 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19228 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19229 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19230
19231 std::prev(BB->end())->eraseFromParent();
19232 Builder.SetInsertPoint(BB);
19233
19234 Value *LoadedShared = nullptr;
19235 if (FullFlatEmulation) {
19236 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19237 {Addr}, nullptr, "is.shared");
19238 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19239 Builder.SetInsertPoint(SharedBB);
19240 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19242
19243 Instruction *Clone = AI->clone();
19244 Clone->insertInto(SharedBB, SharedBB->end());
19245 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19246 LoadedShared = Clone;
19247
19248 Builder.CreateBr(PhiBB);
19249 Builder.SetInsertPoint(CheckPrivateBB);
19250 }
19251
19252 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19253 {Addr}, nullptr, "is.private");
19254 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19255
19256 Builder.SetInsertPoint(PrivateBB);
19257
19258 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19260
19261 Value *LoadedPrivate;
19262 if (RMW) {
19263 LoadedPrivate = Builder.CreateAlignedLoad(
19264 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19265
19266 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19267 LoadedPrivate, RMW->getValOperand());
19268
19269 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19270 } else {
19271 auto [ResultLoad, Equal] =
19272 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19273 CX->getNewValOperand(), CX->getAlign());
19274
19275 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19276 ResultLoad, 0);
19277 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19278 }
19279
19280 Builder.CreateBr(PhiBB);
19281
19282 Builder.SetInsertPoint(GlobalBB);
19283
19284 // Continue using a flat instruction if we only emitted the check for private.
19285 Instruction *LoadedGlobal = AI;
19286 if (FullFlatEmulation) {
19287 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19289 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19290 }
19291
19292 AI->removeFromParent();
19293 AI->insertInto(GlobalBB, GlobalBB->end());
19294
19295 // The new atomicrmw may go through another round of legalization later.
19296 if (!FullFlatEmulation) {
19297 // We inserted the runtime check already, make sure we do not try to
19298 // re-expand this.
19299 // TODO: Should union with any existing metadata.
19300 MDBuilder MDB(F->getContext());
19301 MDNode *RangeNotPrivate =
19304 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19305 RangeNotPrivate);
19306 }
19307
19308 Builder.CreateBr(PhiBB);
19309
19310 Builder.SetInsertPoint(PhiBB);
19311
19312 if (ReturnValueIsUsed) {
19313 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19314 AI->replaceAllUsesWith(Loaded);
19315 if (FullFlatEmulation)
19316 Loaded->addIncoming(LoadedShared, SharedBB);
19317 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19318 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19319 Loaded->takeName(AI);
19320 }
19321
19322 Builder.CreateBr(ExitBB);
19323}
19324
19326 unsigned PtrOpIdx) {
19327 Value *PtrOp = I->getOperand(PtrOpIdx);
19330
19331 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19332 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19333 I->getIterator());
19334 I->setOperand(PtrOpIdx, ASCast);
19335}
19336
19339
19342
19345 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19346 ConstVal && ConstVal->isNullValue()) {
19347 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19349
19350 // We may still need the private-alias-flat handling below.
19351
19352 // TODO: Skip this for cases where we cannot access remote memory.
19353 }
19354 }
19355
19356 // The non-flat expansions should only perform the de-canonicalization of
19357 // identity values.
19359 return;
19360
19362}
19363
19370
19374
19376 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19377}
19378
19380 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19381 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19382
19384 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19385}
19386
19387LoadInst *
19389 IRBuilder<> Builder(AI);
19390 auto Order = AI->getOrdering();
19391
19392 // The optimization removes store aspect of the atomicrmw. Therefore, cache
19393 // must be flushed if the atomic ordering had a release semantics. This is
19394 // not necessary a fence, a release fence just coincides to do that flush.
19395 // Avoid replacing of an atomicrmw with a release semantics.
19396 if (isReleaseOrStronger(Order))
19397 return nullptr;
19398
19399 LoadInst *LI = Builder.CreateAlignedLoad(
19400 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19401 LI->setAtomic(Order, AI->getSyncScopeID());
19402 LI->copyMetadata(*AI);
19403 LI->takeName(AI);
19404 AI->replaceAllUsesWith(LI);
19405 AI->eraseFromParent();
19406 return LI;
19407}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1258
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1255
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1183
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5975
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1560
bool isNegative() const
Definition APFloat.h:1512
bool isNormal() const
Definition APFloat.h:1516
APInt bitcastToAPInt() const
Definition APFloat.h:1416
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1201
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1142
bool isInfinity() const
Definition APFloat.h:1509
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1400
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1394
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1648
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2762
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with FMUL to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:427
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid char code in the first entry if this is a valid physical register constraint, or 0 otherwise.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:818
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:787
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:778
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:852
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:746
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:992
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:974
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:664
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:786
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:795
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:969
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:703
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:810
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:887
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:804
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:996
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:832
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)