SIISelLowering.cpp
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/ModRef.h"
#include <optional>

using namespace llvm;
using namespace llvm::SDPatternMatch;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
    DisableLoopAlignment("amdgpu-disable-loop-alignment",
                         cl::desc("Do not align and prefetch loops"),
                         cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));

static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}
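// For example, if argument lowering has already allocated SGPR0..SGPR3, the
// scan above returns AMDGPU::SGPR4 as the first free input SGPR.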

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V32RegClass =
      TRI->getDefaultVectorSuperClassForBitWidth(32);
  addRegisterClass(MVT::f32, V32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const TargetRegisterClass *V64RegClass =
      TRI->getDefaultVectorSuperClassForBitWidth(64);

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));

  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));

  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
  addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));

  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
  addRegisterClass(MVT::v10f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(320));

  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
  addRegisterClass(MVT::v11f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(352));

  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
  addRegisterClass(MVT::v12f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(384));

  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v16f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(512));

  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));

  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
  addRegisterClass(MVT::v16f64,
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
    } else {
      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
    }

    // Unless there are also VOP3P operations, no 16-bit vector operations are
    // really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
  }

  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
  addRegisterClass(MVT::v32f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // The boolean content concept here is too inflexible. Compares only ever
  // really produce a 1-bit result. Any copy/extend from these will turn into a
  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
  // it's what most targets use.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  if (isTypeLegal(MVT::bf16)) {
    for (unsigned Opc :
         {/* ... */
          ISD::SETCC}) {
      setOperationAction(Opc, MVT::bf16, Promote);
    }

    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

    // We only need to custom lower because we can't specify an action for bf16
    // sources.
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
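  // For example, a truncating store of v2i32 as v2i16 is expanded here into an
  // explicit (trunc v2i32 -> v2i16) followed by a plain v2i16 store.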

  setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
  setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);

  setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
                     Expand);

#if 0
  setOperationAction({ISD::SADDO_CARRY, ISD::SSUBO_CARRY}, MVT::i32, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::UNDEF:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::SCALAR_TO_VECTOR:
      case ISD::IS_FPCLASS:
        break;
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::INSERT_SUBVECTOR:
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }
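  // In effect, e.g. a (v2i64 build_vector x, y) is rebuilt as a v4i32
  // build_vector of the 32-bit halves and bitcast back to v2i64.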

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
  }

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
  }

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
  }

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
                     Custom);

  if (Subtarget->hasPkMovB32()) {
    // TODO: 16-bit element vectors should be legal with even aligned elements.
    // TODO: Can be legal with wider source types than the result with
    // subregister extracts.
    setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
  }

  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
  // instead lower to cndmask in SITargetLowering::LowerSELECT().
  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
  // alignbit.
  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);

  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                     Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
                     Expand);

  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);

  // FIXME: This should be narrowed to i32, but that only happens if i64 is
  // illegal.
  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
  setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);

  // On SI this is s_memtime; on VI it is s_memrealtime.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (Subtarget->hasSMemRealTime() ||
      Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
  } else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
  }

  if (Subtarget->hasMadMacF32Insts())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  setHasExtractBitsInsn(true);
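  // For instance, (i32 ((x << 8) >> 24)) with a logical shift right extracts
  // bits [16..23] of x and can be selected to v_bfe_u32 x, 16, 8.
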
  // Clamp modifier on add/sub
  if (Subtarget->hasIntClamp())
    setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);

  if (Subtarget->hasAddNoCarryInsts())
    setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                       Legal);

  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
                     Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
                       Legal);
  else
    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
                       MVT::f64, Custom);

  setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
                     Legal);
  setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

  setOperationAction(/* ... */,
                     Custom);
  setOperationAction(/* ... */,
                     Custom);
  setOperationAction(/* ... */,
                     Custom);

  // Custom lower these because we can't specify a rule based on an illegal
  // source bf16.
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                        ISD::UMAX},
                       MVT::i16, Legal);

    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SDIVREM, ISD::UDIVREM},
                       MVT::i16, Expand);

    setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
                        ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
                        ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
                        ISD::CTPOP},
                       MVT::i16, Promote);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // BF16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                        ISD::FSIN, ISD::FROUND},
                       MVT::f16, Custom);

    // BF16 - VOP1 Actions.
    if (Subtarget->hasBF16TransInsts())
      setOperationAction(/* ... */, MVT::bf16, Custom);

    setOperationAction(/* ... */,
                       MVT::f16, Promote);
    setOperationAction(/* ... */,
                       MVT::bf16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
                       Expand);
    setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Legal);
    setOperationAction(ISD::FFREXP, MVT::f16, Custom);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
        case ISD::IS_FPCLASS:
          break;
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::CONCAT_VECTORS:
        case ISD::FSIN:
        case ISD::FCOS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // v_perm_b32 can handle either of these.
    setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v8i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::STORE, MVT::v16i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);

    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v4i32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v8i32, Expand);

    setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Subtarget->hasVOP3PInsts() ? Legal : Custom);

    setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
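    // e.g. (fneg (fabs v2f16 x)) then matches as a single (or x, 0x80008000),
    // which sets the sign bit of both halves.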

    // Can do this in one BFI plus a constant materialize.
    setOperationAction(ISD::FCOPYSIGN,
                       {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                        MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                        MVT::v32f16, MVT::v32bf16},
                       Custom);
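    // That is, fcopysign(x, y) becomes v_bfi_b32 0x7fff7fff, x, y per 32-bit
    // pair: magnitude bits are selected from x and sign bits from y.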

    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM, ISD::FMAXIMUMNUM,
                        ISD::FMINIMUMNUM},
                       MVT::f16, Custom);
    setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);

    setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Expand);

    for (MVT Vec16 :
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
      setOperationAction(
          {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
          Vec16, Custom);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
                        ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                        ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
                       MVT::v2i16, Legal);

    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
                        ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                       MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE,
                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                          ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
                          ISD::UADDSAT, ISD::SSUBSAT, ISD::SADDSAT,
                          ISD::USUBSAT},
                         VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                         VT, Custom);

    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
                       Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                       Custom);

    if (Subtarget->hasBF16PackedInsts()) {
      for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
        // Split vector operations.
        setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                           VT, Custom);
    }

    if (Subtarget->hasPackedFP32Ops()) {
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         MVT::v2f32, Legal);
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                         Custom);
    }
  }

  setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

    setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
  }

  setOperationAction(ISD::SELECT,
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
                     Custom);

  if (Subtarget->hasVectorMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Legal);
  else if (Subtarget->hasScalarSMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Custom);

  if (Subtarget->hasMad64_32())
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  } else {
    // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
    if (Subtarget->hasMinimum3Maximum3F32())
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);

    if (Subtarget->hasMinimum3Maximum3PKF16()) {
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);

      // If only the vector form is available, we need to widen to a vector.
      if (!Subtarget->hasMinimum3Maximum3F16())
        setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    // We want to break these into v2f16 pieces, not scalarize.
    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
  }

  if (Subtarget->hasIntMinMax64())
    setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
                       Legal);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                      MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

  // TODO: Could move this to custom lowering, could benefit from combines on
  // extract of relevant bits.
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);

  setOperationAction(ISD::MUL, MVT::i1, Promote);

  if (Subtarget->hasBF16ConversionInsts()) {
    setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
  }

  if (Subtarget->hasBF16PackedInsts()) {
    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM,
                        ISD::FMA},
                       MVT::v2bf16, Legal);
  }

  if (Subtarget->hasBF16TransInsts()) {
    setOperationAction(/* ... */, MVT::bf16, Custom);
  }

  if (Subtarget->hasCvtPkF16F32Inst()) {
    setOperationAction(ISD::FP_ROUND,
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
                       Custom);
  }

  setTargetDAGCombine({ISD::ADD,
                       ISD::PTRADD,
                       ISD::UADDO_CARRY,
                       ISD::SUB,
                       ISD::USUBO_CARRY,
                       ISD::MUL,
                       ISD::FADD,
                       ISD::FSUB,
                       ISD::FDIV,
                       ISD::FMUL,
                       ISD::FMINNUM,
                       ISD::FMAXNUM,
                       ISD::FMINNUM_IEEE,
                       ISD::FMAXNUM_IEEE,
                       ISD::FMINIMUM,
                       ISD::FMAXIMUM,
                       ISD::FMINIMUMNUM,
                       ISD::FMAXIMUMNUM,
                       ISD::FMA,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::SETCC,
                       ISD::SELECT,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::AND,
                       ISD::OR,
                       ISD::XOR,
                       ISD::SHL,
                       ISD::SRL,
                       ISD::SRA,
                       ISD::FSHR,
                       ISD::SIGN_EXTEND,
                       ISD::ZERO_EXTEND,
                       ISD::SIGN_EXTEND_INREG,
                       ISD::EXTRACT_VECTOR_ELT,
                       ISD::INSERT_VECTOR_ELT,
                       ISD::FCANONICALIZE,
                       ISD::SCALAR_TO_VECTOR,
                       ISD::ZERO_EXTEND_VECTOR_INREG,
                       ISD::SIGN_EXTEND_VECTOR_INREG});

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
    setTargetDAGCombine(ISD::FP_ROUND);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine({ISD::LOAD,
                       ISD::STORE,
                       ISD::ATOMIC_LOAD,
                       ISD::ATOMIC_STORE,
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_SWAP,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::INTRINSIC_VOID,
                       ISD::INTRINSIC_W_CHAIN});

  // FIXME: In other contexts we pretend this is a per-function property.
  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }

ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
  return RCRegs;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case when denormals are enabled that we don't
// currently handle, where this would still be OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return DestVT.getScalarType() == MVT::f32 &&
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
           SrcVT.getScalarType() == MVT::f16) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
           SrcVT.getScalarType() == MVT::bf16)) &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(DAG.getMachineFunction());
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16) {
      return Subtarget->has16BitInsts()
                 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
                 : MVT::i32;
    }

    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
    return MVT::i32;

  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
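// For example, with 16-bit instructions a v5f16 argument uses register type
// v2f16, while any 64-bit element type is passed in i32 pieces.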

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    // FIXME: Should probably promote 8-bit vectors to i16.
    if (Size == 16)
      return (NumElts + 1) / 2;

    if (Size <= 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
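// e.g. v5f16 -> (5 + 1) / 2 = 3 registers; v3i64 -> 3 * 2 = 6 registers.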

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16) {
      MVT SimpleIntermediateVT =
          MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
      IntermediateVT = SimpleIntermediateVT;
      RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
      NumIntermediates = (NumElts + 1) / 2;
      return (NumElts + 1) / 2;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
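// For example, a v7i16 argument is broken down into IntermediateVT = v2i16
// with NumIntermediates = 4, using v2i16 registers (or i32 without 16-bit
// instructions).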

static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  LLVMContext &Ctx = Ty->getContext();
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                            NumElts);
  }

  return TLI.getValueType(DL, Ty);
}

// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate type.
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}
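// e.g. an image load with TFE returns {<4 x float>, i32}; only the
// <4 x float> payload (further clamped by the dmask lane count) contributes
// to memVT.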

/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
/// in-memory representation. This return value is a custom type because there
/// is no MVT::i160 and adding one breaks integer promotion logic. While this
/// could cause issues during codegen, these address space 7 pointers will be
/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
/// in order to allow pre-codegen passes that query TargetTransformInfo, often
/// for cost modeling, to work. (This also sets us up decently for doing the
/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
    return MVT::amdgpuBufferFatPointer;
  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
  return AMDGPUTargetLowering::getPointerTy(DL, AS);
}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 160) ||
      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192))
    return MVT::v8i32;
  return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}

static unsigned getIntrMemWidth(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
    return 8;
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b32:
    return 32;
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b64:
    return 64;
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    return 128;
  default:
    llvm_unreachable("Unknown width");
  }
}
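// e.g. getIntrMemWidth(Intrinsic::amdgcn_global_load_async_to_lds_b64)
// returns 64, i.e. an 8-byte access.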

static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI,
                                                 unsigned ArgIdx) {
  Value *OrderingArg = CI.getArgOperand(ArgIdx);
  unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
  switch (AtomicOrderingCABI(Ord)) {
  case AtomicOrderingCABI::relaxed:
    return AtomicOrdering::Monotonic;
  case AtomicOrderingCABI::acquire:
    return AtomicOrdering::Acquire;
  case AtomicOrderingCABI::release:
    return AtomicOrdering::Release;
  case AtomicOrderingCABI::seq_cst:
    return AtomicOrdering::SequentiallyConsistent;
  default:
    llvm_unreachable("Invalid atomic ordering");
  }
}
1341
1342static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1343 MDNode *ScopeMD = cast<MDNode>(
1344 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1345 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1346 return CI.getContext().getOrInsertSyncScopeID(Scope);
1347}
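// e.g. a call that passes metadata !{!"workgroup"} for this argument yields
// the SyncScope ID registered for "workgroup".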

void SITargetLowering::getTgtMemIntrinsicInfos(
    SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &CI,
    MachineFunction &MF, unsigned IntrID) const {
  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Flags |= MachineMemOperand::MOInvariant;
  if (CI.hasMetadata(LLVMContext::MD_nontemporal))
    Flags |= MachineMemOperand::MONonTemporal;
  Flags |= getTargetMMOFlags(CI);

  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeSet Attr =
        Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
    MemoryEffects ME = Attr.getMemoryEffects();
    if (ME.doesNotAccessMemory())
      return;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (!IsSPrefetch) {
      auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
      if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
        Flags |= MachineMemOperand::MOVolatile;
    }

    IntrinsicInfo Info;
    // TODO: Should images get their own address space?
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
    if (RsrcIntr->IsImage) {
      const AMDGPU::ImageDimIntrinsicInfo *Intr =
          AMDGPU::getImageDimIntrinsicInfo(IntrID);
      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
      Info.align.reset();
    }

    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
        // We conservatively set the memory operand of a buffer intrinsic to the
        // base resource pointer, so that we can access alias information about
        // those pointers. Cases like "this points at the same value
        // but with a different offset" are handled in
        // areMemAccessesTriviallyDisjoint.
        Info.ptrVal = RsrcArg;
    }

    if (ME.onlyReadsMemory()) {
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
          unsigned DMask =
              cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        }

        Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
                                             CI.getType(), MaxNumLanes);
      } else {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
      }

      // FIXME: What does alignment mean for an image?
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.flags = Flags | MachineMemOperand::MOLoad;
    } else if (ME.onlyWritesMemory()) {
      Info.opc = ISD::INTRINSIC_VOID;

      Type *DataTy = CI.getArgOperand(0)->getType();
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
                                           DMaskLanes);
      } else
        Info.memVT = getValueType(MF.getDataLayout(), DataTy);

      Info.flags = Flags | MachineMemOperand::MOStore;
    } else {
      // Atomic, NoReturn Sampler or prefetch
      Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
                                          : ISD::INTRINSIC_W_CHAIN;

      switch (IntrID) {
      default:
        Info.flags = Flags | MachineMemOperand::MOLoad;
        if (!IsSPrefetch)
          Info.flags |= MachineMemOperand::MOStore;

        if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
          // Fake memory access type for no return sampler intrinsics
          Info.memVT = MVT::i32;
        } else {
          // XXX - Should this be volatile without known ordering?
          Info.flags |= MachineMemOperand::MOVolatile;
          Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
        }
        break;
      case Intrinsic::amdgcn_raw_buffer_load_lds:
      case Intrinsic::amdgcn_raw_buffer_load_async_lds:
      case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
      case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
      case Intrinsic::amdgcn_struct_buffer_load_lds:
      case Intrinsic::amdgcn_struct_buffer_load_async_lds:
      case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
      case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

        // Entry 0: Load from buffer.
        // Don't set an offset, since the pointer value always represents the
        // base of the buffer.
        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
        Info.flags = Flags | MachineMemOperand::MOLoad;
        Infos.push_back(Info);

        // Entry 1: Store to LDS.
        // Instruction offset is applied, and an additional per-lane offset
        // which we simulate using a larger memory type.
        Info.memVT = EVT::getIntegerVT(
            CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
        Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
        Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
                          ->getZExtValue();
        Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
        Info.flags = Flags | MachineMemOperand::MOStore;
        Infos.push_back(Info);
        return;
      }
      case Intrinsic::amdgcn_raw_atomic_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
      case Intrinsic::amdgcn_struct_atomic_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
        Info.flags = Flags | MachineMemOperand::MOLoad;
        Infos.push_back(Info);
        return;
      }
      }
    }
    Infos.push_back(Info);
    return;
  }

  IntrinsicInfo Info;
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = nullptr;
    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
    Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
                   ? ISD::INTRINSIC_W_CHAIN
                   : ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.memVT = MVT::i64;
    Info.size = 8;
    Info.align.reset();
    Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT =
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ? CI.getType()
                       : cast<StructType>(CI.getType())
                             ->getElementType(0)); // XXX: what is correct VT?

    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.align.reset();
    Info.flags = Flags | MachineMemOperand::MOLoad |
                 MachineMemOperand::MODereferenceable;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags =
        Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
        MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags = Flags | MachineMemOperand::MOLoad;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags = MachineMemOperand::MOLoad;
    Info.order = parseAtomicOrderingCABIArg(CI, 1);
    Info.ssid = parseSyncscopeMDArg(CI, 2);
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags = MachineMemOperand::MOLoad;
    Info.order = parseAtomicOrderingCABIArg(CI, 1);
    Info.ssid = parseSyncscopeMDArg(CI, 2);
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
    Info.ptrVal = CI.getArgOperand(0);
    Info.align.reset();
    Info.flags = MachineMemOperand::MOStore;
    Info.order = parseAtomicOrderingCABIArg(CI, 2);
    Info.ssid = parseSyncscopeMDArg(CI, 3);
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = Align(4);

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags = Flags | MachineMemOperand::MOLoad;
    else
      Info.flags = Flags | MachineMemOperand::MOStore;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
    // Entry 0: Load from source (global/flat).
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
    Info.ptrVal = CI.getArgOperand(0); // Global pointer
    Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
    Info.flags = Flags | MachineMemOperand::MOLoad;
    Infos.push_back(Info);

    // Entry 1: Store to LDS (same offset).
    Info.flags = Flags | MachineMemOperand::MOStore;
    Info.ptrVal = CI.getArgOperand(1); // LDS pointer
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
    // Entry 0: Load from LDS.
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
    Info.ptrVal = CI.getArgOperand(1); // LDS pointer
    Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
    Info.flags = Flags | MachineMemOperand::MOLoad;
    Infos.push_back(Info);

    // Entry 1: Store to global (same offset).
    Info.flags = Flags | MachineMemOperand::MOStore;
    Info.ptrVal = CI.getArgOperand(0); // Global pointer
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
    auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
    bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
    if (IsVolatile)
      Flags |= MachineMemOperand::MOVolatile;

    // Entry 0: Load from source (global/flat).
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
    Info.ptrVal = CI.getArgOperand(0); // Source pointer
    Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
    Info.flags = Flags | MachineMemOperand::MOLoad;
    Infos.push_back(Info);

    // Entry 1: Store to LDS.
    // Same offset from the instruction, but an additional per-lane offset is
    // added. Represent that using a wider memory type.
    Info.memVT = EVT::getIntegerVT(CI.getContext(),
                                   Width * 8 * Subtarget->getWavefrontSize());
    Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
    Info.flags = Flags | MachineMemOperand::MOStore;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = Align(4);

    Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    Infos.push_back(Info);
    return;
  }
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
    Info.ptrVal = CI.getArgOperand(0);
    Info.flags = Flags | MachineMemOperand::MOLoad;
    Infos.push_back(Info);
    return;
  }
  default:
    return;
  }
}

void SITargetLowering::CollectTargetIntrinsicOperands(
    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
  switch (I.getIntrinsicID()) {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    // The DAG's ValueType loses the addrspaces.
    // Add them as 2 extra Constant operands "from" and "to".
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
    break;
  }
  default:
    break;
  }
}

bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
                                            SmallVectorImpl<Value *> &Ops,
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
    break;
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
    break;
  default:
    return false;
  }
  AccessTy = II->getType();
  Ops.push_back(Ptr);
  return true;
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  decltype(SIInstrFlags::FLAT) FlatVariant =
      AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
      : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
                                               : SIInstrFlags::FLAT;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
  if (Subtarget->hasFlatGlobalInsts())
    return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    // Assume that we will use FLAT for all global memory accesses
    // on VI.
    // FIXME: This assumption is currently wrong. On VI we still use
    // MUBUF instructions for the r + i addressing mode. As currently
    // implemented, the MUBUF instructions only work on buffer < 4GB.
    // It may be possible to support > 4GB buffers with MUBUF instructions,
    // by setting the stride value in the resource descriptor which would
    // increase the size limit to (stride * 4GB). However, this is risky,
    // because it has never been validated.
    return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
  }

  return isLegalMUBUFAddressingMode(AM);
}
1898
1899bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1900 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1901 // additionally can do r + r + i with addr64. 32-bit has more addressing
1902 // mode options. Depending on the resource constant, it can also do
1903 // (i64 r0) + (i32 r1) * (i14 i).
1904 //
1905 // Private arrays end up using a scratch buffer most of the time, so also
1906 // assume those use MUBUF instructions. Scratch loads / stores are currently
1907 // implemented as mubuf instructions with offen bit set, so slightly
1908 // different than the normal addr64.
1909 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1910 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1911 return false;
1912
1913 // FIXME: Since we can split the immediate into an soffset and an immediate
1914 // offset, would it make sense to allow any immediate?
1915
1916 switch (AM.Scale) {
1917 case 0: // r + i or just i, depending on HasBaseReg.
1918 return true;
1919 case 1:
1920 return true; // We have r + r or r + i.
1921 case 2:
1922 if (AM.HasBaseReg) {
1923 // Reject 2 * r + r.
1924 return false;
1925 }
1926
1927 // Allow 2 * r as r + r,
1928 // and 2 * r + i as r + r + i.
1929 return true;
1930 default: // Don't allow n * r
1931 return false;
1932 }
1933}
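// Worked example for the Scale cases above (illustrative, not from the
// source): 'base + 2 * idx' (Scale == 2, no extra base register) is accepted
// because 2 * idx can be materialized as idx + idx, i.e. an r + r mode,
// whereas 'base + 2 * idx' with HasBaseReg set would need three register
// operands and is rejected.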
1934
1935bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1936 const AddrMode &AM, Type *Ty,
1937 unsigned AS,
1938 Instruction *I) const {
1939 // No global is ever allowed as a base.
1940 if (AM.BaseGV)
1941 return false;
1942
1943 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1944 return isLegalGlobalAddressingMode(AM);
1945
1946 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1947 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1948 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1949 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1950 // If the offset isn't a multiple of 4, it probably isn't going to be
1951 // correctly aligned.
1952 // FIXME: Can we get the real alignment here?
1953 if (AM.BaseOffs % 4 != 0)
1954 return isLegalMUBUFAddressingMode(AM);
1955
1956 if (!Subtarget->hasScalarSubwordLoads()) {
1957 // There are no SMRD extloads, so if we have to do a small type access we
1958 // will use a MUBUF load.
1959 // FIXME?: We also need to do this if unaligned, but we don't know the
1960 // alignment here.
1961 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1962 return isLegalGlobalAddressingMode(AM);
1963 }
1964
1965 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1966 // SMRD instructions have an 8-bit, dword offset on SI.
1967 if (!isUInt<8>(AM.BaseOffs / 4))
1968 return false;
1969 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1970 // On CI, this can also be a 32-bit literal constant offset. If it fits
1971 // in 8 bits, it can use a smaller encoding.
1972 if (!isUInt<32>(AM.BaseOffs / 4))
1973 return false;
1974 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1975 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1976 if (!isUInt<20>(AM.BaseOffs))
1977 return false;
1978 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1979 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1980 // for S_BUFFER_* instructions).
1981 if (!isInt<21>(AM.BaseOffs))
1982 return false;
1983 } else {
1984 // On GFX12, all offsets are signed 24-bit in bytes.
1985 if (!isInt<24>(AM.BaseOffs))
1986 return false;
1987 }
1988
1989 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1990 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1991 AM.BaseOffs < 0) {
1992 // Scalar (non-buffer) loads can only use a negative offset if
1993 // soffset+offset is non-negative. Since the compiler can only prove that
1994 // in a few special cases, it is safer to claim that negative offsets are
1995 // not supported.
1996 return false;
1997 }
1998
1999 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2000 return true;
2001
2002 if (AM.Scale == 1 && AM.HasBaseReg)
2003 return true;
2004
2005 return false;
2006 }
2007
2008 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2009 return Subtarget->hasFlatScratchEnabled()
2010 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
2011 : isLegalMUBUFAddressingMode(AM);
2012
2013 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2014 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2015 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2016 // field.
2017 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2018 // an 8-bit dword offset but we don't know the alignment here.
2019 if (!isUInt<16>(AM.BaseOffs))
2020 return false;
2021
2022 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2023 return true;
2024
2025 if (AM.Scale == 1 && AM.HasBaseReg)
2026 return true;
2027
2028 return false;
2029 }
2030
2031 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
2032 // For an unknown address space, this usually means that this is for some
2033 // reason being used for pure arithmetic, and not based on some addressing
2034 // computation. We don't have instructions that compute pointers with any
2035 // addressing modes, so treat them as having no offset like flat
2036 // instructions.
2037 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
2038 }
2039
2040 // Assume a user alias of global for unknown address spaces.
2041 return isLegalGlobalAddressingMode(AM);
2042}
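// Illustrative only: passes such as LoopStrengthReduce funnel candidate
// address expressions through this hook. For a global access of the form
// 'p + 16' the query is roughly (names assumed for the example):
//
//   TargetLoweringBase::AddrMode AM;
//   AM.HasBaseReg = true;
//   AM.BaseOffs = 16;
//   bool Legal = TLI.isLegalAddressingMode(DL, AM, AccessTy,
//                                          AMDGPUAS::GLOBAL_ADDRESS);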
2043
2044bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2045 const MachineFunction &MF) const {
2046 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2047 return (MemVT.getSizeInBits() <= 4 * 32);
2048 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2049 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2050 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2051 }
2052 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
2053 return (MemVT.getSizeInBits() <= 2 * 32);
2054 return true;
2055}
2056
2057bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
2058 unsigned Size, unsigned AddrSpace, Align Alignment,
2059 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2060 if (IsFast)
2061 *IsFast = 0;
2062
2063 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2064 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2065 // Check if alignment requirements for ds_read/write instructions are
2066 // disabled.
2067 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2068 return false;
2069
2070 Align RequiredAlignment(
2071 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2072 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2073 Alignment < RequiredAlignment)
2074 return false;
2075
2076 // Either the alignment requirements are "enabled", or there is an
2077 // unaligned-LDS-access-related hardware bug even though the alignment
2078 // requirements are "disabled". In either case, we need to check for
2079 // proper alignment requirements.
2080 //
2081 switch (Size) {
2082 case 64:
2083 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2084 // address is negative, then the instruction is incorrectly treated as
2085 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2086 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2087 // load later in the SILoadStoreOptimizer.
2088 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2089 return false;
2090
2091 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2092 // can do a 4-byte aligned, 8-byte access in a single operation using
2093 // ds_read2/write2_b32 with adjacent offsets.
2094 RequiredAlignment = Align(4);
2095
2096 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2097 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2098 // ds_write2_b32 depending on the alignment. In either case with either
2099 // alignment there is no faster way of doing this.
2100
2101 // The numbers returned here and below are not additive; they form a 'speed
2102 // rank'. They are only meant to be compared to decide if a certain way
2103 // of lowering an operation is faster than another. For that purpose a
2104 // naturally aligned operation gets its bitsize to indicate that "it
2105 // operates with a speed comparable to an N-bit wide load". With full
2106 // alignment ds128 is slower than ds96, for example. If underaligned it
2107 // is comparable to the speed of a single dword access, which would then
2108 // mean 32 < 128 and it is faster to issue a wide load regardless.
2109 // 1 simply means "slow, don't do it"; i.e. when comparing an aligned load
2110 // to a wider load that will no longer be aligned, the latter is slower.
2111 if (IsFast)
2112 *IsFast = (Alignment >= RequiredAlignment) ? 64
2113 : (Alignment < Align(4)) ? 32
2114 : 1;
2115 return true;
2116 }
2117
2118 break;
2119 case 96:
2120 if (!Subtarget->hasDS96AndDS128())
2121 return false;
2122
2123 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2124 // gfx8 and older.
2125
2126 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2127 // Naturally aligned access is fastest. However, also report it as Fast
2128 // if memory is aligned less than DWORD. A narrow load or store will be
2129 // equally slow as a single ds_read_b96/ds_write_b96, but there will
2130 // be more of them, so overall we will pay less penalty issuing a single
2131 // instruction.
2132
2133 // See comment on the values above.
2134 if (IsFast)
2135 *IsFast = (Alignment >= RequiredAlignment) ? 96
2136 : (Alignment < Align(4)) ? 32
2137 : 1;
2138 return true;
2139 }
2140
2141 break;
2142 case 128:
2143 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2144 return false;
2145
2146 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2147 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2148 // single operation using ds_read2/write2_b64.
2149 RequiredAlignment = Align(8);
2150
2151 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2152 // Naturally aligned access is fastest. However, also report it as Fast
2153 // if memory is aligned less than DWORD. A narrow load or store will be
2154 // equally slow as a single ds_read_b128/ds_write_b128, but there
2155 // will be more of them, so overall we will pay less penalty issuing a
2156 // single instruction.
2157
2158 // See comment on the values above.
2159 if (IsFast)
2160 *IsFast = (Alignment >= RequiredAlignment) ? 128
2161 : (Alignment < Align(4)) ? 32
2162 : 1;
2163 return true;
2164 }
2165
2166 break;
2167 default:
2168 if (Size > 32)
2169 return false;
2170
2171 break;
2172 }
2173
2174 // See comment on the values above.
2175 // Note that we have a single-dword or sub-dword here, so if underaligned
2176 // it is the slowest possible access, hence the returned value is 0.
2177 if (IsFast)
2178 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2179
2180 return Alignment >= RequiredAlignment ||
2181 Subtarget->hasUnalignedDSAccessEnabled();
2182 }
2183
2184 // FIXME: We have to be conservative here and assume that flat operations
2185 // will access scratch. If we had access to the IR function, then we
2186 // could determine if any private memory was used in the function.
2187 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2188 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2189 bool AlignedBy4 = Alignment >= Align(4);
2190 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2191 if (IsFast)
2192 *IsFast = AlignedBy4 ? Size : 1;
2193 return true;
2194 }
2195
2196 if (IsFast)
2197 *IsFast = AlignedBy4;
2198
2199 return AlignedBy4;
2200 }
2201
2202 // So long as they are correct, wide global memory operations perform better
2203 // than multiple smaller memory ops -- even when misaligned
2204 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2205 if (IsFast)
2206 *IsFast = Size;
2207
2208 return Alignment >= Align(4) ||
2209 Subtarget->hasUnalignedBufferAccessEnabled();
2210 }
2211
2212 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2213 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2214 // out-of-bounds behavior, but in the edge case where an access starts
2215 // out-of-bounds and then enters in-bounds, the entire access would be treated
2216 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2217 // natural alignment of buffer accesses.
2218 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2219 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2220 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2221 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2222 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2223 return false;
2224 }
2225
2226 // Values smaller than a dword must be aligned.
2227 if (Size < 32)
2228 return false;
2229
2230 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2231 // byte-address are ignored, thus forcing Dword alignment.
2232 // This applies to private, global, and constant memory.
2233 if (IsFast)
2234 *IsFast = 1;
2235
2236 return Size >= 32 && Alignment >= Align(4);
2237}
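// Example of interpreting the 'speed rank' (a sketch, not upstream code):
// for a 16-byte LDS access at align(8) on a subtarget with unaligned DS
// access enabled (and without the WGP-mode LDS bug), the query returns true
// and reports rank 128, since ds_read2_b64 handles it in one operation:
//
//   unsigned Fast = 0;
//   bool OK = TLI.allowsMisalignedMemoryAccessesImpl(
//       128, AMDGPUAS::LOCAL_ADDRESS, Align(8),
//       MachineMemOperand::MONone, &Fast); // OK == true, Fast == 128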
2238
2239bool SITargetLowering::allowsMisalignedMemoryAccesses(
2240 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2241 unsigned *IsFast) const {
2242 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2243 Alignment, Flags, IsFast);
2244}
2245
2246EVT SITargetLowering::getOptimalMemOpType(
2247 LLVMContext &Context, const MemOp &Op,
2248 const AttributeList &FuncAttributes) const {
2249 // FIXME: Should account for address space here.
2250
2251 // The default fallback uses the private pointer size as a guess for a type to
2252 // use. Make sure we switch these to 64-bit accesses.
2253
2254 if (Op.size() >= 16 &&
2255 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2256 return MVT::v4i32;
2257
2258 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2259 return MVT::v2i32;
2260
2261 // Use the default.
2262 return MVT::Other;
2263}
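// Usage sketch (assumptions, not upstream code): for a 32-byte memcpy whose
// destination is 4-byte aligned, this hook steers the generic expansion
// toward v4i32 chunks:
//
//   MemOp Op = MemOp::Copy(/*Size=*/32, /*DstAlignCanChange=*/false,
//                          /*DstAlign=*/Align(4), /*SrcAlign=*/Align(4),
//                          /*IsVolatile=*/false);
//   EVT VT = TLI.getOptimalMemOpType(Ctx, Op, F.getAttributes());
//   // VT == MVT::v4i32 here; an 8..15 byte copy would yield MVT::v2i32.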
2264
2265bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2266 const MemSDNode *MemNode = cast<MemSDNode>(N);
2267 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2268}
2269
2274
2275bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2276 unsigned DestAS) const {
2277 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2278 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2279 Subtarget->hasGloballyAddressableScratch()) {
2280 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2281 return false;
2282 }
2283
2284 // Flat -> private/local is a simple truncate.
2285 // Flat -> global is no-op
2286 return true;
2287 }
2288
2289 const GCNTargetMachine &TM =
2290 static_cast<const GCNTargetMachine &>(getTargetMachine());
2291 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2292}
2293
2301
2302bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2303 Type *Ty) const {
2304 // FIXME: Could be smarter if called for vector constants.
2305 return true;
2306}
2307
2308bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2309 unsigned Index) const {
2310 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2311 return false;
2312
2313 // TODO: Add more cases that are cheap.
2314 return Index == 0;
2315}
2316
2317bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2318 // TODO: This should be more aggressive, particularly for 16-bit element
2319 // vectors. However, there are some mixed improvements and regressions.
2320 EVT EltTy = VT.getVectorElementType();
2321 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2322 return EltTy.getSizeInBits() % MinAlign == 0;
2323}
2324
2325bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2326 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2327 switch (Op) {
2328 case ISD::LOAD:
2329 case ISD::STORE:
2330 return true;
2331 default:
2332 return false;
2333 }
2334 }
2335
2336 // SimplifySetCC uses this function to determine whether or not it should
2337 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2338 if (VT == MVT::i1 && Op == ISD::SETCC)
2339 return false;
2340
2340
2341 return TargetLowering::isTypeDesirableForOp(Op, VT);
2342}
2343
2346 // This isn't really a constant pool but close enough.
2349 return PtrInfo;
2350}
2351
2352SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2353 const SDLoc &SL,
2354 SDValue Chain,
2355 uint64_t Offset) const {
2356 const DataLayout &DL = DAG.getDataLayout();
2357 MachineFunction &MF = DAG.getMachineFunction();
2358 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2359 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2360
2361 auto [InputPtrReg, RC, ArgTy] =
2362 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2363
2364 // We may not have the kernarg segment argument if we have no kernel
2365 // arguments.
2366 if (!InputPtrReg)
2367 return DAG.getConstant(Offset, SL, PtrVT);
2368
2370 SDValue BasePtr = DAG.getCopyFromReg(
2371 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2372
2373 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2374}
2375
2376SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2377 const SDLoc &SL) const {
2378 uint64_t Offset =
2379 getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT);
2380 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2381}
2382
2383SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2384 const SDLoc &SL) const {
2385
2386 MachineFunction &MF = DAG.getMachineFunction();
2387 std::optional<uint32_t> KnownSize =
2388 AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
2389 if (KnownSize.has_value())
2390 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2391 return SDValue();
2392}
2393
2394SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2395 const SDLoc &SL, SDValue Val,
2396 bool Signed,
2397 const ISD::InputArg *Arg) const {
2398 // First, if it is a widened vector, narrow it.
2399 if (VT.isVector() &&
2401 EVT NarrowedVT =
2404 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2405 DAG.getConstant(0, SL, MVT::i32));
2406 }
2407
2408 // Then convert the vector elements or scalar value.
2409 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2410 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2411 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2412 }
2413
2414 if (MemVT.isFloatingPoint()) {
2415 if (VT.isFloatingPoint()) {
2416 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2417 } else {
2418 assert(!MemVT.isVector());
2419 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2420 SDValue Cast = DAG.getBitcast(IntVT, Val);
2421 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2422 }
2423 } else if (Signed)
2424 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2425 else
2426 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2427
2428 return Val;
2429}
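// Worked example (illustrative): an i1 argument widened by the ABI to
// MemVT = i8 arrives as an i8 value; with Arg->Flags.isZExt() set and
// VT = i1 < MemVT, the code above wraps it in AssertZext(i8, ValueType:i1)
// and the final getZExtOrTrunc truncates back to i1, letting later combines
// rely on the high bits being zero.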
2430
2431SDValue SITargetLowering::lowerKernargMemParameter(
2432 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2433 uint64_t Offset, Align Alignment, bool Signed,
2434 const ISD::InputArg *Arg) const {
2435
2436 MachinePointerInfo PtrInfo =
2438
2439 // Try to avoid using an extload by loading earlier than the argument address,
2440 // and extracting the relevant bits. The load should hopefully be merged with
2441 // the previous argument.
2442 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2443 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2444 int64_t AlignDownOffset = alignDown(Offset, 4);
2445 int64_t OffsetDiff = Offset - AlignDownOffset;
2446
2447 EVT IntVT = MemVT.changeTypeToInteger();
2448
2449 // TODO: If we passed in the base kernel offset we could have a better
2450 // alignment than 4, but we don't really need it.
2451 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2452 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2453 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2454 MachineMemOperand::MODereferenceable |
2455 MachineMemOperand::MOInvariant);
2456
2457 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2458 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2459
2460 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2461 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2462 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2463
2464 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2465 }
2466
2467 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2468 SDValue Load = DAG.getLoad(
2469 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2470 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2471
2472 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2473 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2474}
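// Worked example of the align-down trick above (illustrative values): an i8
// kernarg at offset 6 is loaded as a naturally aligned i32 from
// AlignDownOffset = 4, shifted right by OffsetDiff * 8 = 16 bits, and
// truncated to i8. This avoids a sub-dword extload from the kernarg segment
// and lets the dword load merge with a neighboring argument's load.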
2475
2476/// Coerce an argument which was passed in a different ABI type to the original
2477/// expected value type.
2478SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2479 SDValue Val,
2480 CCValAssign &VA,
2481 const SDLoc &SL) const {
2482 EVT ValVT = VA.getValVT();
2483
2484 // If this is an 8 or 16-bit value, it is really passed promoted
2485 // to 32 bits. Insert an assert[sz]ext to capture this, then
2486 // truncate to the right size.
2487 switch (VA.getLocInfo()) {
2488 case CCValAssign::Full:
2489 return Val;
2490 case CCValAssign::BCvt:
2491 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2492 case CCValAssign::SExt:
2493 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2494 DAG.getValueType(ValVT));
2495 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2496 case CCValAssign::ZExt:
2497 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2498 DAG.getValueType(ValVT));
2499 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2500 case CCValAssign::AExt:
2501 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2502 default:
2503 llvm_unreachable("Unknown loc info!");
2504 }
2505}
2506
2507SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2508 CCValAssign &VA, const SDLoc &SL,
2509 SDValue Chain,
2510 const ISD::InputArg &Arg) const {
2511 MachineFunction &MF = DAG.getMachineFunction();
2512 MachineFrameInfo &MFI = MF.getFrameInfo();
2513
2514 if (Arg.Flags.isByVal()) {
2515 unsigned Size = Arg.Flags.getByValSize();
2516 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2517 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2518 }
2519
2520 unsigned ArgOffset = VA.getLocMemOffset();
2521 unsigned ArgSize = VA.getValVT().getStoreSize();
2522
2523 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2524
2525 // Create load nodes to retrieve arguments from the stack.
2526 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2527
2528 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2529 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2530 MVT MemVT = VA.getValVT();
2531
2532 switch (VA.getLocInfo()) {
2533 default:
2534 break;
2535 case CCValAssign::BCvt:
2536 MemVT = VA.getLocVT();
2537 break;
2538 case CCValAssign::SExt:
2539 ExtType = ISD::SEXTLOAD;
2540 break;
2541 case CCValAssign::ZExt:
2542 ExtType = ISD::ZEXTLOAD;
2543 break;
2544 case CCValAssign::AExt:
2545 ExtType = ISD::EXTLOAD;
2546 break;
2547 }
2548
2549 SDValue ArgValue = DAG.getExtLoad(
2550 ExtType, SL, VA.getLocVT(), Chain, FIN,
2551 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2552
2553 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2554 if (ConvertedVal == ArgValue)
2555 return ConvertedVal;
2556
2557 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2558}
2559
2560SDValue SITargetLowering::lowerWorkGroupId(
2561 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2562 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2563 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2564 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2565 if (!Subtarget->hasClusters())
2566 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2567
2568 // Clusters are supported. Return the global position in the grid. If clusters
2569 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2570
2571 // WorkGroupIdXYZ = ClusterId == 0 ?
2572 // ClusterIdXYZ :
2573 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2574 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2575 SDLoc SL(ClusterIdXYZ);
2576 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2577 SDValue One = DAG.getConstant(1, SL, VT);
2578 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2579 SDValue ClusterWorkGroupIdXYZ =
2580 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2581 SDValue GlobalIdXYZ =
2582 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2583 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2584
2585 switch (MFI.getClusterDims().getKind()) {
2588 return GlobalIdXYZ;
2590 return ClusterIdXYZ;
2592 using namespace AMDGPU::Hwreg;
2593 SDValue ClusterIdField =
2594 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2595 SDNode *GetReg =
2596 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2597 SDValue ClusterId(GetReg, 0);
2598 SDValue Zero = DAG.getConstant(0, SL, VT);
2599 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2600 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2601 }
2602 }
2603
2604 llvm_unreachable("nothing should reach here");
2605}
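// Worked example for the formula above (illustrative numbers): with
// ClusterMaxIdX = 3 (four workgroups per cluster in X), ClusterIdX = 2 and
// ClusterWorkGroupIdX = 1 give WorkGroupIdX = 2 * (3 + 1) + 1 = 9. The
// SELECT_CC in the dynamic case falls back to ClusterIdXYZ whenever the
// hardware cluster-ID field reads 0, i.e. when clusters are not in use.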
2606
2607SDValue SITargetLowering::getPreloadedValue(
2608 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2609 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2610 const ArgDescriptor *Reg = nullptr;
2611 const TargetRegisterClass *RC;
2612 LLT Ty;
2613
2614 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2615 const ArgDescriptor WorkGroupIDX =
2616 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2617 // If GridZ is not programmed in an entry function then the hardware will set
2618 // it to all zeros, so there is no need to mask the GridY value in the low
2619 // order bits.
2620 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2621 AMDGPU::TTMP7,
2622 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2623 const ArgDescriptor WorkGroupIDZ =
2624 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2625 const ArgDescriptor ClusterWorkGroupIDX =
2626 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2627 const ArgDescriptor ClusterWorkGroupIDY =
2628 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2629 const ArgDescriptor ClusterWorkGroupIDZ =
2630 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2631 const ArgDescriptor ClusterWorkGroupMaxIDX =
2632 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2633 const ArgDescriptor ClusterWorkGroupMaxIDY =
2634 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2635 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2636 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2637 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2638 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2639
2640 auto LoadConstant = [&](unsigned N) {
2641 return DAG.getConstant(N, SDLoc(), VT);
2642 };
2643
2644 if (Subtarget->hasArchitectedSGPRs() &&
2645 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2646 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2647 bool HasFixedDims = ClusterDims.isFixedDims();
2648
2649 switch (PVID) {
2650 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2651 Reg = &WorkGroupIDX;
2652 RC = &AMDGPU::SReg_32RegClass;
2653 Ty = LLT::scalar(32);
2654 break;
2655 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2656 Reg = &WorkGroupIDY;
2657 RC = &AMDGPU::SReg_32RegClass;
2658 Ty = LLT::scalar(32);
2659 break;
2660 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2661 Reg = &WorkGroupIDZ;
2662 RC = &AMDGPU::SReg_32RegClass;
2663 Ty = LLT::scalar(32);
2664 break;
2665 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
2666 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2667 return LoadConstant(0);
2668 Reg = &ClusterWorkGroupIDX;
2669 RC = &AMDGPU::SReg_32RegClass;
2670 Ty = LLT::scalar(32);
2671 break;
2672 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
2673 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2674 return LoadConstant(0);
2675 Reg = &ClusterWorkGroupIDY;
2676 RC = &AMDGPU::SReg_32RegClass;
2677 Ty = LLT::scalar(32);
2678 break;
2679 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
2680 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2681 return LoadConstant(0);
2682 Reg = &ClusterWorkGroupIDZ;
2683 RC = &AMDGPU::SReg_32RegClass;
2684 Ty = LLT::scalar(32);
2685 break;
2686 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
2687 if (HasFixedDims)
2688 return LoadConstant(ClusterDims.getDims()[0] - 1);
2689 Reg = &ClusterWorkGroupMaxIDX;
2690 RC = &AMDGPU::SReg_32RegClass;
2691 Ty = LLT::scalar(32);
2692 break;
2693 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
2694 if (HasFixedDims)
2695 return LoadConstant(ClusterDims.getDims()[1] - 1);
2696 Reg = &ClusterWorkGroupMaxIDY;
2697 RC = &AMDGPU::SReg_32RegClass;
2698 Ty = LLT::scalar(32);
2699 break;
2700 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
2701 if (HasFixedDims)
2702 return LoadConstant(ClusterDims.getDims()[2] - 1);
2703 Reg = &ClusterWorkGroupMaxIDZ;
2704 RC = &AMDGPU::SReg_32RegClass;
2705 Ty = LLT::scalar(32);
2706 break;
2707 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
2708 Reg = &ClusterWorkGroupMaxFlatID;
2709 RC = &AMDGPU::SReg_32RegClass;
2710 Ty = LLT::scalar(32);
2711 break;
2712 default:
2713 break;
2714 }
2715 }
2716
2717 if (!Reg)
2718 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2719 if (!Reg) {
2720 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2721 // It's possible for a kernarg intrinsic call to appear in a kernel with
2722 // no allocated segment, in which case we do not add the user sgpr
2723 // argument, so just return null.
2724 return DAG.getConstant(0, SDLoc(), VT);
2725 }
2726
2727 // It's undefined behavior if a function marked with the amdgpu-no-*
2728 // attributes uses the corresponding intrinsic.
2729 return DAG.getPOISON(VT);
2730 }
2731
2732 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2733}
2734
2735static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2736 CallingConv::ID CallConv,
2737 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2738 FunctionType *FType,
2739 SIMachineFunctionInfo *Info) {
2740 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2741 const ISD::InputArg *Arg = &Ins[I];
2742
2743 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2744 "vector type argument should have been split");
2745
2746 // First check if it's a PS input addr.
2747 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2748 PSInputNum <= 15) {
2749 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2750
2751 // Inconveniently only the first part of the split is marked as isSplit,
2752 // so skip to the end. We only want to increment PSInputNum once for the
2753 // entire split argument.
2754 if (Arg->Flags.isSplit()) {
2755 while (!Arg->Flags.isSplitEnd()) {
2756 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2757 "unexpected vector split in ps argument type");
2758 if (!SkipArg)
2759 Splits.push_back(*Arg);
2760 Arg = &Ins[++I];
2761 }
2762 }
2763
2764 if (SkipArg) {
2765 // We can safely skip PS inputs.
2766 Skipped.set(Arg->getOrigArgIndex());
2767 ++PSInputNum;
2768 continue;
2769 }
2770
2771 Info->markPSInputAllocated(PSInputNum);
2772 if (Arg->Used)
2773 Info->markPSInputEnabled(PSInputNum);
2774
2775 ++PSInputNum;
2776 }
2777
2778 Splits.push_back(*Arg);
2779 }
2780}
2781
2782// Allocate special inputs passed in VGPRs.
2783void SITargetLowering::allocateSpecialEntryInputVGPRs(
2784 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2785 SIMachineFunctionInfo &Info) const {
2786 const LLT S32 = LLT::scalar(32);
2787 MachineRegisterInfo &MRI = MF.getRegInfo();
2788
2789 if (Info.hasWorkItemIDX()) {
2790 Register Reg = AMDGPU::VGPR0;
2791 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2792
2793 CCInfo.AllocateReg(Reg);
2794 unsigned Mask =
2795 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2796 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2797 }
2798
2799 if (Info.hasWorkItemIDY()) {
2800 assert(Info.hasWorkItemIDX());
2801 if (Subtarget->hasPackedTID()) {
2802 Info.setWorkItemIDY(
2803 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2804 } else {
2805 unsigned Reg = AMDGPU::VGPR1;
2806 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2807
2808 CCInfo.AllocateReg(Reg);
2809 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2810 }
2811 }
2812
2813 if (Info.hasWorkItemIDZ()) {
2814 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2815 if (Subtarget->hasPackedTID()) {
2816 Info.setWorkItemIDZ(
2817 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2818 } else {
2819 unsigned Reg = AMDGPU::VGPR2;
2820 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2821
2822 CCInfo.AllocateReg(Reg);
2823 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2824 }
2825 }
2826}
2827
2828 // Try to allocate a VGPR at the end of the argument list, or if no argument
2829 // VGPRs are left, allocate a stack slot instead.
2830 // If \p Mask is given, it indicates the bitfield position in the register.
2831 // If \p Arg is given, use it with the new \p Mask instead of allocating new.
2832static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2833 ArgDescriptor Arg = ArgDescriptor()) {
2834 if (Arg.isSet())
2835 return ArgDescriptor::createArg(Arg, Mask);
2836
2837 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2838 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2839 if (RegIdx == ArgVGPRs.size()) {
2840 // Spill to stack required.
2841 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2842
2843 return ArgDescriptor::createStack(Offset, Mask);
2844 }
2845
2846 unsigned Reg = ArgVGPRs[RegIdx];
2847 Reg = CCInfo.AllocateReg(Reg);
2848 assert(Reg != AMDGPU::NoRegister);
2849
2850 MachineFunction &MF = CCInfo.getMachineFunction();
2851 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2852 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2853 return ArgDescriptor::createRegister(Reg, Mask);
2854}
2855
2856static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2857 const TargetRegisterClass *RC,
2858 unsigned NumArgRegs) {
2859 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2860 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2861 if (RegIdx == ArgSGPRs.size())
2862 report_fatal_error("ran out of SGPRs for arguments");
2863
2864 unsigned Reg = ArgSGPRs[RegIdx];
2865 Reg = CCInfo.AllocateReg(Reg);
2866 assert(Reg != AMDGPU::NoRegister);
2867
2868 MachineFunction &MF = CCInfo.getMachineFunction();
2869 MF.addLiveIn(Reg, RC);
2870 return ArgDescriptor::createRegister(Reg);
2871}
2872
2873// If this has a fixed position, we still should allocate the register in the
2874// CCInfo state. Technically we could get away with this for values passed
2875// outside of the normal argument range.
2876static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2877 const TargetRegisterClass *RC,
2878 MCRegister Reg) {
2879 Reg = CCInfo.AllocateReg(Reg);
2880 assert(Reg != AMDGPU::NoRegister);
2881 MachineFunction &MF = CCInfo.getMachineFunction();
2882 MF.addLiveIn(Reg, RC);
2883}
2884
2885static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2886 if (Arg) {
2887 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2888 Arg.getRegister());
2889 } else
2890 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2891}
2892
2893static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2894 if (Arg) {
2895 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2896 Arg.getRegister());
2897 } else
2898 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2899}
2900
2901/// Allocate implicit function VGPR arguments at the end of allocated user
2902/// arguments.
2903void SITargetLowering::allocateSpecialInputVGPRs(
2904 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2905 SIMachineFunctionInfo &Info) const {
2906 const unsigned Mask = 0x3ff;
2907 ArgDescriptor Arg;
2908
2909 if (Info.hasWorkItemIDX()) {
2910 Arg = allocateVGPR32Input(CCInfo, Mask);
2911 Info.setWorkItemIDX(Arg);
2912 }
2913
2914 if (Info.hasWorkItemIDY()) {
2915 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2916 Info.setWorkItemIDY(Arg);
2917 }
2918
2919 if (Info.hasWorkItemIDZ())
2920 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2921}
2922
2923/// Allocate implicit function VGPR arguments in fixed registers.
2924void SITargetLowering::allocateSpecialInputVGPRsFixed(
2925 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2926 SIMachineFunctionInfo &Info) const {
2927 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2928 if (!Reg)
2929 report_fatal_error("failed to allocate VGPR for implicit arguments");
2930
2931 const unsigned Mask = 0x3ff;
2932 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2933 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2934 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2935}
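// Resulting layout (derived from the masks above): all three workitem IDs
// are packed into the single input VGPR:
//
//   IDX = VGPR31 & 0x3ff;         // bits [9:0]
//   IDY = (VGPR31 >> 10) & 0x3ff; // bits [19:10]
//   IDZ = (VGPR31 >> 20) & 0x3ff; // bits [29:20]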
2936
2937void SITargetLowering::allocateSpecialInputSGPRs(
2938 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2939 SIMachineFunctionInfo &Info) const {
2940 auto &ArgInfo = Info.getArgInfo();
2941 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2942
2943 // TODO: Unify handling with private memory pointers.
2944 if (UserSGPRInfo.hasDispatchPtr())
2945 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2946
2947 if (UserSGPRInfo.hasQueuePtr())
2948 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2949
2950 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2951 // constant offset from the kernarg segment.
2952 if (Info.hasImplicitArgPtr())
2953 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2954
2955 if (UserSGPRInfo.hasDispatchID())
2956 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2957
2958 // flat_scratch_init is not applicable for non-kernel functions.
2959
2960 if (Info.hasWorkGroupIDX())
2961 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2962
2963 if (Info.hasWorkGroupIDY())
2964 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2965
2966 if (Info.hasWorkGroupIDZ())
2967 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2968
2969 if (Info.hasLDSKernelId())
2970 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2971}
2972
2973// Allocate special inputs passed in user SGPRs.
2974void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2975 MachineFunction &MF,
2976 const SIRegisterInfo &TRI,
2977 SIMachineFunctionInfo &Info) const {
2978 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2979 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2980 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2981 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2982 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2983 }
2984
2985 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2986 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2987 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2988 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2989 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2990 }
2991
2992 if (UserSGPRInfo.hasDispatchPtr()) {
2993 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2994 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2995 CCInfo.AllocateReg(DispatchPtrReg);
2996 }
2997
2998 if (UserSGPRInfo.hasQueuePtr()) {
2999 Register QueuePtrReg = Info.addQueuePtr(TRI);
3000 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3001 CCInfo.AllocateReg(QueuePtrReg);
3002 }
3003
3004 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3005 MachineRegisterInfo &MRI = MF.getRegInfo();
3006 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3007 CCInfo.AllocateReg(InputPtrReg);
3008
3009 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3010 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
3011 }
3012
3013 if (UserSGPRInfo.hasDispatchID()) {
3014 Register DispatchIDReg = Info.addDispatchID(TRI);
3015 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3016 CCInfo.AllocateReg(DispatchIDReg);
3017 }
3018
3019 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3020 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3021 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3022 CCInfo.AllocateReg(FlatScratchInitReg);
3023 }
3024
3025 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3026 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3027 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3028 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3029 }
3030
3031 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3032 // these from the dispatch pointer.
3033}
3034
3035 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
3036 // sequential, starting from the first argument.
3037void SITargetLowering::allocatePreloadKernArgSGPRs(
3038 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3039 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
3040 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3041 Function &F = MF.getFunction();
3042 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3043 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3044 bool InPreloadSequence = true;
3045 unsigned InIdx = 0;
3046 bool AlignedForImplicitArgs = false;
3047 unsigned ImplicitArgOffset = 0;
3048 for (auto &Arg : F.args()) {
3049 if (!InPreloadSequence || !Arg.hasInRegAttr())
3050 break;
3051
3052 unsigned ArgIdx = Arg.getArgNo();
3053 // Don't preload non-original args or parts not in the current preload
3054 // sequence.
3055 if (InIdx < Ins.size() &&
3056 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3057 break;
3058
3059 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3060 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3061 InIdx++) {
3062 assert(ArgLocs[ArgIdx].isMemLoc());
3063 auto &ArgLoc = ArgLocs[InIdx];
3064 const Align KernelArgBaseAlign = Align(16);
3065 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3066 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
3067 unsigned NumAllocSGPRs =
3068 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3069
3070 // Fix alignment for hidden arguments.
3071 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3072 if (!AlignedForImplicitArgs) {
3073 ImplicitArgOffset =
3074 alignTo(LastExplicitArgOffset,
3075 Subtarget->getAlignmentForImplicitArgPtr()) -
3076 LastExplicitArgOffset;
3077 AlignedForImplicitArgs = true;
3078 }
3079 ArgOffset += ImplicitArgOffset;
3080 }
3081
3082 // Arg is preloaded into the previous SGPR.
3083 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3084 assert(InIdx >= 1 && "No previous SGPR");
3085 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3086 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3087 continue;
3088 }
3089
3090 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3091 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3092 // Check for free user SGPRs for preloading.
3093 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3094 InPreloadSequence = false;
3095 break;
3096 }
3097
3098 // Preload this argument.
3099 const TargetRegisterClass *RC =
3100 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3101 SmallVectorImpl<MCRegister> *PreloadRegs =
3102 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3103
3104 if (PreloadRegs->size() > 1)
3105 RC = &AMDGPU::SGPR_32RegClass;
3106 for (auto &Reg : *PreloadRegs) {
3107 assert(Reg);
3108 MF.addLiveIn(Reg, RC);
3109 CCInfo.AllocateReg(Reg);
3110 }
3111
3112 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3113 }
3114 }
3115}
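// Worked example of the padding computation above (illustrative): if the
// last explicit argument ended at byte offset 4 and the next inreg argument
// sits at ArgOffset 16 with a 64-bit type, then Padding = 12, PaddingSGPRs
// = 3 and NumAllocSGPRs = 2, so preloading this argument consumes 5 user
// SGPRs and the sequence stops if fewer than 5 are free.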
3116
3117void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3118 const SIRegisterInfo &TRI,
3119 SIMachineFunctionInfo &Info) const {
3120 // Always allocate this last since it is a synthetic preload.
3121 if (Info.hasLDSKernelId()) {
3122 Register Reg = Info.addLDSKernelId();
3123 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3124 CCInfo.AllocateReg(Reg);
3125 }
3126}
3127
3128// Allocate special input registers that are initialized per-wave.
3129void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3130 SIMachineFunctionInfo &Info,
3131 CallingConv::ID CallConv,
3132 bool IsShader) const {
3133 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3134 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3135 // Note: user SGPRs are handled by the front-end for graphics shaders.
3136 // Pad up the used user SGPRs with dead inputs.
3137
3138 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3139 // before enabling architected SGPRs for workgroup IDs.
3140 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3141
3142 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3143 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3144 // rely on it to reach 16 since if we end up having no stack usage, it will
3145 // not really be added.
3146 unsigned NumRequiredSystemSGPRs =
3147 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3148 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3149 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3150 Register Reg = Info.addReservedUserSGPR();
3151 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3152 CCInfo.AllocateReg(Reg);
3153 }
3154 }
3155
3156 if (!HasArchitectedSGPRs) {
3157 if (Info.hasWorkGroupIDX()) {
3158 Register Reg = Info.addWorkGroupIDX();
3159 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3160 CCInfo.AllocateReg(Reg);
3161 }
3162
3163 if (Info.hasWorkGroupIDY()) {
3164 Register Reg = Info.addWorkGroupIDY();
3165 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3166 CCInfo.AllocateReg(Reg);
3167 }
3168
3169 if (Info.hasWorkGroupIDZ()) {
3170 Register Reg = Info.addWorkGroupIDZ();
3171 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3172 CCInfo.AllocateReg(Reg);
3173 }
3174 }
3175
3176 if (Info.hasWorkGroupInfo()) {
3177 Register Reg = Info.addWorkGroupInfo();
3178 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3179 CCInfo.AllocateReg(Reg);
3180 }
3181
3182 if (Info.hasPrivateSegmentWaveByteOffset()) {
3183 // Scratch wave offset passed in system SGPR.
3184 unsigned PrivateSegmentWaveByteOffsetReg;
3185
3186 if (IsShader) {
3187 PrivateSegmentWaveByteOffsetReg =
3188 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3189
3190 // This is true if the scratch wave byte offset doesn't have a fixed
3191 // location.
3192 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3193 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3194 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3195 }
3196 } else
3197 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3198
3199 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3200 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3201 }
3202
3203 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3204 Info.getNumPreloadedSGPRs() >= 16);
3205}
3206
3207static void reservePrivateMemoryRegs(const TargetMachine &TM,
3208 MachineFunction &MF,
3209 const SIRegisterInfo &TRI,
3210 SIMachineFunctionInfo &Info) {
3211 // Now that we've figured out where the scratch register inputs are, see if
3212 // we should reserve the arguments and use them directly.
3213 MachineFrameInfo &MFI = MF.getFrameInfo();
3214 bool HasStackObjects = MFI.hasStackObjects();
3215 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3216
3217 // Record that we know we have non-spill stack objects so we don't need to
3218 // check all stack objects later.
3219 if (HasStackObjects)
3220 Info.setHasNonSpillStackObjects(true);
3221
3222 // Everything live out of a block is spilled with fast regalloc, so it's
3223 // almost certain that spilling will be required.
3224 if (TM.getOptLevel() == CodeGenOptLevel::None)
3225 HasStackObjects = true;
3226
3227 // For now assume stack access is needed in any callee functions, so we need
3228 // the scratch registers to pass in.
3229 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3230
3231 if (!ST.hasFlatScratchEnabled()) {
3232 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3233 // If we have stack objects, we unquestionably need the private buffer
3234 // resource. For the Code Object V2 ABI, this will be the first 4 user
3235 // SGPR inputs. We can reserve those and use them directly.
3236
3237 Register PrivateSegmentBufferReg =
3238 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3239 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3240 } else {
3241 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3242 // We tentatively reserve the last registers (skipping those that may
3243 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3244 // we'll replace these with the ones immediately after those which were
3245 // really allocated. In the prologue, copies will be inserted from the
3246 // argument to these reserved registers.
3247
3248 // Without HSA, relocations are used for the scratch pointer and the
3249 // buffer resource setup is always inserted in the prologue. Scratch wave
3250 // offset is still in an input SGPR.
3251 Info.setScratchRSrcReg(ReservedBufferReg);
3252 }
3253 }
3254
3255 MachineRegisterInfo &MRI = MF.getRegInfo();
3256
3257 // For entry functions we have to set up the stack pointer if we use it,
3258 // whereas non-entry functions get this "for free". This means there is no
3259 // intrinsic advantage to using S32 over S34 in cases where we do not have
3260 // calls but do need a frame pointer (i.e. if we are requested to have one
3261 // because frame pointer elimination is disabled). To keep things simple we
3262 // only ever use S32 as the call ABI stack pointer, and so using it does not
3263 // imply we need a separate frame pointer.
3264 //
3265 // Try to use s32 as the SP, but move it if it would interfere with input
3266 // arguments. This won't work with calls though.
3267 //
3268 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3269 // registers.
3270 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3271 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3272 } else {
3273 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
3274
3275 if (MFI.hasCalls())
3276 report_fatal_error("call in graphics shader with too many input SGPRs");
3277
3278 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3279 if (!MRI.isLiveIn(Reg)) {
3280 Info.setStackPtrOffsetReg(Reg);
3281 break;
3282 }
3283 }
3284
3285 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3286 report_fatal_error("failed to find register for SP");
3287 }
3288
3289 // hasFP should be accurate for entry functions even before the frame is
3290 // finalized, because it does not rely on the known stack size, only
3291 // properties like whether variable sized objects are present.
3292 if (ST.getFrameLowering()->hasFP(MF)) {
3293 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3294 }
3295}
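// Sketch of the resulting convention (illustrative, not normative): in an
// HSA entry function with stack objects on a target without flat scratch,
// the first four user SGPRs become the scratch rsrc, SGPR32 is used as the
// stack pointer unless it is already a live-in argument, and SGPR33 is
// claimed as the frame pointer only when hasFP() holds.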
3296
3297bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3298 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3299 return !Info->isEntryFunction();
3300}
3301
3302void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
3303
3304void SITargetLowering::insertCopiesSplitCSR(
3305 MachineBasicBlock *Entry,
3306 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3307 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3308
3309 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3310 if (!IStart)
3311 return;
3312
3313 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3314 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3315 MachineBasicBlock::iterator MBBI = Entry->begin();
3316 for (const MCPhysReg *I = IStart; *I; ++I) {
3317 const TargetRegisterClass *RC = nullptr;
3318 if (AMDGPU::SReg_64RegClass.contains(*I))
3319 RC = &AMDGPU::SGPR_64RegClass;
3320 else if (AMDGPU::SReg_32RegClass.contains(*I))
3321 RC = &AMDGPU::SGPR_32RegClass;
3322 else
3323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3324
3325 Register NewVR = MRI->createVirtualRegister(RC);
3326 // Create copy from CSR to a virtual register.
3327 Entry->addLiveIn(*I);
3328 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3329 .addReg(*I);
3330
3331 // Insert the copy-back instructions right before the terminator.
3332 for (auto *Exit : Exits)
3333 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3334 TII->get(TargetOpcode::COPY), *I)
3335 .addReg(NewVR);
3336 }
3337}
3338
3339SDValue SITargetLowering::LowerFormalArguments(
3340 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3341 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3342 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3343 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3344
3345 MachineFunction &MF = DAG.getMachineFunction();
3346 const Function &Fn = MF.getFunction();
3347 FunctionType *FType = MF.getFunction().getFunctionType();
3348 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3349 bool IsError = false;
3350
3351 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3352 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3353 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3354 IsError = true;
3355 }
3356
3357 SmallVector<ISD::InputArg, 16> Splits;
3358 SmallVector<CCValAssign, 16> ArgLocs;
3359 BitVector Skipped(Ins.size());
3360 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3361 *DAG.getContext());
3362
3363 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3364 bool IsKernel = AMDGPU::isKernel(CallConv);
3365 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3366
3367 if (IsGraphics) {
3368 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3369 assert(!UserSGPRInfo.hasDispatchPtr() &&
3370 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3371 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3372 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3373 (void)UserSGPRInfo;
3374 if (!Subtarget->hasFlatScratchEnabled())
3375 assert(!UserSGPRInfo.hasFlatScratchInit());
3376 if ((CallConv != CallingConv::AMDGPU_CS &&
3377 CallConv != CallingConv::AMDGPU_Gfx &&
3378 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3379 !Subtarget->hasArchitectedSGPRs())
3380 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3381 !Info->hasWorkGroupIDZ());
3382 }
3383
3384 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3385
3386 if (CallConv == CallingConv::AMDGPU_PS) {
3387 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3388
3389 // At least one interpolation mode must be enabled or else the GPU will
3390 // hang.
3391 //
3392 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3393 // set PSInputAddr, the user wants to enable some bits after compilation
3394 // based on run-time states. Since we can't know what the final PSInputEna
3395 // will look like, we shouldn't do anything here and the user should take
3396 // responsibility for the correct programming.
3397 //
3398 // Otherwise, the following restrictions apply:
3399 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3400 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3401 // enabled too.
3402 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3403 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3404 CCInfo.AllocateReg(AMDGPU::VGPR0);
3405 CCInfo.AllocateReg(AMDGPU::VGPR1);
3406 Info->markPSInputAllocated(0);
3407 Info->markPSInputEnabled(0);
3408 }
3409 if (Subtarget->isAmdPalOS()) {
3410 // For isAmdPalOS, the user does not enable some bits after compilation
3411 // based on run-time states; the register values being generated here are
3412 // the final ones set in hardware. Therefore we need to apply the
3413 // workaround to PSInputAddr and PSInputEnable together. (The case where
3414 // a bit is set in PSInputAddr but not PSInputEnable is where the
3415 // frontend set up an input arg for a particular interpolation mode, but
3416 // nothing uses that input arg. Really we should have an earlier pass
3417 // that removes such an arg.)
3418 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3419 if ((PsInputBits & 0x7F) == 0 ||
3420 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3421 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3422 }
3423 } else if (IsKernel) {
3424 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3425 } else {
3426 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3427 Ins.end());
3428 }
3429
3430 if (IsKernel)
3431 analyzeFormalArgumentsCompute(CCInfo, Ins);
3432
3433 if (IsEntryFunc) {
3434 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3435 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3436 if (IsKernel && Subtarget->hasKernargPreload())
3437 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3438
3439 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3440 } else if (!IsGraphics) {
3441 // For the fixed ABI, pass workitem IDs in the last argument register.
3442 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3443
3444 // FIXME: Sink this into allocateSpecialInputSGPRs
3445 if (!Subtarget->hasFlatScratchEnabled())
3446 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3447
3448 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3449 }
3450
3451 if (!IsKernel) {
3452 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3453 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3454
3455 // This assumes the registers are allocated by CCInfo in ascending order
3456 // with no gaps.
3457 Info->setNumWaveDispatchSGPRs(
3458 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3459 Info->setNumWaveDispatchVGPRs(
3460 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3461 } else if (Info->getNumKernargPreloadedSGPRs()) {
3462 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3463 }
3464
3465 SmallVector<SDValue, 16> Chains;
3466
3467 if (IsWholeWaveFunc) {
3468 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3469 {MVT::i1, MVT::Other}, Chain);
3470 InVals.push_back(Setup.getValue(0));
3471 Chains.push_back(Setup.getValue(1));
3472 }
3473
3474 // FIXME: This is the minimum kernel argument alignment. We should improve
3475 // this to the maximum alignment of the arguments.
3476 //
3477 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3478 // kern arg offset.
3479 const Align KernelArgBaseAlign = Align(16);
3480
3481 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3482 ++i) {
3483 const ISD::InputArg &Arg = Ins[i];
3484 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3485 InVals.push_back(DAG.getPOISON(Arg.VT));
3486 continue;
3487 }
3488
3489 CCValAssign &VA = ArgLocs[ArgIdx++];
3490 MVT VT = VA.getLocVT();
3491
3492 if (IsEntryFunc && VA.isMemLoc()) {
3493 VT = Ins[i].VT;
3494 EVT MemVT = VA.getLocVT();
3495
3496 const uint64_t Offset = VA.getLocMemOffset();
3497 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3498
3499 if (Arg.Flags.isByRef()) {
3500 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3501
3502 const GCNTargetMachine &TM =
3503 static_cast<const GCNTargetMachine &>(getTargetMachine());
3504 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3505 Arg.Flags.getPointerAddrSpace())) {
3506 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3507 Arg.Flags.getPointerAddrSpace());
3508 }
3509
3510 InVals.push_back(Ptr);
3511 continue;
3512 }
3513
3514 SDValue NewArg;
3515 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3516 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3517 // In this case the argument is packed into the previous preload SGPR.
3518 int64_t AlignDownOffset = alignDown(Offset, 4);
3519 int64_t OffsetDiff = Offset - AlignDownOffset;
3520 EVT IntVT = MemVT.changeTypeToInteger();
3521
3522 const SIMachineFunctionInfo *Info =
3523 MF.getInfo<SIMachineFunctionInfo>();
3524 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3525 Register Reg =
3526 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3527
3528 assert(Reg);
3529 Register VReg = MRI.getLiveInVirtReg(Reg);
3530 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3531
3532 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3533 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3534
3535 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3536 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3537 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3538 Ins[i].Flags.isSExt(), &Ins[i]);
3539
3540 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3541 } else {
3542 const SIMachineFunctionInfo *Info =
3543 MF.getInfo<SIMachineFunctionInfo>();
3544 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3545 const SmallVectorImpl<MCRegister> &PreloadRegs =
3546 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3547
3548 SDValue Copy;
3549 if (PreloadRegs.size() == 1) {
3550 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3551 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3552 NewArg = DAG.getCopyFromReg(
3553 Chain, DL, VReg,
3554 EVT::getIntegerVT(*DAG.getContext(),
3555 TRI->getRegSizeInBits(*RC)));
3556
3557 } else {
3558 // If the kernarg alignment does not match the alignment of the SGPR
3559 // tuple RC that can accommodate this argument, it will be built up
3560 // via copies from the individual SGPRs that the argument was
3561 // preloaded to.
3562 SmallVector<SDValue, 4> Elts;
3563 for (auto Reg : PreloadRegs) {
3564 Register VReg = MRI.getLiveInVirtReg(Reg);
3565 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3566 Elts.push_back(Copy);
3567 }
3568 NewArg =
3569 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3570 PreloadRegs.size()),
3571 DL, Elts);
3572 }
3573
3574 // If the argument was preloaded to multiple consecutive 32-bit
3575 // registers because of misalignment between addressable SGPR tuples
3576 // and the argument size, we can still assume, because of kernarg
3577 // segment alignment restrictions, that NewArg's size is the same as
3578 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3579 // truncate since we cannot preload to less than a single SGPR and the
3580 // MemVT may be smaller.
3581 EVT MemVTInt =
3582 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3583 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3584 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3585
3586 NewArg = DAG.getBitcast(MemVT, NewArg);
3587 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3588 Ins[i].Flags.isSExt(), &Ins[i]);
3589 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3590 }
3591 } else {
3592 // Hidden arguments that are in the kernel signature must be preloaded
3593 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3594 // the argument list and is not preloaded.
3595 if (Arg.isOrigArg()) {
3596 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3597 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3598 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
3599 *OrigArg->getParent(),
3600 "hidden argument in kernel signature was not preloaded",
3601 DL.getDebugLoc()));
3602 }
3603 }
3604
3605 NewArg =
3606 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3607 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3608 }
3609 Chains.push_back(NewArg.getValue(1));
3610
3611 auto *ParamTy =
3612 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3613 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3614 ParamTy &&
3615 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3616 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3617 // On SI, local pointers are just offsets into LDS, so they are always
3618 // less than 16 bits. On CI and newer they could potentially be
3619 // real pointers, so we can't guarantee their size.
3620 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3621 DAG.getValueType(MVT::i16));
3622 }
3623
3624 InVals.push_back(NewArg);
3625 continue;
3626 }
3627 if (!IsEntryFunc && VA.isMemLoc()) {
3628 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3629 InVals.push_back(Val);
3630 if (!Arg.Flags.isByVal())
3631 Chains.push_back(Val.getValue(1));
3632 continue;
3633 }
3634
3635 assert(VA.isRegLoc() && "Parameter must be in a register!");
3636
3637 Register Reg = VA.getLocReg();
3638 const TargetRegisterClass *RC = nullptr;
3639 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3640 RC = &AMDGPU::VGPR_32RegClass;
3641 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3642 RC = &AMDGPU::SGPR_32RegClass;
3643 else
3644 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3645
3646 Reg = MF.addLiveIn(Reg, RC);
3647 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3648 if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3649 // FIXME: Need to forward the chains created by `CopyFromReg`s, to make sure
3650 // they read the physical regs before any side-effecting instructions.
3651 SDValue ReadFirstLane =
3652 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3653 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
3654 ReadFirstLane, Val);
3655 }
3656
3657 if (Arg.Flags.isSRet()) {
3658 // The return object should be reasonably addressable.
3659
3660 // FIXME: This helps when the return is a real sret. If it is an
3661 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3662 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3663 unsigned NumBits =
3664 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3665 Val = DAG.getNode(
3666 ISD::AssertZext, DL, VT, Val,
3667 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3668 }
3669
3670 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3671 InVals.push_back(Val);
3672 }
3673
3674 // Start adding system SGPRs.
3675 if (IsEntryFunc)
3676 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3677
3678 unsigned StackArgSize = CCInfo.getStackSize();
3679 Info->setBytesInStackArgArea(StackArgSize);
3680
3681 return Chains.empty() ? Chain
3682 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3683}
3684
3685// TODO: If return values can't fit in registers, we should return as many as
3686// possible in registers before passing on stack.
3687 bool SITargetLowering::CanLowerReturn(
3688 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3689 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3690 const Type *RetTy) const {
3691 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3692 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3693 // for shaders. Vector types should be explicitly handled by CC.
3694 if (AMDGPU::isEntryFunctionCC(CallConv))
3695 return true;
3696
3697 SmallVector<CCValAssign, 16> RVLocs;
3698 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3699 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3700 return false;
3701
3702 // We must use the stack if return would require unavailable registers.
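// For example (illustrative): if this function may only use 32 VGPRs but a
// return value was assigned to v32 or above, returning in registers is
// impossible and we fall back to the stack.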
3703 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3704 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3705 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3706 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3707 return false;
3708
3709 return true;
3710}
3711
3712SDValue
3713 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3714 bool isVarArg,
3715 const SmallVectorImpl<ISD::OutputArg> &Outs,
3716 const SmallVectorImpl<SDValue> &OutVals,
3717 const SDLoc &DL, SelectionDAG &DAG) const {
3718 MachineFunction &MF = DAG.getMachineFunction();
3719 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3720 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3721
3722 if (AMDGPU::isKernel(CallConv)) {
3723 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3724 OutVals, DL, DAG);
3725 }
3726
3727 bool IsShader = AMDGPU::isShader(CallConv);
3728
3729 Info->setIfReturnsVoid(Outs.empty());
3730 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3731
3732 // CCValAssign - represent the assignment of the return value to a location.
3733 SmallVector<CCValAssign, 48> RVLocs;
3734
3735 // CCState - Info about the registers and stack slots.
3736 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3737 *DAG.getContext());
3738
3739 // Analyze outgoing return values.
3740 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3741
3742 SDValue Glue;
3743 SmallVector<SDValue, 48> RetOps;
3744 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3745
3746 SDValue ReadFirstLane =
3747 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3748 // Copy the result values into the output registers.
3749 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3750 ++I, ++RealRVLocIdx) {
3751 CCValAssign &VA = RVLocs[I];
3752 assert(VA.isRegLoc() && "Can only return in registers!");
3753 // TODO: Partially return in registers if return values don't fit.
3754 SDValue Arg = OutVals[RealRVLocIdx];
3755
3756 // Copied from other backends.
3757 switch (VA.getLocInfo()) {
3758 case CCValAssign::Full:
3759 break;
3760 case CCValAssign::BCvt:
3761 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3762 break;
3763 case CCValAssign::SExt:
3764 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3765 break;
3766 case CCValAssign::ZExt:
3767 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3768 break;
3769 case CCValAssign::AExt:
3770 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3771 break;
3772 default:
3773 llvm_unreachable("Unknown loc info!");
3774 }
3775 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3776 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3777 ReadFirstLane, Arg);
3778 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3779 Glue = Chain.getValue(1);
3780 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3781 }
3782
3783 // FIXME: Does sret work properly?
3784 if (!Info->isEntryFunction()) {
3785 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3786 const MCPhysReg *I =
3787 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3788 if (I) {
3789 for (; *I; ++I) {
3790 if (AMDGPU::SReg_64RegClass.contains(*I))
3791 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3792 else if (AMDGPU::SReg_32RegClass.contains(*I))
3793 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3794 else
3795 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3796 }
3797 }
3798 }
3799
3800 // Update chain and glue.
3801 RetOps[0] = Chain;
3802 if (Glue.getNode())
3803 RetOps.push_back(Glue);
3804
3805 unsigned Opc = AMDGPUISD::ENDPGM;
3806 if (!IsWaveEnd)
3807 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3808 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3809 : AMDGPUISD::RET_GLUE;
3810 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3811}
3812
3813 SDValue SITargetLowering::LowerCallResult(
3814 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3815 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3816 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3817 SDValue ThisVal) const {
3818 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3819
3820 // Assign locations to each value returned by this call.
3821 SmallVector<CCValAssign, 16> RVLocs;
3822 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3823 *DAG.getContext());
3824 CCInfo.AnalyzeCallResult(Ins, RetCC);
3825
3826 // Copy all of the result registers out of their specified physreg.
3827 for (CCValAssign VA : RVLocs) {
3828 SDValue Val;
3829
3830 if (VA.isRegLoc()) {
3831 Val =
3832 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3833 Chain = Val.getValue(1);
3834 InGlue = Val.getValue(2);
3835 } else if (VA.isMemLoc()) {
3836 report_fatal_error("TODO: return values in memory");
3837 } else
3838 llvm_unreachable("unknown argument location type");
3839
3840 switch (VA.getLocInfo()) {
3841 case CCValAssign::Full:
3842 break;
3843 case CCValAssign::BCvt:
3844 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3845 break;
3846 case CCValAssign::ZExt:
3847 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3848 DAG.getValueType(VA.getValVT()));
3849 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3850 break;
3851 case CCValAssign::SExt:
3852 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3853 DAG.getValueType(VA.getValVT()));
3854 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3855 break;
3856 case CCValAssign::AExt:
3857 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3858 break;
3859 default:
3860 llvm_unreachable("Unknown loc info!");
3861 }
3862
3863 InVals.push_back(Val);
3864 }
3865
3866 return Chain;
3867}
3868
3869 // Add code to pass the special inputs required by used features, separate
3870// from the explicit user arguments present in the IR.
3871 void SITargetLowering::passSpecialInputs(
3872 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3873 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3874 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3875 // If we don't have a call site, this was a call inserted by
3876 // legalization. These can never use special inputs.
3877 if (!CLI.CB)
3878 return;
3879
3880 SelectionDAG &DAG = CLI.DAG;
3881 const SDLoc &DL = CLI.DL;
3882 const Function &F = DAG.getMachineFunction().getFunction();
3883
3884 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3885 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3886
3887 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3888 AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3889
3890 // TODO: Unify with private memory register handling. This is complicated by
3891 // the fact that at least in kernels, the input argument is not necessarily
3892 // in the same location as the input.
3893 // clang-format off
3894 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3895 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3896 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3897 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3898 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3899 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3900 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3901 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3902 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3903 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3904 };
3905 // clang-format on
3906
3907 for (auto [InputID, Attrs] : ImplicitAttrs) {
3908 // If the callee does not use the attribute value, skip copying the value.
3909 if (all_of(Attrs, [&](StringRef Attr) {
3910 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3911 }))
3912 continue;
3913
3914 const auto [OutgoingArg, ArgRC, ArgTy] =
3915 CalleeArgInfo.getPreloadedValue(InputID);
3916 if (!OutgoingArg)
3917 continue;
3918
3919 const auto [IncomingArg, IncomingArgRC, Ty] =
3920 CallerArgInfo.getPreloadedValue(InputID);
3921 assert(IncomingArgRC == ArgRC);
3922
3923 // All special arguments are ints for now.
3924 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3925 SDValue InputReg;
3926
3927 if (IncomingArg) {
3928 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3929 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3930 // The implicit arg ptr is special because it doesn't have a corresponding
3931 // input for kernels, and is computed from the kernarg segment pointer.
3932 InputReg = getImplicitArgPtr(DAG, DL);
3933 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3934 std::optional<uint32_t> Id =
3935 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3936 if (Id.has_value()) {
3937 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3938 } else {
3939 InputReg = DAG.getPOISON(ArgVT);
3940 }
3941 } else {
3942 // We may have proven the input wasn't needed, although the ABI is
3943 // requiring it. We just need to allocate the register appropriately.
3944 InputReg = DAG.getPOISON(ArgVT);
3945 }
3946
3947 if (OutgoingArg->isRegister()) {
3948 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3949 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3950 report_fatal_error("failed to allocate implicit input argument");
3951 } else {
3952 unsigned SpecialArgOffset =
3953 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3954 SDValue ArgStore =
3955 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3956 MemOpChains.push_back(ArgStore);
3957 }
3958 }
3959
3960 // Pack workitem IDs into a single register, or pass them as-is if already
3961 // packed.
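// The packed layout used below: X in bits [9:0], Y in bits [19:10] and Z in
// bits [29:20], which is where the 10- and 20-bit shifts come from.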
3962
3963 auto [OutgoingArg, ArgRC, Ty] =
3964 CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3965 if (!OutgoingArg)
3966 std::tie(OutgoingArg, ArgRC, Ty) =
3967 CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3968 if (!OutgoingArg)
3969 std::tie(OutgoingArg, ArgRC, Ty) =
3970 CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3971 if (!OutgoingArg)
3972 return;
3973
3974 const ArgDescriptor *IncomingArgX = std::get<0>(
3975 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3976 const ArgDescriptor *IncomingArgY = std::get<0>(
3977 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3978 const ArgDescriptor *IncomingArgZ = std::get<0>(
3979 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3980
3981 SDValue InputReg;
3982 SDLoc SL;
3983
3984 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3985 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3986 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3987
3988 // If incoming ids are not packed we need to pack them.
3989 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
3990 NeedWorkItemIDX) {
3991 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3992 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3993 } else {
3994 InputReg = DAG.getConstant(0, DL, MVT::i32);
3995 }
3996 }
3997
3998 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
3999 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
4000 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
4001 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
4002 DAG.getShiftAmountConstant(10, MVT::i32, SL));
4003 InputReg = InputReg.getNode()
4004 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
4005 : Y;
4006 }
4007
4008 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4009 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4010 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
4011 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4012 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4013 InputReg = InputReg.getNode()
4014 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4015 : Z;
4016 }
4017
4018 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4019 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4020 // We're in a situation where the outgoing function requires the workitem
4021 // ID, but the calling function does not have it (e.g. a graphics function
4022 // calling a C calling convention function). This is illegal, but we need
4023 // to produce something.
4024 InputReg = DAG.getPOISON(MVT::i32);
4025 } else {
4026 // Workitem ids are already packed; any present incoming argument
4027 // will carry all required fields.
4028 ArgDescriptor IncomingArg =
4029 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4030 : IncomingArgY ? *IncomingArgY
4031 : *IncomingArgZ,
4032 ~0u);
4033 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4034 }
4035 }
4036
4037 if (OutgoingArg->isRegister()) {
4038 if (InputReg)
4039 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4040
4041 CCInfo.AllocateReg(OutgoingArg->getRegister());
4042 } else {
4043 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4044 if (InputReg) {
4045 SDValue ArgStore =
4046 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4047 MemOpChains.push_back(ArgStore);
4048 }
4049 }
4050}
4051
4052 bool SITargetLowering::isEligibleForTailCallOptimization(
4053 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4054 const SmallVectorImpl<ISD::OutputArg> &Outs,
4055 const SmallVectorImpl<SDValue> &OutVals,
4056 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4057 if (AMDGPU::isChainCC(CalleeCC))
4058 return true;
4059
4060 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
4061 return false;
4062
4063 // For a divergent call target, we need to do a waterfall loop over the
4064 // possible callees which precludes us from using a simple jump.
4065 if (Callee->isDivergent())
4066 return false;
4067
4068 MachineFunction &MF = DAG.getMachineFunction();
4069 const Function &CallerF = MF.getFunction();
4070 CallingConv::ID CallerCC = CallerF.getCallingConv();
4071 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
4072 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4073
4074 // Kernels aren't callable, and don't have a live in return address so it
4075 // doesn't make sense to do a tail call with entry functions.
4076 if (!CallerPreserved)
4077 return false;
4078
4079 bool CCMatch = CallerCC == CalleeCC;
4080
4081 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
4082 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4083 return true;
4084 return false;
4085 }
4086
4087 // TODO: Can we handle var args?
4088 if (IsVarArg)
4089 return false;
4090
4091 for (const Argument &Arg : CallerF.args()) {
4092 if (Arg.hasByValAttr())
4093 return false;
4094 }
4095
4096 LLVMContext &Ctx = *DAG.getContext();
4097
4098 // Check that the call results are passed in the same way.
4099 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4100 CCAssignFnForCall(CalleeCC, IsVarArg),
4101 CCAssignFnForCall(CallerCC, IsVarArg)))
4102 return false;
4103
4104 // The callee has to preserve all registers the caller needs to preserve.
4105 if (!CCMatch) {
4106 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4107 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4108 return false;
4109 }
4110
4111 // Nothing more to check if the callee is taking no arguments.
4112 if (Outs.empty())
4113 return true;
4114
4115 SmallVector<CCValAssign, 16> ArgLocs;
4116 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4117
4118 // FIXME: We are not allocating special input registers, so we will be
4119 // deciding based on incorrect register assignments.
4120 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4121
4122 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4123 // If the stack arguments for this call do not fit into our own save area then
4124 // the call cannot be made tail.
4125 // TODO: Is this really necessary?
4126 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4127 return false;
4128
4129 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4130 // FIXME: What about inreg arguments that end up passed in memory?
4131 if (!CCVA.isRegLoc())
4132 continue;
4133
4134 // If we are passing an argument in an SGPR, and the value is divergent,
4135 // this call requires a waterfall loop.
4136 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4137 LLVM_DEBUG(
4138 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4139 << printReg(CCVA.getLocReg(), TRI) << '\n');
4140 return false;
4141 }
4142 }
4143
4144 const MachineRegisterInfo &MRI = MF.getRegInfo();
4145 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4146}
4147
4148 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4149 if (!CI->isTailCall())
4150 return false;
4151
4152 const Function *ParentFn = CI->getFunction();
4153 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4154 return false;
4155 return true;
4156}
4157
4158namespace {
4159// Chain calls have special arguments that we need to handle. These are
4160 // tagging along at the end of the argument list(s), after the SGPR and VGPR
4161 // arguments (indices 0 and 1 respectively).
4162enum ChainCallArgIdx {
4163 Exec = 2,
4164 Flags,
4165 NumVGPRs,
4166 FallbackExec,
4167 FallbackCallee
4168};
4169} // anonymous namespace
4170
4171// The wave scratch offset register is used as the global base pointer.
4172 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4173 SmallVectorImpl<SDValue> &InVals) const {
4174 CallingConv::ID CallConv = CLI.CallConv;
4175 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4176
4177 SelectionDAG &DAG = CLI.DAG;
4178
4179 const SDLoc &DL = CLI.DL;
4180 SDValue Chain = CLI.Chain;
4181 SDValue Callee = CLI.Callee;
4182
4183 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4184 bool UsesDynamicVGPRs = false;
4185 if (IsChainCallConv) {
4186 // The last arguments should be the value that we need to put in EXEC,
4187 // followed by the flags and any other arguments with special meanings.
4188 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4189 // we don't treat them like the "real" arguments.
4190 auto RequestedExecIt =
4191 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4192 return Arg.OrigArgIndex == 2;
4193 });
4194 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4195
4196 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4197 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4198 CLI.OutVals.end());
4199 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4200
4201 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4202 "Haven't popped all the special args");
4203
4204 TargetLowering::ArgListEntry RequestedExecArg =
4205 CLI.Args[ChainCallArgIdx::Exec];
4206 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4207 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4208
4209 // Convert constants into TargetConstants, so they become immediate operands
4210 // instead of being selected into S_MOV.
4211 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4212 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4213 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4214 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4215 } else
4216 ChainCallSpecialArgs.push_back(Arg.Node);
4217 };
4218
4219 PushNodeOrTargetConstant(RequestedExecArg);
4220
4221 // Process any other special arguments depending on the value of the flags.
4222 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4223
4224 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4225 if (FlagsValue.isZero()) {
4226 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4227 return lowerUnhandledCall(CLI, InVals,
4228 "no additional args allowed if flags == 0");
4229 } else if (FlagsValue.isOneBitSet(0)) {
4230 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4231 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4232 }
4233
4234 if (!Subtarget->isWave32()) {
4235 return lowerUnhandledCall(
4236 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4237 }
4238
4239 UsesDynamicVGPRs = true;
4240 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4241 CLI.Args.end(), PushNodeOrTargetConstant);
4242 }
4243 }
4244
4245 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4246 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4247 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4248 bool &IsTailCall = CLI.IsTailCall;
4249 bool IsVarArg = CLI.IsVarArg;
4250 bool IsSibCall = false;
4251 MachineFunction &MF = DAG.getMachineFunction();
4252
4253 if (Callee.isUndef() || isNullConstant(Callee)) {
4254 if (!CLI.IsTailCall) {
4255 for (ISD::InputArg &Arg : CLI.Ins)
4256 InVals.push_back(DAG.getPOISON(Arg.VT));
4257 }
4258
4259 return Chain;
4260 }
4261
4262 if (IsVarArg) {
4263 return lowerUnhandledCall(CLI, InVals,
4264 "unsupported call to variadic function ");
4265 }
4266
4267 if (!CLI.CB)
4268 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4269
4270 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4271 return lowerUnhandledCall(CLI, InVals,
4272 "unsupported required tail call to function ");
4273 }
4274
4275 if (IsTailCall) {
4276 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4277 Outs, OutVals, Ins, DAG);
4278 if (!IsTailCall &&
4279 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4280 report_fatal_error("failed to perform tail call elimination on a call "
4281 "site marked musttail or on llvm.amdgcn.cs.chain");
4282 }
4283
4284 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4285
4286 // A sibling call is one where we're under the usual C ABI and not planning
4287 // to change that but can still do a tail call:
4288 if (!TailCallOpt && IsTailCall)
4289 IsSibCall = true;
4290
4291 if (IsTailCall)
4292 ++NumTailCalls;
4293 }
4294
4295 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4296 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4297 SmallVector<SDValue, 8> MemOpChains;
4298
4299 // Analyze operands of the call, assigning locations to each operand.
4300 SmallVector<CCValAssign, 16> ArgLocs;
4301 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4302 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4303
4304 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4305 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4306 // With a fixed ABI, allocate fixed registers before user arguments.
4307 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4308 }
4309
4310 // Mark the scratch resource descriptor as allocated so the CC analysis
4311 // does not assign user arguments to these registers, matching the callee.
4312 if (!Subtarget->hasFlatScratchEnabled())
4313 CCInfo.AllocateReg(Info->getScratchRSrcReg());
4314
4315 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4316
4317 // Get a count of how many bytes are to be pushed on the stack.
4318 unsigned NumBytes = CCInfo.getStackSize();
4319
4320 if (IsSibCall) {
4321 // Since we're not changing the ABI to make this a tail call, the memory
4322 // operands are already available in the caller's incoming argument space.
4323 NumBytes = 0;
4324 }
4325
4326 // FPDiff is the byte offset of the call's argument area from the callee's.
4327 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4328 // by this amount for a tail call. In a sibling call it must be 0 because the
4329 // caller will deallocate the entire stack and the callee still expects its
4330 // arguments to begin at SP+0. Completely unused for non-tail calls.
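// In short (summary, not in the source): sibling calls reuse the caller's
// incoming argument area, so FPDiff stays 0; only ABI-changing tail calls
// under GuaranteedTailCallOpt can end up with a nonzero FPDiff.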
4331 int32_t FPDiff = 0;
4332 MachineFrameInfo &MFI = MF.getFrameInfo();
4333 auto *TRI = Subtarget->getRegisterInfo();
4334
4335 // Adjust the stack pointer for the new arguments...
4336 // These operations are automatically eliminated by the prolog/epilog pass
4337 if (!IsSibCall)
4338 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4339
4340 if (!IsSibCall || IsChainCallConv) {
4341 if (!Subtarget->hasFlatScratchEnabled()) {
4342 SmallVector<SDValue, 4> CopyFromChains;
4343
4344 // In the HSA case, this should be an identity copy.
4345 SDValue ScratchRSrcReg =
4346 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4347 RegsToPass.emplace_back(IsChainCallConv
4348 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4349 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4350 ScratchRSrcReg);
4351 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4352 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4353 }
4354 }
4355
4356 const unsigned NumSpecialInputs = RegsToPass.size();
4357
4358 MVT PtrVT = MVT::i32;
4359
4360 // Walk the register/memloc assignments, inserting copies/loads.
4361 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4362 CCValAssign &VA = ArgLocs[i];
4363 SDValue Arg = OutVals[i];
4364
4365 // Promote the value if needed.
4366 switch (VA.getLocInfo()) {
4367 case CCValAssign::Full:
4368 break;
4369 case CCValAssign::BCvt:
4370 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4371 break;
4372 case CCValAssign::ZExt:
4373 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4374 break;
4375 case CCValAssign::SExt:
4376 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4377 break;
4378 case CCValAssign::AExt:
4379 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4380 break;
4381 case CCValAssign::FPExt:
4382 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4383 break;
4384 default:
4385 llvm_unreachable("Unknown loc info!");
4386 }
4387
4388 if (VA.isRegLoc()) {
4389 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4390 } else {
4391 assert(VA.isMemLoc());
4392
4393 SDValue DstAddr;
4394 MachinePointerInfo DstInfo;
4395
4396 unsigned LocMemOffset = VA.getLocMemOffset();
4397 int32_t Offset = LocMemOffset;
4398
4399 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4400 MaybeAlign Alignment;
4401
4402 if (IsTailCall) {
4403 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4404 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4405 : VA.getValVT().getStoreSize();
4406
4407 // FIXME: We can have better than the minimum byval required alignment.
4408 Alignment =
4409 Flags.isByVal()
4410 ? Flags.getNonZeroByValAlign()
4411 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4412
4413 Offset = Offset + FPDiff;
4414 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4415
4416 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4417 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4418
4419 // Make sure any stack arguments overlapping with where we're storing
4420 // are loaded before this eventual operation. Otherwise they'll be
4421 // clobbered.
4422
4423 // FIXME: Why is this really necessary? This seems to just result in a
4424 // lot of code to copy the stack and write them back to the same
4425 // locations, which are supposed to be immutable?
4426 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4427 } else {
4428 // Stores to the argument stack area are relative to the stack pointer.
4429 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4430 MVT::i32);
4431 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4432 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4433 Alignment =
4434 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4435 }
4436
4437 if (Outs[i].Flags.isByVal()) {
4438 SDValue SizeNode =
4439 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4440 SDValue Cpy =
4441 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4442 Outs[i].Flags.getNonZeroByValAlign(),
4443 /*isVol = */ false, /*AlwaysInline = */ true,
4444 /*CI=*/nullptr, std::nullopt, DstInfo,
4445 MachinePointerInfo());
4446
4447 MemOpChains.push_back(Cpy);
4448 } else {
4449 SDValue Store =
4450 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4451 MemOpChains.push_back(Store);
4452 }
4453 }
4454 }
4455
4456 if (!MemOpChains.empty())
4457 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4458
4459 SDValue ReadFirstLaneID =
4460 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4461
4462 SDValue TokenGlue;
4463 if (CLI.ConvergenceControlToken) {
4464 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4465 CLI.ConvergenceControlToken);
4466 }
4467
4468 // Build a sequence of copy-to-reg nodes chained together with token chain
4469 // and flag operands which copy the outgoing args into the appropriate regs.
4470 SDValue InGlue;
4471
4472 unsigned ArgIdx = 0;
4473 for (auto [Reg, Val] : RegsToPass) {
4474 if (ArgIdx++ >= NumSpecialInputs &&
4475 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4476 // For chain calls, the inreg arguments are required to be
4477 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4478 // they are uniform.
4479 //
4480 // For other calls, if an inreg argument is known to be uniform,
4481 // speculatively insert a readfirstlane in case it is in a VGPR.
4482 //
4483 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4484 // value, so let that continue to produce invalid code.
4485
4486 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4487 if (TokenGlue)
4488 ReadfirstlaneArgs.push_back(TokenGlue);
4489 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4490 ReadfirstlaneArgs);
4491 }
4492
4493 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4494 InGlue = Chain.getValue(1);
4495 }
4496
4497 // We don't usually want to end the call-sequence here because we would tidy
4498 // the frame up *after* the call, however in the ABI-changing tail-call case
4499 // we've carefully laid out the parameters so that when sp is reset they'll be
4500 // in the correct location.
4501 if (IsTailCall && !IsSibCall) {
4502 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4503 InGlue = Chain.getValue(1);
4504 }
4505
4506 std::vector<SDValue> Ops({Chain});
4507
4508 // Add a redundant copy of the callee global which will not be legalized, as
4509 // we need direct access to the callee later.
4511 const GlobalValue *GV = GSD->getGlobal();
4512 Ops.push_back(Callee);
4513 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4514 } else {
4515 if (IsTailCall) {
4516 // isEligibleForTailCallOptimization considered whether the call target is
4517 // divergent, but we may still end up with a uniform value in a VGPR.
4518 // Insert a readfirstlane just in case.
4519 SDValue ReadFirstLaneID =
4520 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4521
4522 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4523 if (TokenGlue)
4524 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4525 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4526 ReadfirstlaneArgs);
4527 }
4528
4529 Ops.push_back(Callee);
4530 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4531 }
4532
4533 if (IsTailCall) {
4534 // Each tail call may have to adjust the stack by a different amount, so
4535 // this information must travel along with the operation for eventual
4536 // consumption by emitEpilogue.
4537 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4538 }
4539
4540 if (IsChainCallConv)
4541 llvm::append_range(Ops, ChainCallSpecialArgs);
4542
4543 // Add argument registers to the end of the list so that they are known live
4544 // into the call.
4545 for (auto &[Reg, Val] : RegsToPass)
4546 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4547
4548 // Add a register mask operand representing the call-preserved registers.
4549 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4550 assert(Mask && "Missing call preserved mask for calling convention");
4551 Ops.push_back(DAG.getRegisterMask(Mask));
4552
4553 if (SDValue Token = CLI.ConvergenceControlToken) {
4554 SmallVector<SDValue, 2> GlueOps;
4555 GlueOps.push_back(Token);
4556 if (InGlue)
4557 GlueOps.push_back(InGlue);
4558
4559 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4560 MVT::Glue, GlueOps),
4561 0);
4562 }
4563
4564 if (InGlue)
4565 Ops.push_back(InGlue);
4566
4567 // If we're doing a tail call, use a TC_RETURN here rather than an
4568 // actual call instruction.
4569 if (IsTailCall) {
4570 MFI.setHasTailCall();
4571 unsigned OPC = AMDGPUISD::TC_RETURN;
4572 switch (CallConv) {
4573 case CallingConv::AMDGPU_Gfx:
4574 OPC = AMDGPUISD::TC_RETURN_GFX;
4575 break;
4576 case CallingConv::AMDGPU_CS_Chain:
4577 case CallingConv::AMDGPU_CS_ChainPreserve:
4578 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4579 : AMDGPUISD::TC_RETURN_CHAIN;
4580 break;
4581 }
4582
4583 // If the caller is a whole wave function, we need to use a special opcode
4584 // so we can patch up EXEC.
4585 if (Info->isWholeWaveFunction())
4586 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4587
4588 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4589 }
4590
4591 // Returns a chain and a flag for retval copy to use.
4592 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4593 Chain = Call.getValue(0);
4594 InGlue = Call.getValue(1);
4595
4596 uint64_t CalleePopBytes = NumBytes;
4597 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4598 if (!Ins.empty())
4599 InGlue = Chain.getValue(1);
4600
4601 // Handle result values, copying them out of physregs into vregs that we
4602 // return.
4603 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4604 InVals, /*IsThisReturn=*/false, SDValue());
4605}
4606
4607// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4608// except for:
4609 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4610 // 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
4611 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
4612 SelectionDAG &DAG) const {
4613 const MachineFunction &MF = DAG.getMachineFunction();
4614 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4615
4616 SDLoc dl(Op);
4617 EVT VT = Op.getValueType();
4618 SDValue Chain = Op.getOperand(0);
4619 Register SPReg = Info->getStackPtrOffsetReg();
4620
4621 // Chain the dynamic stack allocation so that it doesn't modify the stack
4622 // pointer when other instructions are using the stack.
4623 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4624
4625 SDValue Size = Op.getOperand(1);
4626 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4627 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4628
4629 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4630 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4631 "Stack grows upwards for AMDGPU");
4632
4633 Chain = BaseAddr.getValue(1);
4634 Align StackAlign = TFL->getStackAlign();
4635 if (Alignment > StackAlign) {
4636 uint64_t ScaledAlignment = Alignment.value()
4637 << Subtarget->getWavefrontSizeLog2();
4638 uint64_t StackAlignMask = ScaledAlignment - 1;
4639 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4640 DAG.getConstant(StackAlignMask, dl, VT));
4641 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4642 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4643 }
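// Worked example (illustrative): on wave64 a requested 32-byte alignment
// gives ScaledAlignment = 32 << 6 = 2048, so the SP is rounded up to a
// 2048-byte boundary of the wave-scaled scratch.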
4644
4645 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4646 SDValue NewSP;
4647 if (isa<ConstantSDNode>(Size)) {
4648 // For constant sized alloca, scale alloca size by wave-size
4649 SDValue ScaledSize = DAG.getNode(
4650 ISD::SHL, dl, VT, Size,
4651 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4652 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
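// E.g. (illustrative): a constant 8-byte alloca on wave64 advances the SP by
// 8 << 6 = 512 bytes, since every lane gets its own copy.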
4653 } else {
4654 // For dynamic sized alloca, perform wave-wide reduction to get max of
4655 // alloca size(divergent) and then scale it by wave-size
4656 SDValue WaveReduction =
4657 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4658 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4659 Size, DAG.getTargetConstant(0, dl, MVT::i32));
4660 SDValue ScaledSize = DAG.getNode(
4661 ISD::SHL, dl, VT, Size,
4662 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4663 NewSP =
4664 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4665 SDValue ReadFirstLaneID =
4666 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4667 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4668 NewSP);
4669 }
4670
4671 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4672 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4673
4674 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4675}
4676
4677 SDValue SITargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4678 if (Op.getValueType() != MVT::i32)
4679 return Op; // Defer to cannot select error.
4680
4681 Register SP = getStackPointerRegisterToSaveRestore();
4682 SDLoc SL(Op);
4683
4684 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4685
4686 // Convert from wave uniform to swizzled vector address. This should protect
4687 // from any edge cases where the stacksave result isn't directly used with
4688 // stackrestore.
4689 SDValue VectorAddress =
4690 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4691 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4692}
4693
4694 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4695 SelectionDAG &DAG) const {
4696 SDLoc SL(Op);
4697 assert(Op.getValueType() == MVT::i32);
4698
4699 uint32_t BothRoundHwReg =
4700 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4701 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4702
4703 SDValue IntrinID =
4704 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4705 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4706 Op.getOperand(0), IntrinID, GetRoundBothImm);
4707
4708 // There are two rounding modes, one for f32 and one for f64/f16. We only
4709 // report in the standard value range if both are the same.
4710 //
4711 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4712 // ties away from zero is not supported, and the other values are rotated by
4713 // 1.
4714 //
4715 // If the two rounding modes are not the same, report a target defined value.
4716
4717 // Mode register rounding mode fields:
4718 //
4719 // [1:0] Single-precision round mode.
4720 // [3:2] Double/Half-precision round mode.
4721 //
4722 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4723 //
4724 // Hardware Spec
4725 // Toward-0 3 0
4726 // Nearest Even 0 1
4727 // +Inf 1 2
4728 // -Inf 2 3
4729 // NearestAway0 N/A 4
4730 //
4731 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4732 // table we can index by the raw hardware mode.
4733 //
4734 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
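// E.g. (illustrative): a raw MODE.fp_round of 0x4 selects bits [19:16] of
// the table, i.e. (FltRoundConversionTable >> 16) & 0xf.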
4735
4736 SDValue BitTable =
4737 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4738
4739 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4740 SDValue RoundModeTimesNumBits =
4741 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4742
4743 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4744 // knew only one mode was demanded.
4745 SDValue TableValue =
4746 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4747 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4748
4749 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4750 SDValue TableEntry =
4751 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4752
4753 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4754 // if it's an extended value.
4755 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4756 SDValue IsStandardValue =
4757 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4758 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4759 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4760 TableEntry, EnumOffset);
4761
4762 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4763}
4764
4765 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4766 SelectionDAG &DAG) const {
4767 SDLoc SL(Op);
4768
4769 SDValue NewMode = Op.getOperand(1);
4770 assert(NewMode.getValueType() == MVT::i32);
4771
4772 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4773 // hardware MODE.fp_round values.
4774 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4775 uint32_t ClampedVal = std::min(
4776 static_cast<uint32_t>(ConstMode->getZExtValue()),
4777 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4778 NewMode = DAG.getConstant(
4779 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4780 } else {
4781 // If we know the input can only be one of the supported standard modes in
4782 // the range 0-3, we can use a simplified mapping to hardware values.
4783 KnownBits KB = DAG.computeKnownBits(NewMode);
4784 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
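// At least 30 known leading zero bits bound the 32-bit value to 2 bits,
// i.e. to the standard modes 0-3.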
4785 // The supported standard values are 0-3. The extended values start at 8. We
4786 // need to offset by 4 if the value is in the extended range.
4787
4788 if (UseReducedTable) {
4789 // Truncate to the low 32-bits.
4790 SDValue BitTable = DAG.getConstant(
4791 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4792
4793 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4794 SDValue RoundModeTimesNumBits =
4795 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4796
4797 NewMode =
4798 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4799
4800 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4801 // the table extracted bits into inline immediates.
4802 } else {
4803 // table_index = umin(value, value - 4)
4804 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4805 SDValue BitTable =
4806 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4807
4808 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4809 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4810 SDValue IndexVal =
4811 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4812
4813 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4814 SDValue RoundModeTimesNumBits =
4815 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4816
4817 SDValue TableValue =
4818 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4819 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4820
4821 // No need to mask out the high bits since the setreg will ignore them
4822 // anyway.
4823 NewMode = TruncTable;
4824 }
4825
4826 // Insert a readfirstlane in case the value is a VGPR. We could do this
4827 // earlier and keep more operations scalar, but that interferes with
4828 // combining the source.
4829 SDValue ReadFirstLaneID =
4830 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4831 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4832 ReadFirstLaneID, NewMode);
4833 }
4834
4835 // N.B. The setreg will be later folded into s_round_mode on supported
4836 // targets.
4837 SDValue IntrinID =
4838 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4839 uint32_t BothRoundHwReg =
4840 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4841 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4842
4843 SDValue SetReg =
4844 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4845 IntrinID, RoundBothImm, NewMode);
4846
4847 return SetReg;
4848}
4849
4850 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4851 if (Op->isDivergent() &&
4852 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4853 // Cannot do I$ prefetch with divergent pointer.
4854 return SDValue();
4855
4856 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4857 case AMDGPUAS::FLAT_ADDRESS:
4858 case AMDGPUAS::GLOBAL_ADDRESS:
4859 case AMDGPUAS::CONSTANT_ADDRESS:
4860 break;
4861 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4862 if (Subtarget->hasSafeSmemPrefetch())
4863 break;
4864 [[fallthrough]];
4865 default:
4866 return SDValue();
4867 }
4868
4869 // I$ prefetch
4870 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4871 return SDValue();
4872
4873 return Op;
4874}
4875
4876// Work around DAG legality rules only based on the result type.
4877 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4878 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4879 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4880 EVT SrcVT = Src.getValueType();
4881
4882 if (SrcVT.getScalarType() != MVT::bf16)
4883 return Op;
4884
4885 SDLoc SL(Op);
4886 SDValue BitCast =
4887 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4888
4889 EVT DstVT = Op.getValueType();
4890 if (IsStrict)
4891 llvm_unreachable("Need STRICT_BF16_TO_FP");
4892
4893 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4894}
4895
4896 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4897 SDLoc SL(Op);
4898 if (Op.getValueType() != MVT::i64)
4899 return Op;
4900
4901 uint32_t ModeHwReg =
4903 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4904 uint32_t TrapHwReg =
4906 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4907
4908 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4909 SDValue IntrinID =
4910 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4911 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4912 Op.getOperand(0), IntrinID, ModeHwRegImm);
4913 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4914 Op.getOperand(0), IntrinID, TrapHwRegImm);
4915 SDValue TokenReg =
4916 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4917 GetTrapReg.getValue(1));
4918
4919 SDValue CvtPtr =
4920 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4921 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4922
4923 return DAG.getMergeValues({Result, TokenReg}, SL);
4924}
4925
4926 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4927 SDLoc SL(Op);
4928 if (Op.getOperand(1).getValueType() != MVT::i64)
4929 return Op;
4930
4931 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4932 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4933 DAG.getConstant(0, SL, MVT::i32));
4934 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4935 DAG.getConstant(1, SL, MVT::i32));
4936
4937 SDValue ReadFirstLaneID =
4938 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4939 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4940 ReadFirstLaneID, NewModeReg);
4941 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4942 ReadFirstLaneID, NewTrapReg);
4943
4944 unsigned ModeHwReg =
4946 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4947 unsigned TrapHwReg =
4949 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4950
4951 SDValue IntrinID =
4952 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4953 SDValue SetModeReg =
4954 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4955 IntrinID, ModeHwRegImm, NewModeReg);
4956 SDValue SetTrapReg =
4957 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4958 IntrinID, TrapHwRegImm, NewTrapReg);
4959 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4960}
4961
4962 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4963 const MachineFunction &MF) const {
4964 const Function &Fn = MF.getFunction();
4966 Register Reg = StringSwitch<Register>(RegName)
4965
4967 .Case("m0", AMDGPU::M0)
4968 .Case("exec", AMDGPU::EXEC)
4969 .Case("exec_lo", AMDGPU::EXEC_LO)
4970 .Case("exec_hi", AMDGPU::EXEC_HI)
4971 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4972 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4973 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4974 .Default(Register());
4975 if (!Reg)
4976 return Reg;
4977
4978 if (!Subtarget->hasFlatScrRegister() &&
4979 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4980 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4981 "\" for subtarget."));
4982 }
4983
4984 switch (Reg) {
4985 case AMDGPU::M0:
4986 case AMDGPU::EXEC_LO:
4987 case AMDGPU::EXEC_HI:
4988 case AMDGPU::FLAT_SCR_LO:
4989 case AMDGPU::FLAT_SCR_HI:
4990 if (VT.getSizeInBits() == 32)
4991 return Reg;
4992 break;
4993 case AMDGPU::EXEC:
4994 case AMDGPU::FLAT_SCR:
4995 if (VT.getSizeInBits() == 64)
4996 return Reg;
4997 break;
4998 default:
4999 llvm_unreachable("missing register type checking");
5000 }
5001
5002 report_fatal_error(
5003 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5004}
5005
5006// If kill is not the last instruction, split the block so kill is always a
5007// proper terminator.
5008 MachineBasicBlock *
5009 SITargetLowering::splitKillBlock(MachineInstr &MI,
5010 MachineBasicBlock *BB) const {
5011 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
5012 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5013 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5014 return SplitBB;
5015}
5016
5017 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
5018// \p MI will be the only instruction in the loop body block. Otherwise, it will
5019// be the first instruction in the remainder block.
5020//
5021/// \returns { LoopBody, Remainder }
5022static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5023 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
5024 MachineFunction *MF = MBB.getParent();
5025 MachineBasicBlock::iterator I(&MI);
5026
5027 // To insert the loop we need to split the block. Move everything after this
5028 // point to a new block, and insert a new empty block between the two.
5029 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
5030 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5031 MachineFunction::iterator MBBI(MBB);
5032 ++MBBI;
5033
5034 MF->insert(MBBI, LoopBB);
5035 MF->insert(MBBI, RemainderBB);
5036
5037 LoopBB->addSuccessor(LoopBB);
5038 LoopBB->addSuccessor(RemainderBB);
5039
5040 // Move the rest of the block into a new block.
5041 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5042
5043 if (InstInLoop) {
5044 auto Next = std::next(I);
5045
5046 // Move instruction to loop body.
5047 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5048
5049 // Move the rest of the block.
5050 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5051 } else {
5052 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5053 }
5054
5055 MBB.addSuccessor(LoopBB);
5056
5057 return std::pair(LoopBB, RemainderBB);
5058}
5059
5060/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5061 static void bundleInstWithWaitcnt(MachineInstr &MI) {
5062 MachineBasicBlock *MBB = MI.getParent();
5064 auto I = MI.getIterator();
5065 auto E = std::next(I);
5066
5067 // clang-format off
5068 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5069 .addImm(0);
5070 // clang-format on
5071
5072 MIBundleBuilder Bundler(*MBB, I, E);
5073 finalizeBundle(*MBB, Bundler.begin());
5074}
5075
5078 MachineBasicBlock *BB) const {
5079 const DebugLoc &DL = MI.getDebugLoc();
5080
5082
5084
5085 // Apparently kill flags are only valid if the def is in the same block?
5086 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5087 Src->setIsKill(false);
5088
5089 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5090
5091 MachineBasicBlock::iterator I = LoopBB->end();
5092
5093 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5095
5096 // Clear TRAP_STS.MEM_VIOL
5097 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5098 .addImm(0)
5099 .addImm(EncodedReg);
5100
5101 bundleInstWithWaitcnt(MI);
5102
5103 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5104
5105 // Load and check TRAP_STS.MEM_VIOL
5106 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5107 .addImm(EncodedReg);
5108
5109 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5110 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5111 .addReg(Reg, RegState::Kill)
5112 .addImm(0);
5113 // clang-format off
5114 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5115 .addMBB(LoopBB);
5116 // clang-format on
5117
5118 return RemainderBB;
5119}
5120
5121// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5122// wavefront. If the value is uniform and just happens to be in a VGPR, this
5123// will only do one iteration. In the worst case, this will loop 64 times.
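// As a sketch (illustrative, not the emitted MIR), the loop below is:
//   loop:
//     CurrentIdx = s_readfirstlane(Idx)
//     Cond = v_cmp_eq(Idx, CurrentIdx) ; lanes wanting this index
//     NewExec = s_and_saveexec(Cond)   ; run just those lanes
//     M0 (or SGPRIdxReg) = CurrentIdx + Offset
//     EXEC ^= NewExec                  ; retire the served lanes
//     s_cbranch_execnz loop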
5124//
5125// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5126 static MachineBasicBlock::iterator
5127 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5128 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5129 const DebugLoc &DL, const MachineOperand &Idx,
5130 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5131 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5132 Register &SGPRIdxReg) {
5133
5134 MachineFunction *MF = OrigBB.getParent();
5135 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5136 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5137 const AMDGPU::LaneMaskConstants &LMC =
5138 AMDGPU::LaneMaskConstants::get(ST);
5139
5140 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5141 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5142 Register NewExec = MRI.createVirtualRegister(BoolRC);
5143 Register CurrentIdxReg =
5144 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5145 Register CondReg = MRI.createVirtualRegister(BoolRC);
5146
5147 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5148 .addReg(InitReg)
5149 .addMBB(&OrigBB)
5150 .addReg(ResultReg)
5151 .addMBB(&LoopBB);
5152
5153 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5154 .addReg(InitSaveExecReg)
5155 .addMBB(&OrigBB)
5156 .addReg(NewExec)
5157 .addMBB(&LoopBB);
5158
5159 // Read the next variant <- also loop target.
5160 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5161 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5162
5163 // Compare the just read M0 value to all possible Idx values.
5164 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5165 .addReg(CurrentIdxReg)
5166 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5167
5168 // Update EXEC, save the original EXEC value to VCC.
5169 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5170 .addReg(CondReg, RegState::Kill);
5171
5172 MRI.setSimpleHint(NewExec, CondReg);
5173
5174 if (UseGPRIdxMode) {
5175 if (Offset == 0) {
5176 SGPRIdxReg = CurrentIdxReg;
5177 } else {
5178 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5179 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5180 .addReg(CurrentIdxReg, RegState::Kill)
5181 .addImm(Offset);
5182 }
5183 } else {
5184 // Move the index into M0.
5185 if (Offset == 0) {
5186 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5187 .addReg(CurrentIdxReg, RegState::Kill);
5188 } else {
5189 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5190 .addReg(CurrentIdxReg, RegState::Kill)
5191 .addImm(Offset);
5192 }
5193 }
5194
5195 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5196 MachineInstr *InsertPt =
5197 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5198 .addReg(LMC.ExecReg)
5199 .addReg(NewExec);
5200
5201 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5202 // s_cbranch_scc0?
5203
5204 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5205 // clang-format off
5206 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5207 .addMBB(&LoopBB);
5208 // clang-format on
5209
5210 return InsertPt->getIterator();
5211}
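// Illustratively, the loop built above is the classic "waterfall" idiom;
// for wave64 (registers are placeholders, not verbatim output):
//
//   loop:
//     v_readfirstlane_b32 s0, v_idx       ; index from one active lane
//     v_cmp_eq_u32        vcc, s0, v_idx  ; all lanes sharing that index
//     s_and_saveexec_b64  s[2:3], vcc     ; run only those lanes this trip
//     s_mov_b32           m0, s0          ; or s_add_i32 m0, s0, offset
//     ... indexed use inserted at the returned iterator ...
//     s_xor_b64           exec, exec, s[2:3] ; retire the handled lanes
//     s_cbranch_execnz    loop            ; unique indices remain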
5212
5213// This has slightly sub-optimal regalloc when the source vector is killed by
5214// the read. The register allocator does not understand that the kill is
5215// per-workitem, so the vector is kept alive for the whole loop and we end up
5216// not re-using a subregister from it, using 1 more VGPR than necessary. That
5217// VGPR was saved when this was expanded after register allocation.
5218 static MachineBasicBlock::iterator
5219 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5220 unsigned InitResultReg, unsigned PhiReg, int Offset,
5221 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5222 MachineFunction *MF = MBB.getParent();
5223 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5224 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5225 MachineRegisterInfo &MRI = MF->getRegInfo();
5226 const DebugLoc &DL = MI.getDebugLoc();
5227 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5228
5229 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5230 Register DstReg = MI.getOperand(0).getReg();
5231 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5232 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5233 MachineBasicBlock::iterator I(&MI);
5234
5235 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5236
5237 // Save the EXEC mask
5238 // clang-format off
5239 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5240 .addReg(LMC.ExecReg);
5241 // clang-format on
5242
5243 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5244
5245 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5246
5247 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5248 InitResultReg, DstReg, PhiReg, TmpExec,
5249 Offset, UseGPRIdxMode, SGPRIdxReg);
5250
5251 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5252 MachineFunction::iterator MBBI(LoopBB);
5253 ++MBBI;
5254 MF->insert(MBBI, LandingPad);
5255 LoopBB->removeSuccessor(RemainderBB);
5256 LandingPad->addSuccessor(RemainderBB);
5257 LoopBB->addSuccessor(LandingPad);
5258 MachineBasicBlock::iterator First = LandingPad->begin();
5259 // clang-format off
5260 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5261 .addReg(SaveExec);
5262 // clang-format on
5263
5264 return InsPt;
5265}
5266
5267// Returns subreg index, offset
5268static std::pair<unsigned, int>
5269 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5270 const TargetRegisterClass *SuperRC, unsigned VecReg,
5271 int Offset) {
5272 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5273
5274 // Skip out of bounds offsets, or else we would end up using an undefined
5275 // register.
5276 if (Offset >= NumElts || Offset < 0)
5277 return std::pair(AMDGPU::sub0, Offset);
5278
5279 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5280}
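// Worked example (assuming a 128-bit, i.e. 4 x 32-bit, register class):
// Offset 2 yields {sub2, 0}, folding the constant part of the index into the
// subregister; Offset 5 is out of bounds, so it yields {sub0, 5} and the
// offset stays as an add on the dynamic index.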
5281
5282 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5283 MachineRegisterInfo &MRI, MachineInstr &MI,
5284 int Offset) {
5285 MachineBasicBlock *MBB = MI.getParent();
5286 const DebugLoc &DL = MI.getDebugLoc();
5287 MachineBasicBlock::iterator I(&MI);
5288
5289 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5290
5291 assert(Idx->getReg() != AMDGPU::NoRegister);
5292
5293 if (Offset == 0) {
5294 // clang-format off
5295 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5296 .add(*Idx);
5297 // clang-format on
5298 } else {
5299 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5300 .add(*Idx)
5301 .addImm(Offset);
5302 }
5303}
5304
5305 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5306 MachineRegisterInfo &MRI, MachineInstr &MI,
5307 int Offset) {
5308 MachineBasicBlock *MBB = MI.getParent();
5309 const DebugLoc &DL = MI.getDebugLoc();
5310 MachineBasicBlock::iterator I(&MI);
5311
5312 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5313
5314 if (Offset == 0)
5315 return Idx->getReg();
5316
5317 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5318 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5319 .add(*Idx)
5320 .addImm(Offset);
5321 return Tmp;
5322}
5323
5324 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5325 MachineBasicBlock &MBB,
5326 const GCNSubtarget &ST) {
5327 const SIInstrInfo *TII = ST.getInstrInfo();
5328 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5329 MachineFunction *MF = MBB.getParent();
5330 MachineRegisterInfo &MRI = MF->getRegInfo();
5331
5332 Register Dst = MI.getOperand(0).getReg();
5333 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5334 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5335 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5336
5337 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5338 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5339
5340 unsigned SubReg;
5341 std::tie(SubReg, Offset) =
5342 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5343
5344 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5345
5346 // Check for a SGPR index.
5347 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 MachineBasicBlock::iterator I(&MI);
5349 const DebugLoc &DL = MI.getDebugLoc();
5350
5351 if (UseGPRIdxMode) {
5352 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5353 // to avoid interfering with other uses, so probably requires a new
5354 // optimization pass.
5355 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5356
5357 const MCInstrDesc &GPRIDXDesc =
5358 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5359 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5360 .addReg(SrcReg)
5361 .addReg(Idx)
5362 .addImm(SubReg);
5363 } else {
5364 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5365
5366 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5367 .addReg(SrcReg, {}, SubReg)
5368 .addReg(SrcReg, RegState::Implicit);
5369 }
5370
5371 MI.eraseFromParent();
5372
5373 return &MBB;
5374 }
5375
5376 // Control flow needs to be inserted if indexing with a VGPR.
5377 const DebugLoc &DL = MI.getDebugLoc();
5378 MachineBasicBlock::iterator I(&MI);
5379
5380 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5381 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5382
5383 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5384
5385 Register SGPRIdxReg;
5386 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5387 UseGPRIdxMode, SGPRIdxReg);
5388
5389 MachineBasicBlock *LoopBB = InsPt->getParent();
5390
5391 if (UseGPRIdxMode) {
5392 const MCInstrDesc &GPRIDXDesc =
5393 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5394
5395 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5396 .addReg(SrcReg)
5397 .addReg(SGPRIdxReg)
5398 .addImm(SubReg);
5399 } else {
5400 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5401 .addReg(SrcReg, {}, SubReg)
5402 .addReg(SrcReg, RegState::Implicit);
5403 }
5404
5405 MI.eraseFromParent();
5406
5407 return LoopBB;
5408}
5409
5410 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5411 MachineBasicBlock &MBB,
5412 const GCNSubtarget &ST) {
5413 const SIInstrInfo *TII = ST.getInstrInfo();
5414 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5415 MachineFunction *MF = MBB.getParent();
5416 MachineRegisterInfo &MRI = MF->getRegInfo();
5417
5418 Register Dst = MI.getOperand(0).getReg();
5419 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5420 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5421 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5422 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5423 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5424 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5425
5426 // This can be an immediate, but will be folded later.
5427 assert(Val->getReg());
5428
5429 unsigned SubReg;
5430 std::tie(SubReg, Offset) =
5431 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5432 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5433
5434 if (Idx->getReg() == AMDGPU::NoRegister) {
5435 MachineBasicBlock::iterator I(&MI);
5436 const DebugLoc &DL = MI.getDebugLoc();
5437
5438 assert(Offset == 0);
5439
5440 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5441 .add(*SrcVec)
5442 .add(*Val)
5443 .addImm(SubReg);
5444
5445 MI.eraseFromParent();
5446 return &MBB;
5447 }
5448
5449 // Check for a SGPR index.
5450 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5451 MachineBasicBlock::iterator I(&MI);
5452 const DebugLoc &DL = MI.getDebugLoc();
5453
5454 if (UseGPRIdxMode) {
5455 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5456
5457 const MCInstrDesc &GPRIDXDesc =
5458 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5459 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5460 .addReg(SrcVec->getReg())
5461 .add(*Val)
5462 .addReg(Idx)
5463 .addImm(SubReg);
5464 } else {
5466
5467 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5468 TRI.getRegSizeInBits(*VecRC), 32, false);
5469 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5470 .addReg(SrcVec->getReg())
5471 .add(*Val)
5472 .addImm(SubReg);
5473 }
5474 MI.eraseFromParent();
5475 return &MBB;
5476 }
5477
5478 // Control flow needs to be inserted if indexing with a VGPR.
5479 if (Val->isReg())
5480 MRI.clearKillFlags(Val->getReg());
5481
5482 const DebugLoc &DL = MI.getDebugLoc();
5483
5484 Register PhiReg = MRI.createVirtualRegister(VecRC);
5485
5486 Register SGPRIdxReg;
5487 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5488 UseGPRIdxMode, SGPRIdxReg);
5489 MachineBasicBlock *LoopBB = InsPt->getParent();
5490
5491 if (UseGPRIdxMode) {
5492 const MCInstrDesc &GPRIDXDesc =
5493 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5494
5495 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5496 .addReg(PhiReg)
5497 .add(*Val)
5498 .addReg(SGPRIdxReg)
5499 .addImm(SubReg);
5500 } else {
5501 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5502 TRI.getRegSizeInBits(*VecRC), 32, false);
5503 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5504 .addReg(PhiReg)
5505 .add(*Val)
5506 .addImm(SubReg);
5507 }
5508
5509 MI.eraseFromParent();
5510 return LoopBB;
5511}
5512
5513 static MachineBasicBlock *expand64BitScalarArithmetic(MachineInstr &MI,
5514 MachineBasicBlock *BB) {
5515 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5516 // For GFX12, we emit s_add_u64 and s_sub_u64.
5517 MachineFunction *MF = BB->getParent();
5518 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5519 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5520 MachineRegisterInfo &MRI = MF->getRegInfo();
5521 const DebugLoc &DL = MI.getDebugLoc();
5522 MachineOperand &Dest = MI.getOperand(0);
5523 MachineOperand &Src0 = MI.getOperand(1);
5524 MachineOperand &Src1 = MI.getOperand(2);
5525 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5526 if (ST.hasScalarAddSub64()) {
5527 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5528 // clang-format off
5529 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5530 .add(Src0)
5531 .add(Src1);
5532 // clang-format on
5533 } else {
5534 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5535 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5536
5537 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5538 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5539
5540 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5541 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5542 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5543 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5544
5545 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5546 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5547 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5548 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5549
5550 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5551 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5552 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5553 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5554 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5555 .addReg(DestSub0)
5556 .addImm(AMDGPU::sub0)
5557 .addReg(DestSub1)
5558 .addImm(AMDGPU::sub1);
5559 }
5560 MI.eraseFromParent();
5561 return BB;
5562}
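// Schematically, the no-s_add_u64 path above emits the usual carry chain
// (add case shown; placeholders, not verbatim output):
//
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0  ; SCC := carry-out
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1  ; SCC consumed as carry-in
//   dst = REG_SEQUENCE dst.sub0, sub0, dst.sub1, sub1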
5563
5564 static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB) {
5565 MachineFunction *MF = BB->getParent();
5566 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5567 const SIInstrInfo *TII = ST.getInstrInfo();
5568 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5569 MachineRegisterInfo &MRI = MF->getRegInfo();
5570 const DebugLoc &DL = MI.getDebugLoc();
5571 Register Dst = MI.getOperand(0).getReg();
5572 const MachineOperand &Src0 = MI.getOperand(1);
5573 const MachineOperand &Src1 = MI.getOperand(2);
5574 Register SrcCond = MI.getOperand(3).getReg();
5575
5576 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5577 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5578 const TargetRegisterClass *CondRC = TRI->getWaveMaskRegClass();
5579 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5580
5581 int Src0Idx =
5582 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
5583 int Src1Idx =
5584 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
5585 const TargetRegisterClass *Src0RC =
5586 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src0Idx));
5587 const TargetRegisterClass *Src1RC =
5588 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src1Idx));
5589
5590 const TargetRegisterClass *Src0SubRC =
5591 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5592 const TargetRegisterClass *Src1SubRC =
5593 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5594
5595 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5596 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5597 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5598 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5599
5600 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5601 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5602 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5603 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5604
5605 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5606 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5607 .addImm(0)
5608 .add(Src0Sub0)
5609 .addImm(0)
5610 .add(Src1Sub0)
5611 .addReg(SrcCondCopy);
5612
5613 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5614 .addImm(0)
5615 .add(Src0Sub1)
5616 .addImm(0)
5617 .add(Src1Sub1)
5618 .addReg(SrcCondCopy);
5619
5620 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5621 .addReg(DstLo)
5622 .addImm(AMDGPU::sub0)
5623 .addReg(DstHi)
5624 .addImm(AMDGPU::sub1);
5625 MI.eraseFromParent();
5626}
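// In short: a 64-bit select on a wave-mask condition is split into two
// v_cndmask_b32_e64 ops (low and high halves keyed off the same condition
// copy) whose results are recombined with REG_SEQUENCE.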
5627
5628 static uint64_t getIdentityValueForWaveReduction(unsigned Opc) {
5629 switch (Opc) {
5630 case AMDGPU::S_MIN_U32:
5631 return std::numeric_limits<uint32_t>::max();
5632 case AMDGPU::S_MIN_I32:
5633 return std::numeric_limits<int32_t>::max();
5634 case AMDGPU::S_MAX_U32:
5635 return std::numeric_limits<uint32_t>::min();
5636 case AMDGPU::S_MAX_I32:
5637 return std::numeric_limits<int32_t>::min();
5638 case AMDGPU::V_ADD_F32_e64: // -0.0
5639 return 0x80000000;
5640 case AMDGPU::V_SUB_F32_e64: // +0.0
5641 return 0x0;
5642 case AMDGPU::S_ADD_I32:
5643 case AMDGPU::S_SUB_I32:
5644 case AMDGPU::S_OR_B32:
5645 case AMDGPU::S_XOR_B32:
5646 return std::numeric_limits<uint32_t>::min();
5647 case AMDGPU::S_AND_B32:
5648 return std::numeric_limits<uint32_t>::max();
5649 case AMDGPU::V_MIN_F32_e64:
5650 case AMDGPU::V_MAX_F32_e64:
5651 return 0x7fc00000; // qNAN
5652 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5653 return std::numeric_limits<uint64_t>::max();
5654 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5655 return std::numeric_limits<int64_t>::max();
5656 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5657 return std::numeric_limits<uint64_t>::min();
5658 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5659 return std::numeric_limits<int64_t>::min();
5660 case AMDGPU::V_MIN_F64_e64:
5661 case AMDGPU::V_MAX_F64_e64:
5662 case AMDGPU::V_MIN_NUM_F64_e64:
5663 case AMDGPU::V_MAX_NUM_F64_e64:
5664 return 0x7FF8000000000000; // qNAN
5665 case AMDGPU::S_ADD_U64_PSEUDO:
5666 case AMDGPU::S_SUB_U64_PSEUDO:
5667 case AMDGPU::S_OR_B64:
5668 case AMDGPU::S_XOR_B64:
5669 return std::numeric_limits<uint64_t>::min();
5670 case AMDGPU::S_AND_B64:
5671 return std::numeric_limits<uint64_t>::max();
5672 case AMDGPU::V_ADD_F64_e64:
5673 case AMDGPU::V_ADD_F64_pseudo_e64:
5674 return 0x8000000000000000; // -0.0
5675 default:
5676 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5677 }
5678}
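// The identity is chosen so inactive lanes cannot perturb the result; e.g. a
// umin reduction starts at 0xFFFFFFFF, which any lane value can only lower,
// and an fadd reduction starts at -0.0 because -0.0 + x == x for every x,
// including x == +0.0.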
5679
5680static bool is32bitWaveReduceOperation(unsigned Opc) {
5681 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5682 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5683 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5684 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5685 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5686 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5687 Opc == AMDGPU::V_SUB_F32_e64;
5688}
5689
5690 static bool isFPWaveReduceOperation(unsigned Opc) {
5691 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5692 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5693 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5694 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5695 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5696}
5697
5698static std::tuple<unsigned, unsigned>
5699 getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
5700 unsigned DPPOpc;
5701 switch (Opc) {
5702 case AMDGPU::S_MIN_U32:
5703 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5704 break;
5705 case AMDGPU::S_MIN_I32:
5706 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5707 break;
5708 case AMDGPU::S_MAX_U32:
5709 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5710 break;
5711 case AMDGPU::S_MAX_I32:
5712 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5713 break;
5714 case AMDGPU::S_ADD_I32:
5715 case AMDGPU::S_SUB_I32:
5716 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5717 : AMDGPU::V_ADD_CO_U32_dpp;
5718 break;
5719 case AMDGPU::S_AND_B32:
5720 DPPOpc = AMDGPU::V_AND_B32_dpp;
5721 break;
5722 case AMDGPU::S_OR_B32:
5723 DPPOpc = AMDGPU::V_OR_B32_dpp;
5724 break;
5725 case AMDGPU::S_XOR_B32:
5726 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5727 break;
5728 case AMDGPU::V_ADD_F32_e64:
5729 case AMDGPU::V_SUB_F32_e64:
5730 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5731 break;
5732 case AMDGPU::V_MIN_F32_e64:
5733 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5734 break;
5735 case AMDGPU::V_MAX_F32_e64:
5736 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5737 break;
5738 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5739 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5740 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5741 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5742 case AMDGPU::S_ADD_U64_PSEUDO:
5743 case AMDGPU::S_SUB_U64_PSEUDO:
5744 case AMDGPU::S_AND_B64:
5745 case AMDGPU::S_OR_B64:
5746 case AMDGPU::S_XOR_B64:
5747 case AMDGPU::V_MIN_NUM_F64_e64:
5748 case AMDGPU::V_MIN_F64_e64:
5749 case AMDGPU::V_MAX_NUM_F64_e64:
5750 case AMDGPU::V_MAX_F64_e64:
5751 case AMDGPU::V_ADD_F64_pseudo_e64:
5752 case AMDGPU::V_ADD_F64_e64:
5753 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
5754 break;
5755 default:
5756 llvm_unreachable("unhandled lane op");
5757 }
5758 unsigned ClampOpc = Opc;
5759 if (!ST.getInstrInfo()->isVALU(Opc)) {
5760 if (Opc == AMDGPU::S_SUB_I32)
5761 ClampOpc = AMDGPU::S_ADD_I32;
5762 if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
5763 ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
5764 else if (Opc == AMDGPU::S_AND_B64)
5765 ClampOpc = AMDGPU::V_AND_B32_e64;
5766 else if (Opc == AMDGPU::S_OR_B64)
5767 ClampOpc = AMDGPU::V_OR_B32_e64;
5768 else if (Opc == AMDGPU::S_XOR_B64)
5769 ClampOpc = AMDGPU::V_XOR_B32_e64;
5770 else
5771 ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
5772 }
5773 return {DPPOpc, ClampOpc};
5774}
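// Note: 64-bit sources have no DPP-capable ALU encoding, so they are routed
// through V_MOV_B64_DPP_PSEUDO and the arithmetic is done afterwards using
// the returned clamp opcode (see the NeedsMovDPP handling in lowerWaveReduce).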
5775
5776static std::pair<Register, Register>
5777 ExtractSubRegs(MachineInstr &MI, MachineOperand &Op,
5778 const TargetRegisterClass *SrcRC, const GCNSubtarget &ST,
5779 MachineRegisterInfo &MRI) {
5780 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5781 const SIInstrInfo *TII = ST.getInstrInfo();
5782 const TargetRegisterClass *SrcSubRC =
5783 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5784 Register Op1L =
5785 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub0, SrcSubRC);
5786 Register Op1H =
5787 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub1, SrcSubRC);
5788 return {Op1L, Op1H};
5789}
5790
5791 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5792 MachineBasicBlock &BB,
5793 const GCNSubtarget &ST,
5794 unsigned Opc) {
5795 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5796 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5797 const DebugLoc &DL = MI.getDebugLoc();
5798 const SIInstrInfo *TII = ST.getInstrInfo();
5799
5800 // The lowering of a reduction depends on whether the input operand is in an SGPR or a VGPR.
5801 Register SrcReg = MI.getOperand(1).getReg();
5802 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5803 Register DstReg = MI.getOperand(0).getReg();
5804 unsigned Strategy = static_cast<unsigned>(MI.getOperand(2).getImm());
5805 enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
5806 MachineBasicBlock *RetBB = nullptr;
5807 unsigned MIOpc = MI.getOpcode();
5808 auto BuildRegSequence = [&](MachineBasicBlock &BB,
5809 MachineBasicBlock::iterator MI, Register Dst,
5810 Register Src0, Register Src1) {
5811 auto RegSequence =
5812 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst)
5813 .addReg(Src0)
5814 .addImm(AMDGPU::sub0)
5815 .addReg(Src1)
5816 .addImm(AMDGPU::sub1);
5817 return RegSequence;
5818 };
5819 if (isSGPR) {
5820 switch (Opc) {
5821 case AMDGPU::S_MIN_U32:
5822 case AMDGPU::S_MIN_I32:
5823 case AMDGPU::V_MIN_F32_e64:
5824 case AMDGPU::S_MAX_U32:
5825 case AMDGPU::S_MAX_I32:
5826 case AMDGPU::V_MAX_F32_e64:
5827 case AMDGPU::S_AND_B32:
5828 case AMDGPU::S_OR_B32: {
5829 // Idempotent operations.
5830 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5831 RetBB = &BB;
5832 break;
5833 }
5834 case AMDGPU::V_CMP_LT_U64_e64: // umin
5835 case AMDGPU::V_CMP_LT_I64_e64: // min
5836 case AMDGPU::V_CMP_GT_U64_e64: // umax
5837 case AMDGPU::V_CMP_GT_I64_e64: // max
5838 case AMDGPU::V_MIN_F64_e64:
5839 case AMDGPU::V_MIN_NUM_F64_e64:
5840 case AMDGPU::V_MAX_F64_e64:
5841 case AMDGPU::V_MAX_NUM_F64_e64:
5842 case AMDGPU::S_AND_B64:
5843 case AMDGPU::S_OR_B64: {
5844 // Idempotent operations.
5845 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5846 RetBB = &BB;
5847 break;
5848 }
5849 case AMDGPU::S_XOR_B32:
5850 case AMDGPU::S_XOR_B64:
5851 case AMDGPU::S_ADD_I32:
5852 case AMDGPU::S_ADD_U64_PSEUDO:
5853 case AMDGPU::V_ADD_F32_e64:
5854 case AMDGPU::V_ADD_F64_e64:
5855 case AMDGPU::V_ADD_F64_pseudo_e64:
5856 case AMDGPU::S_SUB_I32:
5857 case AMDGPU::S_SUB_U64_PSEUDO:
5858 case AMDGPU::V_SUB_F32_e64: {
5859 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5860 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5861 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5862 Register NumActiveLanes =
5863 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5864
5865 bool IsWave32 = ST.isWave32();
5866 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5867 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5868 unsigned BitCountOpc =
5869 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5870
5871 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5872
5873 auto NewAccumulator =
5874 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5875 .addReg(ExecMask);
5876
5877 switch (Opc) {
5878 case AMDGPU::S_XOR_B32:
5879 case AMDGPU::S_XOR_B64: {
5880 // Performing an XOR reduction of a uniform value
5881 // depends on the parity of the number of active lanes:
5882 // for even parity the result is 0; for odd
5883 // parity it equals the input value.
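// For example, with five active lanes x^x^x^x^x == x, while with four the
// pairs cancel and the result is 0; hence the multiply by the parity bit
// below.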
5884 Register ParityRegister =
5885 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5886
5887 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5888 .addReg(NewAccumulator->getOperand(0).getReg())
5889 .addImm(1)
5890 .setOperandDead(3); // Dead scc
5891 if (Opc == AMDGPU::S_XOR_B32) {
5892 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5893 .addReg(SrcReg)
5894 .addReg(ParityRegister);
5895 } else {
5896 Register DestSub0 =
5897 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5898 Register DestSub1 =
5899 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5900 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5901 MRI.getRegClass(SrcReg), ST, MRI);
5902 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5903 .addReg(Op1L)
5904 .addReg(ParityRegister);
5905 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5906 .addReg(Op1H)
5907 .addReg(ParityRegister);
5908 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
5909 }
5910 break;
5911 }
5912 case AMDGPU::S_SUB_I32: {
5913 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5914
5915 // Take the negation of the source operand.
5916 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5917 .addImm(0)
5918 .addReg(SrcReg);
5919 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5920 .addReg(NegatedVal)
5921 .addReg(NewAccumulator->getOperand(0).getReg());
5922 break;
5923 }
5924 case AMDGPU::S_ADD_I32: {
5925 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5926 .addReg(SrcReg)
5927 .addReg(NewAccumulator->getOperand(0).getReg());
5928 break;
5929 }
5930 case AMDGPU::S_ADD_U64_PSEUDO:
5931 case AMDGPU::S_SUB_U64_PSEUDO: {
5932 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5933 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5934 Register Op1H_Op0L_Reg =
5935 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5936 Register Op1L_Op0H_Reg =
5937 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5938 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5939 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5940 Register NegatedValLo =
5941 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5942 Register NegatedValHi =
5943 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5944 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5945 MRI.getRegClass(SrcReg), ST, MRI);
5946 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5947 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5948 .addImm(0)
5949 .addReg(NewAccumulator->getOperand(0).getReg())
5950 .setOperandDead(3); // Dead scc
5951 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5952 .addReg(NegatedValLo)
5953 .addImm(31)
5954 .setOperandDead(3); // Dead scc
5955 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5956 .addReg(Op1L)
5957 .addReg(NegatedValHi);
5958 }
5959 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5960 ? NegatedValLo
5961 : NewAccumulator->getOperand(0).getReg();
5962 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5963 .addReg(Op1L)
5964 .addReg(LowOpcode);
5965 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5966 .addReg(Op1L)
5967 .addReg(LowOpcode);
5968 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5969 .addReg(Op1H)
5970 .addReg(LowOpcode);
5971
5972 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5973 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5974 .addReg(CarryReg)
5975 .addReg(Op1H_Op0L_Reg)
5976 .setOperandDead(3); // Dead scc
5977
5978 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5979 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5980 .addReg(HiVal)
5981 .addReg(Op1L_Op0H_Reg)
5982 .setOperandDead(3); // Dead scc
5983 }
5984 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
5985 break;
5986 }
5987 case AMDGPU::V_ADD_F32_e64:
5988 case AMDGPU::V_ADD_F64_e64:
5989 case AMDGPU::V_ADD_F64_pseudo_e64:
5990 case AMDGPU::V_SUB_F32_e64: {
5991 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5992 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
5993 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
5994 Register DstVreg = MRI.createVirtualRegister(VregRC);
5995 // Get the number of active lanes as a floating-point value.
5996 BuildMI(BB, MI, DL,
5997 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5998 : AMDGPU::V_CVT_F64_I32_e64),
5999 ActiveLanesVreg)
6000 .addReg(NewAccumulator->getOperand(0).getReg())
6001 .addImm(0) // clamp
6002 .addImm(0); // output-modifier
6003
6004 // Take negation of input for SUB reduction
6005 unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6006 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
6007 ? SISrcMods::NEG
6008 : SISrcMods::NONE;
6009 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
6010 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
6011 ? AMDGPU::V_MUL_F64_pseudo_e64
6012 : AMDGPU::V_MUL_F64_e64;
6013 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
6014 DstVreg)
6015 .addImm(srcMod) // src0 modifier
6016 .addReg(SrcReg)
6017 .addImm(SISrcMods::NONE) // src1 modifier
6018 .addReg(ActiveLanesVreg)
6019 .addImm(SISrcMods::NONE) // clamp
6020 .addImm(SISrcMods::NONE); // output-mod
6021 if (is32BitOpc) {
6022 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6023 .addReg(DstVreg);
6024 } else {
6025 Register LaneValueLoReg =
6026 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6027 Register LaneValueHiReg =
6028 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6029 auto [Op1L, Op1H] =
6030 ExtractSubRegs(MI, DestVregInst->getOperand(0), VregRC, ST, MRI);
6031 // lane value input should be in an sgpr
6032 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6033 LaneValueLoReg)
6034 .addReg(Op1L);
6035 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6036 LaneValueHiReg)
6037 .addReg(Op1H);
6038 NewAccumulator =
6039 BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6040 }
6041 }
6042 }
6043 RetBB = &BB;
6044 }
6045 }
6046 } else {
6047 MachineBasicBlock::iterator I(&MI);
6048 Register SrcReg = MI.getOperand(1).getReg();
6049 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6050 bool isFPOp = isFPWaveReduceOperation(Opc);
6051 bool NeedsMovDPP = !is32BitOpc;
6052 // Create virtual registers required for lowering.
6053 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
6054 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
6055 const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
6056 bool IsWave32 = ST.isWave32();
6057 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6058 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6059 if (Strategy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6060 !ST.hasDPP()) { // If the target doesn't support DPP operations,
6061 // default to the iterative strategy.
6062
6063 // To reduce the VGPR using the iterative approach, we need to iterate
6064 // over all the active lanes. Lowering consists of a ComputeLoop, which
6065 // iterates over only the active lanes. We use a copy of the EXEC register
6066 // as the induction variable, and every active lane clears its bit using
6067 // bitset0 so that we get the next active lane for the next iteration.
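// Schematically, for a 32-bit integer op in wave32 (registers are
// placeholders, not verbatim output):
//
//     s_mov_b32      s1, exec_lo    ; induction variable: remaining lanes
//     s_mov_b32      s2, <identity>
//   loop:
//     s_ff1_i32_b32  s3, s1         ; index of the next active lane
//     v_readlane_b32 s4, v0, s3     ; fetch that lane's value
//     <op>           s2, s2, s4     ; fold it into the accumulator
//     s_bitset0_b32  s1, s3         ; retire the lane
//     s_cmp_lg_u32   s1, 0
//     s_cbranch_scc1 loop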
6068
6069 // Create control flow for the loop:
6070 // split MI's machine basic block into a loop body and a remainder block.
6071 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
6072
6073 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
6074 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
6075 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
6076 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6077 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6078 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6079 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
6080
6081 // Create the initial values of the induction variable (from EXEC) and the
6082 // accumulator, and insert a branch to the newly created ComputeLoop block.
6083 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
6084 uint64_t IdentityValue =
6085 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6086 ? 0x0 // +0.0 for double sub reduction
6087 : getIdentityValueForWaveReduction(Opc);
6088 BuildMI(BB, I, DL,
6089 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6090 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6091 IdentityValReg)
6092 .addImm(IdentityValue);
6093 // clang-format off
6094 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
6095 .addMBB(ComputeLoop);
6096 // clang-format on
6097
6098 // Start constructing ComputeLoop
6099 I = ComputeLoop->begin();
6100 auto Accumulator =
6101 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
6102 .addReg(IdentityValReg)
6103 .addMBB(&BB);
6104 auto ActiveBits =
6105 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
6106 .addReg(LoopIterator)
6107 .addMBB(&BB);
6108
6109 I = ComputeLoop->end();
6110 MachineInstr *NewAccumulator;
6111 // Perform the computations
6112 unsigned SFFOpc =
6113 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6114 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
6115 .addReg(ActiveBitsReg);
6116 if (is32BitOpc) {
6117 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6118 LaneValueReg)
6119 .addReg(SrcReg)
6120 .addReg(FF1Reg);
6121 if (isFPOp) {
6122 Register LaneValVreg =
6123 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6124 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6125 // Get the lane value into a VGPR to avoid the constant bus restriction.
6126 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
6127 LaneValVreg)
6128 .addReg(LaneValueReg);
6129 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6130 .addImm(0) // src0 modifier
6131 .addReg(Accumulator->getOperand(0).getReg())
6132 .addImm(0) // src1 modifier
6133 .addReg(LaneValVreg)
6134 .addImm(0) // clamp
6135 .addImm(0); // omod
6136 NewAccumulator =
6137 BuildMI(*ComputeLoop, I, DL,
6138 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6139 .addReg(DstVreg);
6140 } else {
6141 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6142 .addReg(Accumulator->getOperand(0).getReg())
6143 .addReg(LaneValueReg);
6144 }
6145 } else {
6146 Register LaneValueLoReg =
6147 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6148 Register LaneValueHiReg =
6149 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6150 Register LaneValReg =
6151 MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6152 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
6153 MRI.getRegClass(SrcReg), ST, MRI);
6154 // lane value input should be in an sgpr
6155 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6156 LaneValueLoReg)
6157 .addReg(Op1L)
6158 .addReg(FF1Reg);
6159 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6160 LaneValueHiReg)
6161 .addReg(Op1H)
6162 .addReg(FF1Reg);
6163 auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
6164 LaneValueLoReg, LaneValueHiReg);
6165 switch (Opc) {
6166 case AMDGPU::S_OR_B64:
6167 case AMDGPU::S_AND_B64:
6168 case AMDGPU::S_XOR_B64: {
6169 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6170 .addReg(Accumulator->getOperand(0).getReg())
6171 .addReg(LaneValue->getOperand(0).getReg())
6172 .setOperandDead(3); // Dead scc
6173 break;
6174 }
6175 case AMDGPU::V_CMP_GT_I64_e64:
6176 case AMDGPU::V_CMP_GT_U64_e64:
6177 case AMDGPU::V_CMP_LT_I64_e64:
6178 case AMDGPU::V_CMP_LT_U64_e64: {
6179 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6180 Register ComparisonResultReg =
6181 MRI.createVirtualRegister(WaveMaskRegClass);
6182 int SrcIdx =
6183 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6184 const TargetRegisterClass *VregClass =
6185 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6186 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6187 auto [SrcReg0Sub0, SrcReg0Sub1] = ExtractSubRegs(
6188 MI, Accumulator->getOperand(0), VregClass, ST, MRI);
6189 BuildRegSequence(*ComputeLoop, I, AccumulatorVReg, SrcReg0Sub0,
6190 SrcReg0Sub1);
6191 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6192 .addReg(LaneValue->getOperand(0).getReg())
6193 .addReg(AccumulatorVReg);
6194
6195 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6196 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6197 .addReg(LaneMaskReg)
6198 .addReg(ActiveBitsReg);
6199
6200 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6201 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6202 .addReg(LaneValue->getOperand(0).getReg())
6203 .addReg(Accumulator->getOperand(0).getReg());
6204 break;
6205 }
6206 case AMDGPU::V_MIN_F64_e64:
6207 case AMDGPU::V_MIN_NUM_F64_e64:
6208 case AMDGPU::V_MAX_F64_e64:
6209 case AMDGPU::V_MAX_NUM_F64_e64:
6210 case AMDGPU::V_ADD_F64_e64:
6211 case AMDGPU::V_ADD_F64_pseudo_e64: {
6212 int SrcIdx =
6213 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6214 const TargetRegisterClass *VregRC =
6215 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6216 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6217 Register DstVreg = MRI.createVirtualRegister(VregRC);
6218 Register LaneValLo =
6219 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6220 Register LaneValHi =
6221 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6222 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6223 .addReg(Accumulator->getOperand(0).getReg());
6224 unsigned Modifier =
6225 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6226 ? SISrcMods::NEG
6227 : SISrcMods::NONE;
6228 auto DstVregInst =
6229 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6230 .addImm(Modifier) // src0 modifiers
6231 .addReg(LaneValue->getOperand(0).getReg())
6232 .addImm(SISrcMods::NONE) // src1 modifiers
6233 .addReg(AccumulatorVReg)
6234 .addImm(SISrcMods::NONE) // clamp
6235 .addImm(SISrcMods::NONE); // omod
6236 auto ReadLaneLo =
6237 BuildMI(*ComputeLoop, I, DL,
6238 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6239 auto ReadLaneHi =
6240 BuildMI(*ComputeLoop, I, DL,
6241 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6242 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6243 auto [Op1L, Op1H] = ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
6244 VregRC, ST, MRI);
6245 ReadLaneLo.addReg(Op1L);
6246 ReadLaneHi.addReg(Op1H);
6247 NewAccumulator =
6248 BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
6249 break;
6250 }
6251 case AMDGPU::S_ADD_U64_PSEUDO:
6252 case AMDGPU::S_SUB_U64_PSEUDO: {
6253 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6254 .addReg(Accumulator->getOperand(0).getReg())
6255 .addReg(LaneValue->getOperand(0).getReg());
6256 ComputeLoop =
6257 expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6258 break;
6259 }
6260 }
6261 }
6262 // Manipulate the iterator to get the next active lane
6263 unsigned BITSETOpc =
6264 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6265 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6266 .addReg(FF1Reg)
6267 .addReg(ActiveBitsReg);
6268
6269 // Add phi nodes
6270 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6271 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6272
6273 // Create the branch back to the loop header.
6274 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6275 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6276 .addReg(NewActiveBitsReg)
6277 .addImm(0);
6278 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6279 .addMBB(ComputeLoop);
6280
6281 RetBB = ComputeEnd;
6282 } else {
6283 assert(ST.hasDPP() && "Subtarget does not support DPP operations");
6284 MachineBasicBlock *CurrBB = &BB;
6285 Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
6286 Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
6287 Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
6288 Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
6289 Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
6290 Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
6291 Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
6292 Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
6293 Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
6294 Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
6295 Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
6296 Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
6297 Register FinalDPPResult;
6298 MachineInstr *SrcWithIdentityInstr;
6299 MachineInstr *LastBcastInstr;
6300 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
6301 uint64_t IdentityValue = getIdentityValueForWaveReduction(Opc);
6301
6303 BuildMI(*CurrBB, MI, DL,
6304 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6305 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6306 IdentitySGPR)
6307 .addImm(IdentityValue);
6308 auto IdentityCopyInstr =
6309 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
6310 .addReg(IdentitySGPR);
6311 auto DPPClampOpcPair = getDPPOpcForWaveReduction(Opc, ST);
6312 unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
6313 unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
6314 auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
6315 Register Src1) {
6316 return BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32),
6317 Dst)
6318 .addImm(0) // src0 modifiers
6319 .addReg(Src0) // src0
6320 .addImm(0) // src1 modifiers
6321 .addReg(Src1) // identity value for inactive lanes
6322 .addReg(UndefExec); // bool i1
6323 };
6324 auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
6325 unsigned DPPCtrl) {
6326 auto DPPInstr =
6327 BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
6328 if (isFPOp && !NeedsMovDPP)
6329 DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
6330 DPPInstr.addReg(Src); // src0
6331 if (isFPOp && !NeedsMovDPP)
6332 DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
6333 if (!NeedsMovDPP)
6334 DPPInstr.addReg(Src); // src1
6335 if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
6336 DPPInstr.addImm(0); // clamp
6337 DPPInstr
6338 .addImm(DPPCtrl) // dpp-ctrl
6339 .addImm(0xf) // row-mask
6340 .addImm(0xf) // bank-mask
6341 .addImm(0); // bound-control
6342 };
6343 auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
6344 bool isAddSub = false,
6345 bool needsCarryIn = false,
6346 Register CarryIn = Register()) {
6347 unsigned InstrOpc = ClampOpc;
6348 Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
6349 if (needsCarryIn)
6350 InstrOpc = AMDGPU::V_ADDC_U32_e64;
6351 auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
6352 if (isFPOp)
6353 ClampInstr.addImm(SISrcMods::NONE); // src0 mod
6354 if (isAddSub) {
6355 if (needsCarryIn)
6356 ClampInstr.addReg(CarryOutReg,
6357 RegState::Define |
6358 RegState::Dead); // killed carry-out reg
6359 else
6360 ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
6361 }
6362 ClampInstr.addReg(Src0); // src0
6363 if (isFPOp)
6364 ClampInstr.addImm(SISrcMods::NONE); // src1 mod
6365 ClampInstr.addReg(Src1); // src1
6366 if (needsCarryIn)
6367 ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
6368 if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
6369 ClampInstr.addImm(0); // clamp
6370 if (isFPOp)
6371 ClampInstr.addImm(0); // omod
6372 LastBcastInstr = ClampInstr;
6373 return CarryOutReg;
6374 };
6375 auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
6376 bool isAddSubOpc =
6377 Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
6378 bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
6379 Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
6380 Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
6381 if (isAddSubOpc || isBitWiseOpc) {
6382 Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6383 Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6384 MachineOperand Src0Operand =
6385 MachineOperand::CreateReg(Src0, /*isDef=*/false);
6386 MachineOperand Src1Operand =
6387 MachineOperand::CreateReg(Src1, /*isDef=*/false);
6388 auto [Src0Lo, Src0Hi] =
6389 ExtractSubRegs(MI, Src0Operand, SrcRegClass, ST, MRI);
6390 auto [Src1Lo, Src1Hi] =
6391 ExtractSubRegs(MI, Src1Operand, SrcRegClass, ST, MRI);
6392 Register CarryReg = BuildClampInstr(
6393 ResLo, Src0Lo, Src1Lo, isAddSubOpc, /*needsCarryIn*/ false);
6394 BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc,
6395 /*needsCarryIn*/ isAddSubOpc, CarryReg);
6396 BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
6397 } else {
6398 if (isFPOp) {
6399 BuildMI(*CurrBB, MI, DL, TII->get(Opc), ReturnReg)
6400 .addImm(SISrcMods::NONE) // src0 modifiers
6401 .addReg(Src0)
6402 .addImm(SISrcMods::NONE) // src1 modifiers
6403 .addReg(Src1)
6404 .addImm(SISrcMods::NONE) // clamp
6405 .addImm(SISrcMods::NONE); // omod
6406 } else {
6407 Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6408 BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
6409 .addReg(Src0) // src0
6410 .addReg(Src1); // src1
6411 LastBcastInstr =
6412 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
6413 ReturnReg)
6414 .addReg(Src1) // src0
6415 .addReg(Src0) // src1
6416 .addReg(CmpMaskReg); // src2
6417 expand64BitV_CNDMASK(*LastBcastInstr, CurrBB);
6418 }
6419 }
6420 return ReturnReg;
6421 };
6422
6423 // Set inactive lanes to the identity value.
6424 if (is32BitOpc) {
6425 SrcWithIdentityInstr =
6426 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6427 } else {
6428 Register SrcWithIdentitylo =
6429 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6430 Register SrcWithIdentityhi =
6431 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6432 auto [Reg0Sub0, Reg0Sub1] = ExtractSubRegs(
6433 MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
6434 auto [SrcReg0Sub0, SrcReg0Sub1] =
6435 ExtractSubRegs(MI, MI.getOperand(1), SrcRegClass, ST, MRI);
6436 MachineInstr *SetInactiveLoInstr =
6437 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6438 MachineInstr *SetInactiveHiInstr =
6439 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6440 SrcWithIdentityInstr =
6441 BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
6442 SetInactiveLoInstr->getOperand(0).getReg(),
6443 SetInactiveHiInstr->getOperand(0).getReg());
6444 }
6445 // DPP reduction
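// The ladder below is the standard log2-step DPP reduction: each row_shr
// step combines a lane with the value 2^k lanes below it (shifts 1, 2, 4,
// 8), then the row broadcasts (15, plus 31 for wave64) propagate the
// partial results, leaving the full reduction in the last lane.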
6446 Register SrcWithIdentityReg =
6447 SrcWithIdentityInstr->getOperand(0).getReg();
6448 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6449 AMDGPU::DPP::ROW_SHR0 | 1);
6450 if (NeedsMovDPP)
6451 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6452
6453 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6454 AMDGPU::DPP::ROW_SHR0 | 2);
6455 if (NeedsMovDPP)
6456 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6457
6458 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6459 AMDGPU::DPP::ROW_SHR0 | 4);
6460 if (NeedsMovDPP)
6461 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6462
6463 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6464 AMDGPU::DPP::ROW_SHR0 | 8);
6465 if (NeedsMovDPP)
6466 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6467
6468 if (ST.hasDPPBroadcasts()) {
6469 BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
6470 if (NeedsMovDPP)
6471 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6472 } else {
6473 // Magic constant 0x1E0 (DS_SWIZZLE bitmask mode):
6474 // BIT_MODE : bit 15 = 0
6475 // XOR mask : bits [14:10] = 0
6476 // OR mask : bits [9:5] = 15
6477 // AND mask : bits [4:0] = 0
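// In bitmask mode each lane reads lane ((id & AND) | OR) ^ XOR; with the
// masks above that is ((id & 0) | 15) ^ 0 == 15, so every lane in a 32-lane
// group reads lane 15, emulating row_bcast:15 without DPP broadcast support.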
6478 if (is32BitOpc) {
6479 Register SwizzledValue =
6480 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6481 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6482 SwizzledValue)
6483 .addReg(DPPRowShr8) // addr
6484 .addImm(0x1E0) // swizzle offset (i16)
6485 .addImm(0x0); // gds (i1)
6486 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6487 } else {
6488 Register SwizzledValuelo =
6489 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6490 Register SwizzledValuehi =
6491 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6492 Register SwizzledValue64 = MRI.createVirtualRegister(SrcRegClass);
6493 MachineOperand DPPRowShr8Op =
6494 MachineOperand::CreateReg(DPPRowShr8, /*isDef=*/false);
6495 auto [Op1L, Op1H] =
6496 ExtractSubRegs(MI, DPPRowShr8Op, SrcRegClass, ST, MRI);
6497 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6498 SwizzledValuelo)
6499 .addReg(Op1L) // addr
6500 .addImm(0x1E0) // swizzle offset (i16)
6501 .addImm(0x0); // gds (i1)
6502 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6503 SwizzledValuehi)
6504 .addReg(Op1H) // addr
6505 .addImm(0x1E0) // swizzle offset (i16)
6506 .addImm(0x0); // gds (i1)
6507 BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
6508 SwizzledValuehi);
6509 if (NeedsMovDPP)
6510 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6511 else
6512 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6513 }
6514 }
6515 FinalDPPResult = RowBcast15;
6516 if (!IsWave32) {
6517 if (ST.hasDPPBroadcasts()) {
6518 BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
6519 if (NeedsMovDPP)
6520 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
6521 } else {
6522 Register ShiftedThreadID =
6523 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6524 Register PermuteByteOffset =
6525 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6526 Register PermutedValue = MRI.createVirtualRegister(SrcRegClass);
6527 Register Lane32Offset =
6528 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6529 Register WordSizeConst =
6530 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6531 Register ThreadIDRegLo =
6532 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6533 Register ThreadIDReg =
6534 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6535 // Get the thread ID.
6536 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
6537 ThreadIDRegLo)
6538 .addImm(-1)
6539 .addImm(0);
6540 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
6541 ThreadIDReg)
6542 .addImm(-1)
6543 .addReg(ThreadIDRegLo);
6544 // Shift each lane over by 32 positions, so the value in lane 31 is
6545 // present in lane 63.
6546 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
6547 .addImm(0x20);
6548 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64),
6549 ShiftedThreadID)
6550 .addReg(ThreadIDReg)
6551 .addReg(Lane32Offset)
6552 .addImm(0); // clamp
6553 // Multiply by the register size in bytes (4 bytes per 32-bit word).
6554 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
6555 .addImm(0x4);
6556 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
6557 PermuteByteOffset)
6558 .addReg(WordSizeConst)
6559 .addReg(ShiftedThreadID);
6560 // Permute the lanes
6561 if (is32BitOpc) {
6562 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6563 PermutedValue)
6564 .addReg(PermuteByteOffset) // addr
6565 .addReg(RowBcast15) // data
6566 .addImm(0); // offset
6567 } else {
6568 Register PermutedValuelo =
6569 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6570 Register PermutedValuehi =
6571 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6572 MachineOperand RowBcast15Op =
6573 MachineOperand::CreateReg(RowBcast15, /*isDef=*/false);
6574 auto [RowBcast15Lo, RowBcast15Hi] =
6575 ExtractSubRegs(MI, RowBcast15Op, SrcRegClass, ST, MRI);
6576 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6577 PermutedValuelo)
6578 .addReg(PermuteByteOffset) // addr
6579 .addReg(RowBcast15Lo) // data
6580 .addImm(0x0); // offset
6581 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6582 PermutedValuehi)
6583 .addReg(PermuteByteOffset) // addr
6584 .addReg(RowBcast15Hi) // data
6585 .addImm(0x0); // offset
6586 BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
6587 PermutedValuehi);
6588 }
6589 if (NeedsMovDPP)
6590 RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
6591 else
6592 BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
6593 }
6594 FinalDPPResult = RowBcast31;
6595 }
6596 if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6597 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
6598 Register NegatedValVGPR = MRI.createVirtualRegister(SrcRegClass);
6599 // Opc for f32 reduction is V_SUB_F32.
6600 // For f64, there is no equivalent V_SUB_F64 opcode, so use
6601 // V_ADD_F64/V_ADD_F64_pseudo, and negate the second operand.
6602 BuildMI(*CurrBB, MI, DL, TII->get(Opc),
6603 NegatedValVGPR)
6604 .addImm(SISrcMods::NONE) // src0 mods
6605 .addReg(IdentityVGPR) // src0
6606 .addImm(is32BitOpc ? SISrcMods::NONE : SISrcMods::NEG) // src1 mods
6607 .addReg(IsWave32 ? RowBcast15 : RowBcast31) // src1
6608 .addImm(SISrcMods::NONE) // clamp
6609 .addImm(SISrcMods::NONE); // omod
6610 FinalDPPResult = NegatedValVGPR;
6611 }
6612 // The final reduced value is in the last lane.
6613 if (is32BitOpc) {
6614 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6615 ReducedValSGPR)
6616 .addReg(FinalDPPResult)
6617 .addImm(ST.getWavefrontSize() - 1);
6618 } else {
6619 Register LaneValueLoReg =
6620 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6621 Register LaneValueHiReg =
6622 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6623 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6624 MachineOperand FinalDPPResultOperand =
6625 MachineOperand::CreateReg(FinalDPPResult, /*isDef=*/false);
6626 auto [Op1L, Op1H] =
6627 ExtractSubRegs(MI, FinalDPPResultOperand, SrcRC, ST, MRI);
6628 // lane value input should be in an sgpr
6629 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6630 LaneValueLoReg)
6631 .addReg(Op1L)
6632 .addImm(ST.getWavefrontSize() - 1);
6633 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6634 LaneValueHiReg)
6635 .addReg(Op1H)
6636 .addImm(ST.getWavefrontSize() - 1);
6637 BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
6638 LaneValueHiReg);
6639 }
6640 if (Opc == AMDGPU::S_SUB_I32) {
6641 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6642 .addImm(0)
6643 .addReg(ReducedValSGPR);
6644 } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6645 auto NegatedValInstr =
6646 BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
6647 .addImm(0)
6648 .addReg(ReducedValSGPR);
6649 CurrBB = expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
6650 }
6651 // Mark the final result as a whole-wave-mode calculation.
6652 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
6653 .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
6654 ? NegatedReducedVal
6655 : ReducedValSGPR);
6656 RetBB = CurrBB;
6657 }
6658 }
6659 MI.eraseFromParent();
6660 return RetBB;
6661}
6662
6663 MachineBasicBlock *
6664 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
6665 MachineBasicBlock *BB) const {
6666 MachineFunction *MF = BB->getParent();
6668 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6669 const SIInstrInfo *TII = ST.getInstrInfo();
6670 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6671 MachineRegisterInfo &MRI = MF->getRegInfo();
6672 const DebugLoc &DL = MI.getDebugLoc();
6673
6674 switch (MI.getOpcode()) {
6675 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6676 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6677 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6678 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6679 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6680 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6681 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6682 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6683 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6684 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6685 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6686 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6687 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6688 ? AMDGPU::V_MIN_NUM_F64_e64
6689 : AMDGPU::V_MIN_F64_e64);
6690 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6691 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6692 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6693 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6694 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6695 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6696 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6697 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6698 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6699 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6700 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6701 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6702 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6703 ? AMDGPU::V_MAX_NUM_F64_e64
6704 : AMDGPU::V_MAX_F64_e64);
6705 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6706 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6707 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6708 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6709 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6710 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6711 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6712 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6713 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6714 ? AMDGPU::V_ADD_F64_pseudo_e64
6715 : AMDGPU::V_ADD_F64_e64);
6716 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6717 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6718 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6719 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6720 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6721 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6722 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6723     // There is no S/V_SUB_F64 opcode. f64 subtraction is instead expanded as
6724     // an fadd with the NEG source-modifier bit set on the second operand.
6725 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6726 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6727 ? AMDGPU::V_ADD_F64_pseudo_e64
6728 : AMDGPU::V_ADD_F64_e64);
6729 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6730 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6731 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6732 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6733 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6734 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6735 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6736 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6737 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6738 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6739 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6740 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6741 case AMDGPU::S_UADDO_PSEUDO:
6742 case AMDGPU::S_USUBO_PSEUDO: {
6743 MachineOperand &Dest0 = MI.getOperand(0);
6744 MachineOperand &Dest1 = MI.getOperand(1);
6745 MachineOperand &Src0 = MI.getOperand(2);
6746 MachineOperand &Src1 = MI.getOperand(3);
6747
6748 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6749 ? AMDGPU::S_ADD_U32
6750 : AMDGPU::S_SUB_U32;
6751 // clang-format off
6752 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6753 .add(Src0)
6754 .add(Src1);
6755 // clang-format on
6756
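    // S_ADD_U32/S_SUB_U32 set SCC on unsigned carry/borrow; select all ones
    // into the overflow result when it is set.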
6757 unsigned SelOpc =
6758 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6759 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6760
6761 MI.eraseFromParent();
6762 return BB;
6763 }
6764 case AMDGPU::S_ADD_U64_PSEUDO:
6765 case AMDGPU::S_SUB_U64_PSEUDO: {
6766 return expand64BitScalarArithmetic(MI, BB);
6767 }
6768 case AMDGPU::V_ADD_U64_PSEUDO:
6769 case AMDGPU::V_SUB_U64_PSEUDO: {
6770 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6771
6772 MachineOperand &Dest = MI.getOperand(0);
6773 MachineOperand &Src0 = MI.getOperand(1);
6774 MachineOperand &Src1 = MI.getOperand(2);
6775
6776 if (ST.hasAddSubU64Insts()) {
6777 auto I = BuildMI(*BB, MI, DL,
6778 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6779 : AMDGPU::V_SUB_U64_e64),
6780 Dest.getReg())
6781 .add(Src0)
6782 .add(Src1)
6783 .addImm(0); // clamp
6784 TII->legalizeOperands(*I);
6785 MI.eraseFromParent();
6786 return BB;
6787 }
6788
6789 if (IsAdd && ST.hasLshlAddU64Inst()) {
6790 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6791 Dest.getReg())
6792 .add(Src0)
6793 .addImm(0)
6794 .add(Src1);
6795 TII->legalizeOperands(*Add);
6796 MI.eraseFromParent();
6797 return BB;
6798 }
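    // Otherwise expand manually: add/sub the low halves with a carry-out,
    // then the high halves with the carry-in, and recombine the two halves
    // with a REG_SEQUENCE.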
6799
6800 const auto *CarryRC = TRI->getWaveMaskRegClass();
6801
6802 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6803 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6804
6805 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6806 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6807
6808 const TargetRegisterClass *Src0RC = Src0.isReg()
6809 ? MRI.getRegClass(Src0.getReg())
6810 : &AMDGPU::VReg_64RegClass;
6811 const TargetRegisterClass *Src1RC = Src1.isReg()
6812 ? MRI.getRegClass(Src1.getReg())
6813 : &AMDGPU::VReg_64RegClass;
6814
6815 const TargetRegisterClass *Src0SubRC =
6816 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6817 const TargetRegisterClass *Src1SubRC =
6818 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6819
6820 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6821 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6822 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6823 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6824
6825 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6826 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6827 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6828 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6829
6830 unsigned LoOpc =
6831 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6832 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6833 .addReg(CarryReg, RegState::Define)
6834 .add(SrcReg0Sub0)
6835 .add(SrcReg1Sub0)
6836 .addImm(0); // clamp bit
6837
6838 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6839 MachineInstr *HiHalf =
6840 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6841 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6842 .add(SrcReg0Sub1)
6843 .add(SrcReg1Sub1)
6844 .addReg(CarryReg, RegState::Kill)
6845 .addImm(0); // clamp bit
6846
6847 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6848 .addReg(DestSub0)
6849 .addImm(AMDGPU::sub0)
6850 .addReg(DestSub1)
6851 .addImm(AMDGPU::sub1);
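    // The split sources may be immediates or SGPRs; let the legalizer fix any
    // constant-bus violations in the two halves.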
6852 TII->legalizeOperands(*LoHalf);
6853 TII->legalizeOperands(*HiHalf);
6854 MI.eraseFromParent();
6855 return BB;
6856 }
6857 case AMDGPU::S_ADD_CO_PSEUDO:
6858 case AMDGPU::S_SUB_CO_PSEUDO: {
6859     // This pseudo has a chance to be selected
6860     // only from a uniform add/subcarry node. All the VGPR operands are
6861     // therefore assumed to be splat vectors.
6862     MachineBasicBlock::iterator MII = MI;
6863 MachineOperand &Dest = MI.getOperand(0);
6864 MachineOperand &CarryDest = MI.getOperand(1);
6865 MachineOperand &Src0 = MI.getOperand(2);
6866 MachineOperand &Src1 = MI.getOperand(3);
6867 MachineOperand &Src2 = MI.getOperand(4);
6868 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6869 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6870 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6871 .addReg(Src0.getReg());
6872 Src0.setReg(RegOp0);
6873 }
6874 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6875 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6876 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6877 .addReg(Src1.getReg());
6878 Src1.setReg(RegOp1);
6879 }
6880 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6881 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6882 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6883 .addReg(Src2.getReg());
6884 Src2.setReg(RegOp2);
6885 }
6886
6887 if (ST.isWave64()) {
6888 if (ST.hasScalarCompareEq64()) {
6889 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6890 .addReg(Src2.getReg())
6891 .addImm(0);
6892 } else {
6893 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6894 const TargetRegisterClass *SubRC =
6895 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6896 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6897 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6898 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6899 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6900 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6901
6902 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6903 .add(Src2Sub0)
6904 .add(Src2Sub1);
6905
6906 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6907 .addReg(Src2_32, RegState::Kill)
6908 .addImm(0);
6909 }
6910 } else {
6911 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6912 .addReg(Src2.getReg())
6913 .addImm(0);
6914 }
6915
6916 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6917 ? AMDGPU::S_ADDC_U32
6918 : AMDGPU::S_SUBB_U32;
6919
6920 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6921
6922 unsigned SelOpc =
6923 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6924
6925 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6926 .addImm(-1)
6927 .addImm(0);
6928
6929 MI.eraseFromParent();
6930 return BB;
6931 }
6932 case AMDGPU::SI_INIT_M0: {
6933 MachineOperand &M0Init = MI.getOperand(0);
6934 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6935 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6936 AMDGPU::M0)
6937 .add(M0Init);
6938 MI.eraseFromParent();
6939 return BB;
6940 }
6941 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6942 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6943 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6944 TII->get(AMDGPU::S_CMP_EQ_U32))
6945 .addImm(0)
6946 .addImm(0);
6947 return BB;
6948 }
6949 case AMDGPU::GET_GROUPSTATICSIZE: {
6950 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6951 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6952 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6953 .add(MI.getOperand(0))
6954 .addImm(MFI->getLDSSize());
6955 MI.eraseFromParent();
6956 return BB;
6957 }
6958 case AMDGPU::GET_SHADERCYCLESHILO: {
6959 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6960 // The algorithm is:
6961 //
6962 // hi1 = getreg(SHADER_CYCLES_HI)
6963 // lo1 = getreg(SHADER_CYCLES_LO)
6964 // hi2 = getreg(SHADER_CYCLES_HI)
6965 //
6966 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6967 // Otherwise there was overflow and the result is hi2:0. In both cases the
6968 // result should represent the actual time at some point during the sequence
6969 // of three getregs.
6970 using namespace AMDGPU::Hwreg;
6971 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6972 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6973 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6974 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6975 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6976 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6977 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6978 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6979 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6980 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6981 .addReg(RegHi1)
6982 .addReg(RegHi2);
6983 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6984 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6985 .addReg(RegLo1)
6986 .addImm(0);
6987 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6988 .add(MI.getOperand(0))
6989 .addReg(RegLo)
6990 .addImm(AMDGPU::sub0)
6991 .addReg(RegHi2)
6992 .addImm(AMDGPU::sub1);
6993 MI.eraseFromParent();
6994 return BB;
6995 }
6996 case AMDGPU::SI_INDIRECT_SRC_V1:
6997 case AMDGPU::SI_INDIRECT_SRC_V2:
6998 case AMDGPU::SI_INDIRECT_SRC_V3:
6999 case AMDGPU::SI_INDIRECT_SRC_V4:
7000 case AMDGPU::SI_INDIRECT_SRC_V5:
7001 case AMDGPU::SI_INDIRECT_SRC_V6:
7002 case AMDGPU::SI_INDIRECT_SRC_V7:
7003 case AMDGPU::SI_INDIRECT_SRC_V8:
7004 case AMDGPU::SI_INDIRECT_SRC_V9:
7005 case AMDGPU::SI_INDIRECT_SRC_V10:
7006 case AMDGPU::SI_INDIRECT_SRC_V11:
7007 case AMDGPU::SI_INDIRECT_SRC_V12:
7008 case AMDGPU::SI_INDIRECT_SRC_V16:
7009 case AMDGPU::SI_INDIRECT_SRC_V32:
7010 return emitIndirectSrc(MI, *BB, *getSubtarget());
7011 case AMDGPU::SI_INDIRECT_DST_V1:
7012 case AMDGPU::SI_INDIRECT_DST_V2:
7013 case AMDGPU::SI_INDIRECT_DST_V3:
7014 case AMDGPU::SI_INDIRECT_DST_V4:
7015 case AMDGPU::SI_INDIRECT_DST_V5:
7016 case AMDGPU::SI_INDIRECT_DST_V6:
7017 case AMDGPU::SI_INDIRECT_DST_V7:
7018 case AMDGPU::SI_INDIRECT_DST_V8:
7019 case AMDGPU::SI_INDIRECT_DST_V9:
7020 case AMDGPU::SI_INDIRECT_DST_V10:
7021 case AMDGPU::SI_INDIRECT_DST_V11:
7022 case AMDGPU::SI_INDIRECT_DST_V12:
7023 case AMDGPU::SI_INDIRECT_DST_V16:
7024 case AMDGPU::SI_INDIRECT_DST_V32:
7025 return emitIndirectDst(MI, *BB, *getSubtarget());
7026 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7027 case AMDGPU::SI_KILL_I1_PSEUDO:
7028 return splitKillBlock(MI, BB);
7029 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7031 return BB;
7032 }
7033 case AMDGPU::SI_BR_UNDEF: {
7034 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
7035 .add(MI.getOperand(0));
7036 Br->getOperand(1).setIsUndef(); // read undef SCC
7037 MI.eraseFromParent();
7038 return BB;
7039 }
7040 case AMDGPU::ADJCALLSTACKUP:
7041 case AMDGPU::ADJCALLSTACKDOWN: {
7042     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
7043     MachineInstrBuilder MIB(*MF, &MI);
7044 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
7045 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
7046 return BB;
7047 }
7048 case AMDGPU::SI_CALL_ISEL: {
7049 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
7050
7051     MachineInstrBuilder MIB;
7052     MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
7053
7054 for (const MachineOperand &MO : MI.operands())
7055 MIB.add(MO);
7056
7057 MIB.cloneMemRefs(MI);
7058 MI.eraseFromParent();
7059 return BB;
7060 }
7061 case AMDGPU::V_ADD_CO_U32_e32:
7062 case AMDGPU::V_SUB_CO_U32_e32:
7063 case AMDGPU::V_SUBREV_CO_U32_e32: {
7064 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
7065 unsigned Opc = MI.getOpcode();
7066
7067 bool NeedClampOperand = false;
7068 if (TII->pseudoToMCOpcode(Opc) == -1) {
7069       Opc = AMDGPU::getVOPe64(Opc);
7070       NeedClampOperand = true;
7071 }
7072
7073 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
7074 if (TII->isVOP3(*I)) {
7075 I.addReg(TRI->getVCC(), RegState::Define);
7076 }
7077 I.add(MI.getOperand(1)).add(MI.getOperand(2));
7078 if (NeedClampOperand)
7079 I.addImm(0); // clamp bit for e64 encoding
7080
7081 TII->legalizeOperands(*I);
7082
7083 MI.eraseFromParent();
7084 return BB;
7085 }
7086 case AMDGPU::V_ADDC_U32_e32:
7087 case AMDGPU::V_SUBB_U32_e32:
7088 case AMDGPU::V_SUBBREV_U32_e32:
7089 // These instructions have an implicit use of vcc which counts towards the
7090 // constant bus limit.
7091 TII->legalizeOperands(MI);
7092 return BB;
7093 case AMDGPU::DS_GWS_INIT:
7094 case AMDGPU::DS_GWS_SEMA_BR:
7095 case AMDGPU::DS_GWS_BARRIER:
7096 case AMDGPU::DS_GWS_SEMA_V:
7097 case AMDGPU::DS_GWS_SEMA_P:
7098 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7099     // An s_waitcnt 0 is required to be the instruction immediately following.
7100     if (getSubtarget()->hasGWSAutoReplay()) {
7101       bundleInstWithWaitcnt(MI);
7102       return BB;
7103 }
7104
7105 return emitGWSMemViolTestLoop(MI, BB);
7106 case AMDGPU::S_SETREG_B32: {
7107 // Try to optimize cases that only set the denormal mode or rounding mode.
7108 //
7109 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
7110 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
7111 // instead.
7112 //
7113     // FIXME: This could be predicated on the immediate, but tablegen doesn't
7114     // allow you to have a no-side-effect instruction in the output of a
7115     // side-effecting pattern.
7116 auto [ID, Offset, Width] =
7117 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
7118     if (ID != AMDGPU::Hwreg::ID_MODE)
7119       return BB;
7120
7121 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
7122 const unsigned SetMask = WidthMask << Offset;
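    // In the MODE register, bits 3:0 hold the FP rounding mode and bits 7:4
    // the FP denorm mode; the shift by 4 further down relies on this layout.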
7123
7124 if (getSubtarget()->hasDenormModeInst()) {
7125 unsigned SetDenormOp = 0;
7126 unsigned SetRoundOp = 0;
7127
7128 // The dedicated instructions can only set the whole denorm or round mode
7129 // at once, not a subset of bits in either.
7130       if (SetMask ==
7131           (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
7132 // If this fully sets both the round and denorm mode, emit the two
7133 // dedicated instructions for these.
7134 SetRoundOp = AMDGPU::S_ROUND_MODE;
7135 SetDenormOp = AMDGPU::S_DENORM_MODE;
7136 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
7137 SetRoundOp = AMDGPU::S_ROUND_MODE;
7138 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
7139 SetDenormOp = AMDGPU::S_DENORM_MODE;
7140 }
7141
7142 if (SetRoundOp || SetDenormOp) {
7143 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
7144 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
7145 unsigned ImmVal = Def->getOperand(1).getImm();
7146 if (SetRoundOp) {
7147 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
7148 .addImm(ImmVal & 0xf);
7149
7150 // If we also have the denorm mode, get just the denorm mode bits.
7151 ImmVal >>= 4;
7152 }
7153
7154 if (SetDenormOp) {
7155 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
7156 .addImm(ImmVal & 0xf);
7157 }
7158
7159 MI.eraseFromParent();
7160 return BB;
7161 }
7162 }
7163 }
7164
7165     // If only FP bits are touched, use the no-side-effects pseudo.
7166 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
7167 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
7168 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
7169
7170 return BB;
7171 }
7172 case AMDGPU::S_INVERSE_BALLOT_U32:
7173 case AMDGPU::S_INVERSE_BALLOT_U64:
7174 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
7175 // necessary. After that they are equivalent to a COPY.
7176 MI.setDesc(TII->get(AMDGPU::COPY));
7177 return BB;
7178 case AMDGPU::ENDPGM_TRAP: {
7179 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
7180 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
7181 MI.addOperand(MachineOperand::CreateImm(0));
7182 return BB;
7183 }
7184
7185 // We need a block split to make the real endpgm a terminator. We also don't
7186 // want to break phis in successor blocks, so we can't just delete to the
7187 // end of the block.
7188
7189 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
7190     MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7191     MF->push_back(TrapBB);
7192 // clang-format off
7193 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
7194 .addImm(0);
7195 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
7196 .addMBB(TrapBB);
7197 // clang-format on
7198
7199 BB->addSuccessor(TrapBB);
7200 MI.eraseFromParent();
7201 return SplitBB;
7202 }
7203 case AMDGPU::SIMULATED_TRAP: {
7204 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7205 MachineBasicBlock *SplitBB =
7206 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
7207 MI.eraseFromParent();
7208 return SplitBB;
7209 }
7210 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7211 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7212     assert(MFI->isWholeWaveFunction());
7213
7214 // During ISel, it's difficult to propagate the original EXEC mask to use as
7215 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
7216 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
7217 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7218 Register OriginalExec = Setup->getOperand(0).getReg();
7219 MF->getRegInfo().clearKillFlags(OriginalExec);
7220 MI.getOperand(0).setReg(OriginalExec);
7221 return BB;
7222 }
7223 default:
7224 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
7225 if (!MI.mayStore())
7226         AddMemOpInit(MI);
7227       return BB;
7228 }
7229     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
7230   }
7231}
7232
7233 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
7234   // This currently forces unfolding various combinations of fsub into fma with
7235 // free fneg'd operands. As long as we have fast FMA (controlled by
7236 // isFMAFasterThanFMulAndFAdd), we should perform these.
7237
7238 // When fma is quarter rate, for f64 where add / sub are at best half rate,
7239 // most of these combines appear to be cycle neutral but save on instruction
7240 // count / code size.
7241 return true;
7242}
7243
7244 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
7245
7246 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
7247                                          EVT VT) const {
7248 if (!VT.isVector()) {
7249 return MVT::i1;
7250 }
7251 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
7252}
7253
7254 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
7255   // TODO: Should i16 be used always if legal? For now it would force VALU
7256 // shifts.
7257 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7258}
7259
7260 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
7261   return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7262 ? Ty.changeElementSize(16)
7263 : Ty.changeElementSize(32);
7264}
7265
7266 // Answering this is somewhat tricky and depends on the specific device, as
7267 // different devices have different rates for fma or all f64 operations.
7268//
7269// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
7270// regardless of which device (although the number of cycles differs between
7271// devices), so it is always profitable for f64.
7272//
7273// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
7274// only on full rate devices. Normally, we should prefer selecting v_mad_f32
7275// which we can always do even without fused FP ops since it returns the same
7276// result as the separate operations and since it is always full
7277// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
7278// however does not support denormals, so we do report fma as faster if we have
7279// a fast fma device and require denormals.
7280//
7281 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7282                                                   EVT VT) const {
7283 VT = VT.getScalarType();
7284
7285 switch (VT.getSimpleVT().SimpleTy) {
7286 case MVT::f32: {
7287 // If mad is not available this depends only on if f32 fma is full rate.
7288 if (!Subtarget->hasMadMacF32Insts())
7289 return Subtarget->hasFastFMAF32();
7290
7291 // Otherwise f32 mad is always full rate and returns the same result as
7292 // the separate operations so should be preferred over fma.
7293     // However, it does not support denormals.
7294     if (!denormalModeIsFlushAllF32(MF))
7295       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7296
7297 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
7298 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7299 }
7300 case MVT::f64:
7301 return true;
7302 case MVT::f16:
7303 case MVT::bf16:
7304 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
7305 default:
7306 break;
7307 }
7308
7309 return false;
7310}
7311
7312 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7313                                                   LLT Ty) const {
7314 switch (Ty.getScalarSizeInBits()) {
7315 case 16:
7316 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
7317 case 32:
7318 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
7319 case 64:
7320 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
7321 default:
7322 break;
7323 }
7324
7325 return false;
7326}
7327
7328 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
7329   if (!Ty.isScalar())
7330 return false;
7331
7332 if (Ty.getScalarSizeInBits() == 16)
7333 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
7334 if (Ty.getScalarSizeInBits() == 32)
7335 return Subtarget->hasMadMacF32Insts() &&
7336 denormalModeIsFlushAllF32(*MI.getMF());
7337
7338 return false;
7339}
7340
7341 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
7342                                    const SDNode *N) const {
7343 // TODO: Check future ftz flag
7344 // v_mad_f32/v_mac_f32 do not support denormals.
7345 EVT VT = N->getValueType(0);
7346 if (VT == MVT::f32)
7347 return Subtarget->hasMadMacF32Insts() &&
7348            denormalModeIsFlushAllF32(DAG.getMachineFunction());
7349   if (VT == MVT::f16) {
7350 return Subtarget->hasMadF16() &&
7351            denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
7352   }
7353
7354 return false;
7355}
7356
7357//===----------------------------------------------------------------------===//
7358// Custom DAG Lowering Operations
7359//===----------------------------------------------------------------------===//
7360
7361// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7362// wider vector type is legal.
7363 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
7364                                              SelectionDAG &DAG) const {
7365 unsigned Opc = Op.getOpcode();
7366 EVT VT = Op.getValueType();
7367 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7368 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7369 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7370 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7371 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7372 VT == MVT::v32bf16);
7373
7374 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7375
7376 SDLoc SL(Op);
7377 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
7378 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
7379
7380 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7381}
7382
7383// Enable lowering of ROTR for vxi32 types. This is a workaround for a
7384// regression whereby extra unnecessary instructions were added to codegen
7385 // for rotr operations, caused by legalising v2i32 or. This resulted in extra
7386 // instructions to extract the result from the vector.
7387 SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
7388   [[maybe_unused]] EVT VT = Op.getValueType();
7389
7390 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7391 VT == MVT::v16i32) &&
7392 "Unexpected ValueType.");
7393
7394 return DAG.UnrollVectorOp(Op.getNode());
7395}
7396
7397// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7398// wider vector type is legal.
7399 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
7400                                               SelectionDAG &DAG) const {
7401 unsigned Opc = Op.getOpcode();
7402 EVT VT = Op.getValueType();
7403 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7404 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7405 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7406 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7407 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7408 VT == MVT::v32bf16);
7409
7410 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
7411 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7412
7413 SDLoc SL(Op);
7414
7415 SDValue OpLo =
7416 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
7417 SDValue OpHi =
7418 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
7419
7420 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7421}
7422
7423 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
7424                                                SelectionDAG &DAG) const {
7425 unsigned Opc = Op.getOpcode();
7426 EVT VT = Op.getValueType();
7427 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7428 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7429 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7430 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7431 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7432 VT == MVT::v32bf16);
7433
7434 SDValue Op0 = Op.getOperand(0);
7435 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7436 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7437 : std::pair(Op0, Op0);
7438
7439 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7440 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7441
7442 SDLoc SL(Op);
7443 auto ResVT = DAG.GetSplitDestVTs(VT);
7444
7445 SDValue OpLo =
7446 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7447 SDValue OpHi =
7448 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7449
7450 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7451}
7452
7453 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7454   switch (Op.getOpcode()) {
7455   default:
7456     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
7457 case ISD::BRCOND:
7458 return LowerBRCOND(Op, DAG);
7459 case ISD::RETURNADDR:
7460 return LowerRETURNADDR(Op, DAG);
7461 case ISD::SPONENTRY:
7462 return LowerSPONENTRY(Op, DAG);
7463 case ISD::LOAD: {
7464 SDValue Result = LowerLOAD(Op, DAG);
7465 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7466 "Load should return a value and a chain");
7467 return Result;
7468 }
7469 case ISD::FSQRT: {
7470 EVT VT = Op.getValueType();
7471 if (VT == MVT::f32)
7472 return lowerFSQRTF32(Op, DAG);
7473 if (VT == MVT::f64)
7474 return lowerFSQRTF64(Op, DAG);
7475 return SDValue();
7476 }
7477 case ISD::FSIN:
7478 case ISD::FCOS:
7479 return LowerTrig(Op, DAG);
7480 case ISD::SELECT:
7481 return LowerSELECT(Op, DAG);
7482 case ISD::FDIV:
7483 return LowerFDIV(Op, DAG);
7484 case ISD::FFREXP:
7485 return LowerFFREXP(Op, DAG);
7486   case ISD::ATOMIC_CMP_SWAP:
7487     return LowerATOMIC_CMP_SWAP(Op, DAG);
7488 case ISD::STORE:
7489 return LowerSTORE(Op, DAG);
7490   case ISD::GlobalAddress: {
7491     MachineFunction &MF = DAG.getMachineFunction();
7492     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7493     return LowerGlobalAddress(MFI, Op, DAG);
7494   }
7495   case ISD::ExternalSymbol:
7496     return LowerExternalSymbol(Op, DAG);
7497   case ISD::INTRINSIC_WO_CHAIN:
7498     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7499   case ISD::INTRINSIC_W_CHAIN:
7500     return LowerINTRINSIC_W_CHAIN(Op, DAG);
7501   case ISD::INTRINSIC_VOID:
7502     return LowerINTRINSIC_VOID(Op, DAG);
7503 case ISD::ADDRSPACECAST:
7504 return lowerADDRSPACECAST(Op, DAG);
7505   case ISD::INSERT_SUBVECTOR:
7506     return lowerINSERT_SUBVECTOR(Op, DAG);
7507   case ISD::INSERT_VECTOR_ELT:
7508     return lowerINSERT_VECTOR_ELT(Op, DAG);
7509   case ISD::EXTRACT_VECTOR_ELT:
7510     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7511   case ISD::VECTOR_SHUFFLE:
7512     return lowerVECTOR_SHUFFLE(Op, DAG);
7513   case ISD::SCALAR_TO_VECTOR:
7514     return lowerSCALAR_TO_VECTOR(Op, DAG);
7515 case ISD::BUILD_VECTOR:
7516 return lowerBUILD_VECTOR(Op, DAG);
7517 case ISD::FP_ROUND:
7518   case ISD::STRICT_FP_ROUND:
7519     return lowerFP_ROUND(Op, DAG);
7520 case ISD::TRAP:
7521 return lowerTRAP(Op, DAG);
7522 case ISD::DEBUGTRAP:
7523 return lowerDEBUGTRAP(Op, DAG);
7524 case ISD::ABS:
7525 case ISD::FABS:
7526 case ISD::FNEG:
7527 case ISD::FCANONICALIZE:
7528 case ISD::BSWAP:
7529 return splitUnaryVectorOp(Op, DAG);
7530 case ISD::FMINNUM:
7531 case ISD::FMAXNUM:
7532 return lowerFMINNUM_FMAXNUM(Op, DAG);
7533 case ISD::FMINIMUMNUM:
7534 case ISD::FMAXIMUMNUM:
7535 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7536 case ISD::FMINIMUM:
7537 case ISD::FMAXIMUM:
7538 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7539 case ISD::FLDEXP:
7540 case ISD::STRICT_FLDEXP:
7541 return lowerFLDEXP(Op, DAG);
7542 case ISD::FMA:
7543 return splitTernaryVectorOp(Op, DAG);
7544 case ISD::FP_TO_SINT:
7545 case ISD::FP_TO_UINT:
7546 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7547 Op.getValueType() == MVT::i16 &&
7548 Op.getOperand(0).getValueType() == MVT::f32) {
7549 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7550 return Op;
7551 }
7552 return LowerFP_TO_INT(Op, DAG);
7553 case ISD::SHL:
7554 case ISD::SRA:
7555 case ISD::SRL:
7556 case ISD::ADD:
7557 case ISD::SUB:
7558 case ISD::SMIN:
7559 case ISD::SMAX:
7560 case ISD::UMIN:
7561 case ISD::UMAX:
7562 case ISD::FADD:
7563 case ISD::FMUL:
7564 case ISD::FMINNUM_IEEE:
7565 case ISD::FMAXNUM_IEEE:
7566 case ISD::UADDSAT:
7567 case ISD::USUBSAT:
7568 case ISD::SADDSAT:
7569 case ISD::SSUBSAT:
7570 return splitBinaryVectorOp(Op, DAG);
7571 case ISD::FCOPYSIGN:
7572 return lowerFCOPYSIGN(Op, DAG);
7573 case ISD::MUL:
7574 return lowerMUL(Op, DAG);
7575 case ISD::SMULO:
7576 case ISD::UMULO:
7577 return lowerXMULO(Op, DAG);
7578 case ISD::SMUL_LOHI:
7579 case ISD::UMUL_LOHI:
7580 return lowerXMUL_LOHI(Op, DAG);
7581   case ISD::DYNAMIC_STACKALLOC:
7582     return LowerDYNAMIC_STACKALLOC(Op, DAG);
7583 case ISD::STACKSAVE:
7584 return LowerSTACKSAVE(Op, DAG);
7585 case ISD::GET_ROUNDING:
7586 return lowerGET_ROUNDING(Op, DAG);
7587 case ISD::SET_ROUNDING:
7588 return lowerSET_ROUNDING(Op, DAG);
7589 case ISD::PREFETCH:
7590 return lowerPREFETCH(Op, DAG);
7591 case ISD::FP_EXTEND:
7592   case ISD::STRICT_FP_EXTEND:
7593     return lowerFP_EXTEND(Op, DAG);
7594 case ISD::GET_FPENV:
7595 return lowerGET_FPENV(Op, DAG);
7596 case ISD::SET_FPENV:
7597 return lowerSET_FPENV(Op, DAG);
7598 case ISD::ROTR:
7599 return lowerROTR(Op, DAG);
7600 }
7601 return SDValue();
7602}
7603
7604// Used for D16: Casts the result of an instruction into the right vector,
7605// packs values if loads return unpacked values.
7606 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7607                                        const SDLoc &DL, SelectionDAG &DAG,
7608 bool Unpacked) {
7609 if (!LoadVT.isVector())
7610 return Result;
7611
7612 // Cast back to the original packed type or to a larger type that is a
7613   // multiple of 32 bits for D16. Widening the return type is required for
7614 // legalization.
7615 EVT FittingLoadVT = LoadVT;
7616 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7617 FittingLoadVT =
7618         EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7619                          LoadVT.getVectorNumElements() + 1);
7620 }
7621
7622 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7623 // Truncate to v2i16/v4i16.
7624 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7625
7626 // Workaround legalizer not scalarizing truncate after vector op
7627 // legalization but not creating intermediate vector trunc.
7628     SmallVector<SDValue, 4> Elts;
7629     DAG.ExtractVectorElements(Result, Elts);
7630 for (SDValue &Elt : Elts)
7631 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7632
7633     // Pad illegal v1i16/v3f16 to v4i16
7634 if ((LoadVT.getVectorNumElements() % 2) == 1)
7635 Elts.push_back(DAG.getPOISON(MVT::i16));
7636
7637 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7638
7639 // Bitcast to original type (v2f16/v4f16).
7640 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7641 }
7642
7643 // Cast back to the original packed type.
7644 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7645}
7646
7647SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7648 SelectionDAG &DAG,
7649                                               ArrayRef<SDValue> Ops,
7650                                               bool IsIntrinsic) const {
7651 SDLoc DL(M);
7652
7653 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7654 EVT LoadVT = M->getValueType(0);
7655
7656 EVT EquivLoadVT = LoadVT;
7657 if (LoadVT.isVector()) {
7658 if (Unpacked) {
7659 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7660 LoadVT.getVectorNumElements());
7661 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7662 // Widen v3f16 to legal type
7663 EquivLoadVT =
7664           EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7665                            LoadVT.getVectorNumElements() + 1);
7666 }
7667 }
7668
7669 // Change from v4f16/v2f16 to EquivLoadVT.
7670 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7671
7672   SDValue Load = DAG.getMemIntrinsicNode(
7673       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7674 M->getMemoryVT(), M->getMemOperand());
7675
7676 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7677
7678 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7679}
7680
7681SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7682 SelectionDAG &DAG,
7683 ArrayRef<SDValue> Ops) const {
7684 SDLoc DL(M);
7685 EVT LoadVT = M->getValueType(0);
7686 EVT EltType = LoadVT.getScalarType();
7687 EVT IntVT = LoadVT.changeTypeToInteger();
7688
7689 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7690
7691 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7692 bool IsTFE = M->getNumValues() == 3;
7693
7694 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7695 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7696 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7697 : AMDGPUISD::BUFFER_LOAD;
7698
7699 if (IsD16) {
7700 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7701 }
7702
7703 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7704 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7705 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7706 IsTFE);
7707
7708 if (isTypeLegal(LoadVT)) {
7709 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7710 M->getMemOperand(), DAG);
7711 }
7712
7713 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7714 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7715 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7716 M->getMemOperand(), DAG);
7717 return DAG.getMergeValues(
7718 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7719 DL);
7720}
7721
7722 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7723                                   SelectionDAG &DAG) {
7724 EVT VT = N->getValueType(0);
7725 unsigned CondCode = N->getConstantOperandVal(3);
7726 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7727 return DAG.getPOISON(VT);
7728
7729 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7730
7731 SDValue LHS = N->getOperand(1);
7732 SDValue RHS = N->getOperand(2);
7733
7734 SDLoc DL(N);
7735
7736 EVT CmpVT = LHS.getValueType();
7737 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7738 unsigned PromoteOp =
7739         ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7740     LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7741 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7742 }
7743
7744 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7745
7746 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7747 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
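  // AMDGPUISD::SETCC produces a lane mask, one result bit per lane, hence the
  // wavefront-sized integer type.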
7748
7749 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7750 DAG.getCondCode(CCOpcode));
7751 if (VT.bitsEq(CCVT))
7752 return SetCC;
7753 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7754}
7755
7756 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7757                                   SelectionDAG &DAG) {
7758 EVT VT = N->getValueType(0);
7759
7760 unsigned CondCode = N->getConstantOperandVal(3);
7761 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7762 return DAG.getPOISON(VT);
7763
7764 SDValue Src0 = N->getOperand(1);
7765 SDValue Src1 = N->getOperand(2);
7766 EVT CmpVT = Src0.getValueType();
7767 SDLoc SL(N);
7768
7769 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7770 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7771 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7772 }
7773
7774 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7775 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7776 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7777 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7778 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7779 DAG.getCondCode(CCOpcode));
7780 if (VT.bitsEq(CCVT))
7781 return SetCC;
7782 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7783}
7784
7785 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7786                                     SelectionDAG &DAG) {
7787 EVT VT = N->getValueType(0);
7788 SDValue Src = N->getOperand(1);
7789 SDLoc SL(N);
7790
7791 if (Src.getOpcode() == ISD::SETCC) {
7792 SDValue Op0 = Src.getOperand(0);
7793 SDValue Op1 = Src.getOperand(1);
7794 // Need to expand bfloat to float for comparison (setcc).
7795 if (Op0.getValueType() == MVT::bf16) {
7796 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7797 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7798 }
7799 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7800 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7801 }
7802 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7803 // (ballot 0) -> 0
7804 if (Arg->isZero())
7805 return DAG.getConstant(0, SL, VT);
7806
7807 // (ballot 1) -> EXEC/EXEC_LO
7808 if (Arg->isOne()) {
7809 Register Exec;
7810 if (VT.getScalarSizeInBits() == 32)
7811 Exec = AMDGPU::EXEC_LO;
7812 else if (VT.getScalarSizeInBits() == 64)
7813 Exec = AMDGPU::EXEC;
7814 else
7815 return SDValue();
7816
7817 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7818 }
7819 }
7820
7821 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7822 // ISD::SETNE)
7823 return DAG.getNode(
7824 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7825 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7826}
7827
7828 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7829                            SelectionDAG &DAG) {
7830 EVT VT = N->getValueType(0);
7831 unsigned ValSize = VT.getSizeInBits();
7832 unsigned IID = N->getConstantOperandVal(0);
7833 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7834 IID == Intrinsic::amdgcn_permlanex16;
7835 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7836 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7837 SDLoc SL(N);
7838 MVT IntVT = MVT::getIntegerVT(ValSize);
7839 const GCNSubtarget *ST = TLI.getSubtarget();
7840 unsigned SplitSize = 32;
7841 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7842 ST->hasDPALU_DPP() &&
7843 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7844 SplitSize = 64;
7845
7846 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7847 SDValue Src2, MVT ValT) -> SDValue {
7848 SmallVector<SDValue, 8> Operands;
7849 switch (IID) {
7850 case Intrinsic::amdgcn_permlane16:
7851 case Intrinsic::amdgcn_permlanex16:
7852 case Intrinsic::amdgcn_update_dpp:
7853 Operands.push_back(N->getOperand(6));
7854 Operands.push_back(N->getOperand(5));
7855 Operands.push_back(N->getOperand(4));
7856 [[fallthrough]];
7857 case Intrinsic::amdgcn_writelane:
7858 Operands.push_back(Src2);
7859 [[fallthrough]];
7860 case Intrinsic::amdgcn_readlane:
7861 case Intrinsic::amdgcn_set_inactive:
7862 case Intrinsic::amdgcn_set_inactive_chain_arg:
7863 case Intrinsic::amdgcn_mov_dpp8:
7864 Operands.push_back(Src1);
7865 [[fallthrough]];
7866 case Intrinsic::amdgcn_readfirstlane:
7867 case Intrinsic::amdgcn_permlane64:
7868 Operands.push_back(Src0);
7869 break;
7870 default:
7871 llvm_unreachable("unhandled lane op");
7872 }
7873
7874 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
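    // The fallthrough cascade above pushed the operands last-to-first (with
    // the IID pushed last), so reverse to restore the expected order, IID
    // first.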
7875 std::reverse(Operands.begin(), Operands.end());
7876
7877 if (SDNode *GL = N->getGluedNode()) {
7878 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7879 GL = GL->getOperand(0).getNode();
7880 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7881 SDValue(GL, 0)));
7882 }
7883
7884 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7885 };
7886
7887 SDValue Src0 = N->getOperand(1);
7888 SDValue Src1, Src2;
7889 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7890 IID == Intrinsic::amdgcn_mov_dpp8 ||
7891 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7892 Src1 = N->getOperand(2);
7893 if (IID == Intrinsic::amdgcn_writelane ||
7894 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7895 Src2 = N->getOperand(3);
7896 }
7897
7898 if (ValSize == SplitSize) {
7899 // Already legal
7900 return SDValue();
7901 }
7902
7903 if (ValSize < 32) {
7904 bool IsFloat = VT.isFloatingPoint();
7905 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7906 SL, MVT::i32);
7907
7908 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7909 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7910 SL, MVT::i32);
7911 }
7912
7913 if (IID == Intrinsic::amdgcn_writelane) {
7914 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7915 SL, MVT::i32);
7916 }
7917
7918 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7919 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7920 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7921 }
7922
7923 if (ValSize % SplitSize != 0)
7924 return SDValue();
7925
7926 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7927 EVT VT = N->getValueType(0);
7928 unsigned NE = VT.getVectorNumElements();
7929 EVT EltVT = VT.getVectorElementType();
7930     SmallVector<SDValue, 8> Scalars;
7931     unsigned NumOperands = N->getNumOperands();
7932 SmallVector<SDValue, 4> Operands(NumOperands);
7933 SDNode *GL = N->getGluedNode();
7934
7935 // only handle convergencectrl_glue
7936     assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7937
7938 for (unsigned i = 0; i != NE; ++i) {
7939 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7940 ++j) {
7941 SDValue Operand = N->getOperand(j);
7942 EVT OperandVT = Operand.getValueType();
7943 if (OperandVT.isVector()) {
7944 // A vector operand; extract a single element.
7945 EVT OperandEltVT = OperandVT.getVectorElementType();
7946 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7947 Operand, DAG.getVectorIdxConstant(i, SL));
7948 } else {
7949 // A scalar operand; just use it as is.
7950 Operands[j] = Operand;
7951 }
7952 }
7953
7954 if (GL)
7955 Operands[NumOperands - 1] =
7956 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7957 SDValue(GL->getOperand(0).getNode(), 0));
7958
7959 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7960 }
7961
7962 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7963 return DAG.getBuildVector(VecVT, SL, Scalars);
7964 };
7965
7966 if (VT.isVector()) {
7967 switch (MVT::SimpleValueType EltTy =
7968                 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7969     case MVT::i32:
7970 case MVT::f32:
7971 if (SplitSize == 32) {
7972 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7973 return unrollLaneOp(LaneOp.getNode());
7974 }
7975 [[fallthrough]];
7976 case MVT::i16:
7977 case MVT::f16:
7978 case MVT::bf16: {
7979 unsigned SubVecNumElt =
7980 SplitSize / VT.getVectorElementType().getSizeInBits();
7981 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7982       SmallVector<SDValue, 4> Pieces;
7983       SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7984 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7985 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7986 DAG.getConstant(EltIdx, SL, MVT::i32));
7987
7988 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7989 IsPermLane16)
7990 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7991 DAG.getConstant(EltIdx, SL, MVT::i32));
7992
7993 if (IID == Intrinsic::amdgcn_writelane)
7994 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7995 DAG.getConstant(EltIdx, SL, MVT::i32));
7996
7997 Pieces.push_back(
7998 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7999 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
8000 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8001 EltIdx += SubVecNumElt;
8002 }
8003 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
8004 }
8005 default:
8006 // Handle all other cases by bitcasting to i32 vectors
8007 break;
8008 }
8009 }
8010
8011 MVT VecVT =
8012 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
8013 Src0 = DAG.getBitcast(VecVT, Src0);
8014
8015 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8016 Src1 = DAG.getBitcast(VecVT, Src1);
8017
8018 if (IID == Intrinsic::amdgcn_writelane)
8019 Src2 = DAG.getBitcast(VecVT, Src2);
8020
8021 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8022 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
8023 return DAG.getBitcast(VT, UnrolledLaneOp);
8024}
8025
8027 SelectionDAG &DAG) {
8028 EVT VT = N->getValueType(0);
8029
8030 if (VT.getSizeInBits() != 32)
8031 return SDValue();
8032
8033 SDLoc SL(N);
8034
8035 SDValue Value = N->getOperand(1);
8036 SDValue Index = N->getOperand(2);
8037
8038 // ds_bpermute requires index to be multiplied by 4
8039 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
8040 SDValue ShiftedIndex =
8041 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
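  // (the index is a byte offset selecting a dword lane, hence the shift by 2)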
8042
8043   // The intrinsics operate on i32 values, so bitcast the input first.
8044 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
8045
8046 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
8047 SmallVector<SDValue> IntrinArgs) -> SDValue {
8048 SmallVector<SDValue> Operands(1);
8049 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
8050 Operands.append(IntrinArgs);
8051 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
8052 };
8053
8054 // If we can bpermute across the whole wave, then just do that
8056 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8057 {ShiftedIndex, ValueI32});
8058 return DAG.getBitcast(VT, BPermute);
8059 }
8060
8061 assert(TLI.getSubtarget()->isWave64());
8062
8063 // Otherwise, we need to make use of whole wave mode
8064 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
8065
8066 // Set inactive lanes to poison
8067 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8068 {ValueI32, PoisonVal});
8069 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8070 {ShiftedIndex, PoisonVal});
8071
8072 SDValue Swapped =
8073 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8074
8075 // Get permutation of each half, then we'll select which one to use
8076 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8077 {WWMIndex, WWMValue});
8078 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8079 MVT::i32, {WWMIndex, Swapped});
8080 SDValue BPermOtherHalfWWM =
8081 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
8082
8083 // Select which side to take the permute from
8084 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
8085 // We can get away with only using mbcnt_lo here since we're only
8086 // trying to detect which side of 32 each lane is on, and mbcnt_lo
8087 // returns 32 for lanes 32-63.
8088 SDValue ThreadID =
8089 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8090 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
8091
8092 SDValue SameOrOtherHalf =
8093 DAG.getNode(ISD::AND, SL, MVT::i32,
8094 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
8095 DAG.getTargetConstant(32, SL, MVT::i32));
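  // Bit 5 of (ThreadID ^ Index) is set iff the source lane lies in the other
  // 32-lane half, so a zero result means the same-half permute is the one to
  // use.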
8096 SDValue UseSameHalf =
8097 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
8098 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
8099 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
8100 BPermOtherHalfWWM);
8101 return DAG.getBitcast(VT, Result);
8102}
8103
8104 void SITargetLowering::ReplaceNodeResults(SDNode *N,
8105                                           SmallVectorImpl<SDValue> &Results,
8106                                           SelectionDAG &DAG) const {
8107 switch (N->getOpcode()) {
8108   case ISD::INSERT_VECTOR_ELT: {
8109     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
8110 Results.push_back(Res);
8111 return;
8112 }
8113   case ISD::EXTRACT_VECTOR_ELT: {
8114     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
8115 Results.push_back(Res);
8116 return;
8117 }
8118   case ISD::INTRINSIC_WO_CHAIN: {
8119     unsigned IID = N->getConstantOperandVal(0);
8120 switch (IID) {
8121 case Intrinsic::amdgcn_make_buffer_rsrc:
8122 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
8123 return;
8124 case Intrinsic::amdgcn_cvt_pkrtz: {
8125 SDValue Src0 = N->getOperand(1);
8126 SDValue Src1 = N->getOperand(2);
8127 SDLoc SL(N);
8128 SDValue Cvt =
8129 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
8130 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
8131 return;
8132 }
8133 case Intrinsic::amdgcn_cvt_pknorm_i16:
8134 case Intrinsic::amdgcn_cvt_pknorm_u16:
8135 case Intrinsic::amdgcn_cvt_pk_i16:
8136 case Intrinsic::amdgcn_cvt_pk_u16: {
8137 SDValue Src0 = N->getOperand(1);
8138 SDValue Src1 = N->getOperand(2);
8139 SDLoc SL(N);
8140 unsigned Opcode;
8141
8142 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8143 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8144 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8145 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8146 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8147 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8148 else
8149 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8150
8151 EVT VT = N->getValueType(0);
8152 if (isTypeLegal(VT))
8153 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
8154 else {
8155 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
8156 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
8157 }
8158 return;
8159 }
8160 case Intrinsic::amdgcn_s_buffer_load: {
8161 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
8162 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
8163 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
8164     // combiner tries to merge the s_buffer_load_u8 with a sext instruction
8165     // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
8165 // s_buffer_load_i8.
8166 if (!Subtarget->hasScalarSubwordLoads())
8167 return;
8168 SDValue Op = SDValue(N, 0);
8169 SDValue Rsrc = Op.getOperand(1);
8170 SDValue Offset = Op.getOperand(2);
8171 SDValue CachePolicy = Op.getOperand(3);
8172 EVT VT = Op.getValueType();
8173 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
8174     SDLoc DL(Op);
8175     MachineFunction &MF = DAG.getMachineFunction();
8176     const DataLayout &DataLayout = DAG.getDataLayout();
8177     Align Alignment =
8178         DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8179     MachineMemOperand *MMO = MF.getMachineMemOperand(
8180         MachinePointerInfo(),
8181         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8182             MachineMemOperand::MOInvariant,
8183         VT.getStoreSize(), Alignment);
8184 SDValue LoadVal;
8185 if (!Offset->isDivergent()) {
8186 SDValue Ops[] = {Rsrc, // source register
8187 Offset, CachePolicy};
8188 SDValue BufferLoad =
8189 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
8190 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8191 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8192 } else {
8193 SDValue Ops[] = {
8194 DAG.getEntryNode(), // Chain
8195 Rsrc, // rsrc
8196 DAG.getConstant(0, DL, MVT::i32), // vindex
8197 {}, // voffset
8198 {}, // soffset
8199 {}, // offset
8200 CachePolicy, // cachepolicy
8201 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8202 };
8203 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8204 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8205 }
8206 Results.push_back(LoadVal);
8207 return;
8208 }
8209 case Intrinsic::amdgcn_dead: {
8210 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
8211 Results.push_back(DAG.getPOISON(N->getValueType(I)));
8212 return;
8213 }
8214 }
8215 break;
8216 }
8217   case ISD::INTRINSIC_W_CHAIN: {
8218     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
8219 if (Res.getOpcode() == ISD::MERGE_VALUES) {
8220 // FIXME: Hacky
8221 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
8222 Results.push_back(Res.getOperand(I));
8223 }
8224 } else {
8225 Results.push_back(Res);
8226 Results.push_back(Res.getValue(1));
8227 }
8228 return;
8229 }
8230
8231 break;
8232 }
8233 case ISD::SELECT: {
8234 SDLoc SL(N);
8235 EVT VT = N->getValueType(0);
8236 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
8237 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
8238 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
8239
8240 EVT SelectVT = NewVT;
8241 if (NewVT.bitsLT(MVT::i32)) {
8242 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
8243 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
8244 SelectVT = MVT::i32;
8245 }
8246
8247 SDValue NewSelect =
8248 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
8249
8250 if (NewVT != SelectVT)
8251 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
8252 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
8253 return;
8254 }
8255 case ISD::FNEG: {
8256 if (N->getValueType(0) != MVT::v2f16)
8257 break;
8258
8259 SDLoc SL(N);
8260 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8261
8262 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
8263 DAG.getConstant(0x80008000, SL, MVT::i32));
8264 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8265 return;
8266 }
8267 case ISD::FABS: {
8268 if (N->getValueType(0) != MVT::v2f16)
8269 break;
8270
8271 SDLoc SL(N);
8272 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8273
8274 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
8275 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
8276 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8277 return;
8278 }
8279 case ISD::FSQRT: {
8280 if (N->getValueType(0) != MVT::f16)
8281 break;
8282 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
8283 break;
8284 }
8285 default:
8286     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
8287     break;
8288 }
8289}
8290
8291/// Helper function for LowerBRCOND
8292static SDNode *findUser(SDValue Value, unsigned Opcode) {
8293
8294 for (SDUse &U : Value->uses()) {
8295 if (U.get() != Value)
8296 continue;
8297
8298 if (U.getUser()->getOpcode() == Opcode)
8299 return U.getUser();
8300 }
8301 return nullptr;
8302}
8303
8304unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
8305 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
8306 switch (Intr->getConstantOperandVal(1)) {
8307 case Intrinsic::amdgcn_if:
8308 return AMDGPUISD::IF;
8309 case Intrinsic::amdgcn_else:
8310 return AMDGPUISD::ELSE;
8311 case Intrinsic::amdgcn_loop:
8312 return AMDGPUISD::LOOP;
8313 case Intrinsic::amdgcn_end_cf:
8314 llvm_unreachable("should not occur");
8315 default:
8316 return 0;
8317 }
8318 }
8319
8320 // break, if_break, else_break are all only used as inputs to loop, not
8321 // directly as branch conditions.
8322 return 0;
8323}
8324
8325 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
8326   const Triple &TT = getTargetMachine().getTargetTriple();
8327   return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8328           GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8329          AMDGPU::shouldEmitConstantsToTextSection(TT);
8330 }
8331
8332 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
8333 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8334 return false;
8335
8336 // FIXME: Either avoid relying on address space here or change the default
8337 // address space for functions to avoid the explicit check.
8338   return (GV->getValueType()->isFunctionTy() ||
8339           GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8340           GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS);
8341 }
8342
8343 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
8344   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
8345}
8346
8347 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
8348   if (!GV->hasExternalLinkage())
8349 return true;
8350
8351 const auto OS = getTargetMachine().getTargetTriple().getOS();
8352 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
8353}
8354
8355 /// This transforms the control flow intrinsics to get the branch destination
8356 /// as the last parameter; it also switches the branch target with BR if needed.
8357SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
8358 SDLoc DL(BRCOND);
8359
8360 SDNode *Intr = BRCOND.getOperand(1).getNode();
8361 SDValue Target = BRCOND.getOperand(2);
8362 SDNode *BR = nullptr;
8363 SDNode *SetCC = nullptr;
8364
8365 switch (Intr->getOpcode()) {
8366 case ISD::SETCC: {
8367 // As long as we negate the condition everything is fine
8368 SetCC = Intr;
8369 Intr = SetCC->getOperand(0).getNode();
8370 break;
8371 }
8372 case ISD::XOR: {
8373 // Similar to SETCC, if we have (xor c, -1), we will be fine.
8374 SDValue LHS = Intr->getOperand(0);
8375 SDValue RHS = Intr->getOperand(1);
8376 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
8377 Intr = LHS.getNode();
8378 break;
8379 }
8380 [[fallthrough]];
8381 }
8382 default: {
8383 // Get the target from BR if we don't negate the condition
8384 BR = findUser(BRCOND, ISD::BR);
8385 assert(BR && "brcond missing unconditional branch user");
8386 Target = BR->getOperand(1);
8387 }
8388 }
8389
8390 unsigned CFNode = isCFIntrinsic(Intr);
8391 if (CFNode == 0) {
8392 // This is a uniform branch so we don't need to legalize.
8393 return BRCOND;
8394 }
8395
8396 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
8397 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
8398
8399 assert(!SetCC ||
8400 (SetCC->getConstantOperandVal(1) == 1 &&
8401 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
8402 ISD::SETNE));
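 // i.e. the only condition negation we expect here is (setcc cond, 1, setne).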
8403
8404 // operands of the new intrinsic call
8405 SmallVector<SDValue, 4> Ops;
8406 if (HaveChain)
8407 Ops.push_back(BRCOND.getOperand(0));
8408
8409 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
8410 Ops.push_back(Target);
8411
8412 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
8413
8414 // build the new intrinsic call
8415 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
8416
8417 if (!HaveChain) {
8418 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
8419
8420 Result = DAG.getMergeValues(Ops, DL).getNode();
8421 }
8422
8423 if (BR) {
8424 // Give the branch instruction our target
8425 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
8426 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
8427 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
8428 }
8429
8430 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
8431
8432 // Copy the intrinsic results to registers
8433 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
8434 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
8435 if (!CopyToReg)
8436 continue;
8437
8438 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
8439 SDValue(Result, i - 1), SDValue());
8440
8441 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
8442 }
8443
8444 // Remove the old intrinsic from the chain
8445 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
8446 Intr->getOperand(0));
8447
8448 return Chain;
8449}
8450
8451SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8452 MVT VT = Op.getSimpleValueType();
8453 SDLoc DL(Op);
8454 // Check the depth: only depth 0 is supported; other depths return 0.
8455 if (Op.getConstantOperandVal(0) != 0)
8456 return DAG.getConstant(0, DL, VT);
8457
8458 MachineFunction &MF = DAG.getMachineFunction();
8459 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8460 // Check for kernel and shader functions
8461 if (Info->isEntryFunction())
8462 return DAG.getConstant(0, DL, VT);
8463
8464 MachineFrameInfo &MFI = MF.getFrameInfo();
8465 // There is a call to @llvm.returnaddress in this function
8466 MFI.setReturnAddressIsTaken(true);
8467
8468 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8469 // Get the return address reg and mark it as an implicit live-in
8470 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8471 getRegClassFor(VT, Op.getNode()->isDivergent()));
8472
8473 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8474}
8475
8476SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8477 MachineFunction &MF = DAG.getMachineFunction();
8478 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8479
8480 // For functions that set up their own stack, select the GET_STACK_BASE
8481 // pseudo.
8482 if (MFI->isBottomOfStack())
8483 return Op;
8484
8485 // For everything else, create a dummy stack object.
8486 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8487 return DAG.getFrameIndex(FI, Op.getValueType());
8488}
8489
8490SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8491 const SDLoc &DL, EVT VT) const {
8492 return Op.getValueType().bitsLE(VT)
8493 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8494 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8495 DAG.getTargetConstant(0, DL, MVT::i32));
8496}
8497
8498SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8499 SelectionDAG &DAG) const {
8500 EVT DstVT = Op.getValueType();
8501 unsigned NumElts = DstVT.getVectorNumElements();
8502 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8503
8504 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8505
8506 SDLoc DL(Op);
8507 unsigned Opc = Op.getOpcode();
8508 SDValue Flags = Op.getOperand(1);
8509 EVT HalfDstVT =
8510 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8511 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8512 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8513
8514 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8515}
8516
8517SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8518 SDValue Src = Op.getOperand(0);
8519 EVT SrcVT = Src.getValueType();
8520 EVT DstVT = Op.getValueType();
8521
8522 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8523 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8524 if (SrcVT.getScalarType() != MVT::f32)
8525 return SDValue();
8526 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8527 }
8528
8529 if (SrcVT.getScalarType() != MVT::f64)
8530 return Op;
8531
8532 SDLoc DL(Op);
8533 if (DstVT == MVT::f16) {
8534 // TODO: Handle strictfp
8535 if (Op.getOpcode() != ISD::FP_ROUND)
8536 return Op;
8537
8538 if (!Subtarget->has16BitInsts()) {
8539 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
8540 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8541 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8542 }
8543 if (Op->getFlags().hasApproximateFuncs()) {
8544 SDValue Flags = Op.getOperand(1);
8545 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
8546 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
8547 }
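 // Without the afn flag, rounding f64 -> f32 -> f16 could double-round,
 // so use the safe lowering, which produces a correctly rounded result.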
8548 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8549 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8550 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8551 }
8552
8553 assert(DstVT.getScalarType() == MVT::bf16 &&
8554 "custom lower FP_ROUND for f16 or bf16");
8555 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8556
8557 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8558 // hardware f32 -> bf16 instruction.
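 // Rounding to odd forces the sticky bit into the f32 LSB, so the
 // intermediate value never lands exactly on a bf16 rounding boundary and
 // the second rounding step cannot double-round.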
8559 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
8560 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
8561 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
8562 DAG.getTargetConstant(0, DL, MVT::i32));
8563}
8564
8565SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8566 SelectionDAG &DAG) const {
8567 EVT VT = Op.getValueType();
8568 const MachineFunction &MF = DAG.getMachineFunction();
8569 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8570 bool IsIEEEMode = Info->getMode().IEEE;
8571
8572 // FIXME: Assert during selection that this is only selected for
8573 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8574 // mode functions, but this happens to be OK since it's only done in cases
8575 // where it is known that there are no sNaNs.
8576 if (IsIEEEMode)
8577 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8578
8579 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8580 VT == MVT::v16bf16)
8581 return splitBinaryVectorOp(Op, DAG);
8582 return Op;
8583}
8584
8585SDValue
8586SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8587 SelectionDAG &DAG) const {
8588 EVT VT = Op.getValueType();
8589 const MachineFunction &MF = DAG.getMachineFunction();
8590 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8591 bool IsIEEEMode = Info->getMode().IEEE;
8592
8593 if (IsIEEEMode)
8594 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8595
8596 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8597 VT == MVT::v16bf16)
8598 return splitBinaryVectorOp(Op, DAG);
8599 return Op;
8600}
8601
8602SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8603 SelectionDAG &DAG) const {
8604 EVT VT = Op.getValueType();
8605 if (VT.isVector())
8606 return splitBinaryVectorOp(Op, DAG);
8607
8608 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8609 !Subtarget->hasMinimum3Maximum3F16() &&
8610 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8611 "should not need to widen f16 minimum/maximum to v2f16");
8612
8613 // Widen f16 operation to v2f16
8614
8615 // fminimum f16:x, f16:y ->
8616 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8617 // (v2f16 (scalar_to_vector y))), 0
8618 SDLoc SL(Op);
8619 SDValue WideSrc0 =
8620 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8621 SDValue WideSrc1 =
8622 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8623
8624 SDValue Widened =
8625 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8626
8627 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8628 DAG.getConstant(0, SL, MVT::i32));
8629}
8630
8631SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8632 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8633 EVT VT = Op.getValueType();
8634 assert(VT == MVT::f16);
8635
8636 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8637 EVT ExpVT = Exp.getValueType();
8638 if (ExpVT == MVT::i16)
8639 return Op;
8640
8641 SDLoc DL(Op);
8642
8643 // Correct the exponent type for f16 to i16.
8644 // Clamp the range of the exponent to the instruction's range.
8645
8646 // TODO: This should be a generic narrowing legalization, and can easily be
8647 // done for GlobalISel as well.
8648
8649 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8650 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8651
8652 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8653 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
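 // Clamping the exponent to [-32768, 32767] is lossless: exponents of that
 // magnitude already saturate an f16 ldexp to zero or infinity.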
8654
8655 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8656
8657 if (IsStrict) {
8658 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8659 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8660 }
8661
8662 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8663}
8664
8665static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
8666 switch (Op->getOpcode()) {
8667 case ISD::SRA:
8668 case ISD::SMIN:
8669 case ISD::SMAX:
8670 return ISD::SIGN_EXTEND;
8671 case ISD::SRL:
8672 case ISD::UMIN:
8673 case ISD::UMAX:
8674 return ISD::ZERO_EXTEND;
8675 case ISD::ADD:
8676 case ISD::SUB:
8677 case ISD::AND:
8678 case ISD::OR:
8679 case ISD::XOR:
8680 case ISD::SHL:
8681 case ISD::SELECT:
8682 case ISD::MUL:
8683 // The operation result won't be influenced by garbage high bits.
8684 // TODO: are all of those cases correct, and are there more?
8685 return ISD::ANY_EXTEND;
8686 case ISD::SETCC: {
8687 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8688 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8689 }
8690 default:
8691 llvm_unreachable("unexpected opcode!");
8692 }
8693}
8694
8695SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8696 DAGCombinerInfo &DCI) const {
8697 const unsigned Opc = Op.getOpcode();
8698 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8699 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8700 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8701 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8702 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8703
8704 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8705 : Op->getOperand(0).getValueType();
8706 auto &DAG = DCI.DAG;
8707 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8708
8709 if (DCI.isBeforeLegalizeOps() ||
8710 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8711 return SDValue();
8712
8713 SDLoc DL(Op);
8714 SDValue LHS;
8715 SDValue RHS;
8716 if (Opc == ISD::SELECT) {
8717 LHS = Op->getOperand(1);
8718 RHS = Op->getOperand(2);
8719 } else {
8720 LHS = Op->getOperand(0);
8721 RHS = Op->getOperand(1);
8722 }
8723
8724 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8725 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8726
8727 // Special case: for shifts, the RHS always needs a zext.
8728 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8729 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8730 else
8731 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8732
8733 // setcc always returns i1 (or a vector of i1), so no truncate is needed after.
8734 if (Opc == ISD::SETCC) {
8735 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8736 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8737 }
8738
8739 // For other ops, we extend the operation's return type as well so we need to
8740 // truncate back to the original type.
8741 SDValue NewVal;
8742 if (Opc == ISD::SELECT)
8743 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8744 else
8745 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8746
8747 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8748}
8749
8750SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8751 SDValue Mag = Op.getOperand(0);
8752 EVT MagVT = Mag.getValueType();
8753
8754 if (MagVT.getVectorNumElements() > 2)
8755 return splitBinaryVectorOp(Op, DAG);
8756
8757 SDValue Sign = Op.getOperand(1);
8758 EVT SignVT = Sign.getValueType();
8759
8760 if (MagVT == SignVT)
8761 return Op;
8762
8763 // fcopysign v2f16:mag, v2f32:sign ->
8764 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8765
8766 SDLoc SL(Op);
8767 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8768 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8769
8770 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8771
8772 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8773}
8774
8775// Custom lowering for vector multiplications and s_mul_u64.
8776SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8777 EVT VT = Op.getValueType();
8778
8779 // Split vector operands.
8780 if (VT.isVector())
8781 return splitBinaryVectorOp(Op, DAG);
8782
8783 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8784
8785 // There are four ways to lower s_mul_u64:
8786 //
8787 // 1. If all the operands are uniform, then we lower it as it is.
8788 //
8789 // 2. If the operands are divergent, then we have to split s_mul_u64 into
8790 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
8791 //
8792 // 3. If the cost model decides that it is more efficient to use vector
8793 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
8794 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
8795 //
8796 // 4. If the cost model decides to use vector registers and both of the
8797 // operands are zero-extended/sign-extended from 32 bits, then we split the
8798 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
8799 // possible to check whether the operands are zero-extended or sign-extended
8800 // in SIInstrInfo.cpp. For this reason, here we replace s_mul_u64 with
8801 // s_mul_u64_u32_pseudo if both operands are zero-extended, and with
8802 // s_mul_i64_i32_pseudo if both operands are sign-extended.
8803 // If the cost model decides that we have to use vector registers, then
8804 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
8805 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
8806 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8807 // s_mul_i64_i32_pseudo is lowered back to s_mul_u64 in expandPostRAPseudo()
8808 // in SIInstrInfo.cpp.
8809
8810 if (Op->isDivergent())
8811 return SDValue();
8812
8813 SDValue Op0 = Op.getOperand(0);
8814 SDValue Op1 = Op.getOperand(1);
8815 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
8816 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8817 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
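 // For example (illustrative IR, not from this file):
 //   %a64 = zext i32 %a to i64
 //   %b64 = zext i32 %b to i64
 //   %m = mul i64 %a64, %b64
 // computeKnownBits reports >= 32 leading zeros for both operands, so %m
 // becomes S_MUL_U64_U32_PSEUDO; the sign-extended analogue needs >= 33
 // sign bits and becomes S_MUL_I64_I32_PSEUDO.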
8818 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8819 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8820 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8821 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8822 SDLoc SL(Op);
8823 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8824 return SDValue(
8825 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8826 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8827 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8828 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8829 return SDValue(
8830 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8831 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8832 return Op;
8833}
8834
8835SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8836 EVT VT = Op.getValueType();
8837 SDLoc SL(Op);
8838 SDValue LHS = Op.getOperand(0);
8839 SDValue RHS = Op.getOperand(1);
8840 bool isSigned = Op.getOpcode() == ISD::SMULO;
8841
8842 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8843 const APInt &C = RHSC->getAPIntValue();
8844 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8845 if (C.isPowerOf2()) {
8846 // smulo(x, signed_min) is the same as umulo(x, signed_min).
8847 bool UseArithShift = isSigned && !C.isMinSignedValue();
8848 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8849 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8850 SDValue Overflow =
8851 DAG.getSetCC(SL, MVT::i1,
8852 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8853 Result, ShiftAmt),
8854 LHS, ISD::SETNE);
8855 return DAG.getMergeValues({Result, Overflow}, SL);
8856 }
8857 }
8858
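 // Generic case: the multiply overflows iff the high half of the full
 // product differs from the expected extension of the low half: zero for
 // umulo, the sign-extension of Result (an arithmetic shift by width - 1)
 // for smulo.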
8859 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8860 SDValue Top =
8861 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8862
8863 SDValue Sign = isSigned
8864 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8865 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8866 SL, MVT::i32))
8867 : DAG.getConstant(0, SL, VT);
8868 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8869
8870 return DAG.getMergeValues({Result, Overflow}, SL);
8871}
8872
8873SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8874 if (Op->isDivergent()) {
8875 // Select to V_MAD_[IU]64_[IU]32.
8876 return Op;
8877 }
8878 if (Subtarget->hasSMulHi()) {
8879 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8880 return SDValue();
8881 }
8882 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8883 // calculate the high part, so we might as well do the whole thing with
8884 // V_MAD_[IU]64_[IU]32.
8885 return Op;
8886}
8887
8888SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8889 if (!Subtarget->hasTrapHandler() ||
8890 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8891 return lowerTrapEndpgm(Op, DAG);
8892
8893 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8894 : lowerTrapHsaQueuePtr(Op, DAG);
8895}
8896
8897SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8898 SDLoc SL(Op);
8899 SDValue Chain = Op.getOperand(0);
8900 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8901}
8902
8903SDValue
8904SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8905 const SDLoc &DL, Align Alignment,
8906 ImplicitParameter Param) const {
8907 MachineFunction &MF = DAG.getMachineFunction();
8908 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8909 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8910 MachinePointerInfo PtrInfo =
8911 MachinePointerInfo(AMDGPUAS::CONSTANT_ADDRESS);
8912 return DAG.getLoad(
8913 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8914 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8915}
8916
8917SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8918 SelectionDAG &DAG) const {
8919 SDLoc SL(Op);
8920 SDValue Chain = Op.getOperand(0);
8921
8922 SDValue QueuePtr;
8923 // For code object version 5, QueuePtr is passed through implicit kernarg.
8924 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8925 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8926 QueuePtr =
8927 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8928 } else {
8929 MachineFunction &MF = DAG.getMachineFunction();
8930 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8931 Register UserSGPR = Info->getQueuePtrUserSGPR();
8932
8933 if (UserSGPR == AMDGPU::NoRegister) {
8934 // We probably are in a function incorrectly marked with
8935 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8936 // trap, so just use a null pointer.
8937 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8938 } else {
8939 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8940 MVT::i64);
8941 }
8942 }
8943
8944 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8945 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8946
8947 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8948 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8949 ToReg.getValue(1)};
8950 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8951}
8952
8953SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8954 SDLoc SL(Op);
8955 SDValue Chain = Op.getOperand(0);
8956
8957 // We need to simulate the 's_trap 2' instruction on targets that run in
8958 // PRIV=1 (where it is treated as a nop).
8959 if (Subtarget->hasPrivEnabledTrap2NopBug())
8960 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8961
8962 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8963 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8964 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8965}
8966
8967SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8968 SDLoc SL(Op);
8969 SDValue Chain = Op.getOperand(0);
8970 MachineFunction &MF = DAG.getMachineFunction();
8971
8972 if (!Subtarget->hasTrapHandler() ||
8973 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8974 LLVMContext &Ctx = MF.getFunction().getContext();
8975 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8976 "debugtrap handler not supported",
8977 Op.getDebugLoc(), DS_Warning));
8978 return Chain;
8979 }
8980
8981 uint64_t TrapID =
8982 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8983 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8984 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8985}
8986
8987SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8988 SelectionDAG &DAG) const {
8989 if (Subtarget->hasApertureRegs()) {
8990 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8991 ? AMDGPU::SRC_SHARED_BASE
8992 : AMDGPU::SRC_PRIVATE_BASE;
8993 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8994 !Subtarget->hasGloballyAddressableScratch()) &&
8995 "Cannot use src_private_base with globally addressable scratch!");
8996 // Note: this feature (register) is broken. When used as a 32-bit operand,
8997 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8998 // bits.
8999 //
9000 // To work around the issue, emit a 64 bit copy from this register
9001 // then extract the high bits. Note that this shouldn't even result in a
9002 // shift being emitted and simply become a pair of registers (e.g.):
9003 // s_mov_b64 s[6:7], src_shared_base
9004 // v_mov_b32_e32 v1, s7
9005 SDValue Copy =
9006 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
9007 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
9008 }
9009
9010 // For code object version 5, private_base and shared_base are passed through
9011 // implicit kernargs.
9012 const Module *M = DAG.getMachineFunction().getFunction().getParent();
9013 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
9014 ImplicitParameter Param =
9015 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
9016 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
9017 }
9018
9019 MachineFunction &MF = DAG.getMachineFunction();
9020 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9021 Register UserSGPR = Info->getQueuePtrUserSGPR();
9022 if (UserSGPR == AMDGPU::NoRegister) {
9023 // We probably are in a function incorrectly marked with
9024 // amdgpu-no-queue-ptr. This is undefined.
9025 return DAG.getPOISON(MVT::i32);
9026 }
9027
9028 SDValue QueuePtr =
9029 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
9030
9031 // Offset into amd_queue_t for group_segment_aperture_base_hi /
9032 // private_segment_aperture_base_hi.
9033 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
9034
9035 SDValue Ptr =
9036 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
9037
9038 // TODO: Use custom target PseudoSourceValue.
9039 // TODO: We should use the value from the IR intrinsic call, but it might not
9040 // be available and how do we get it?
9041 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
9042 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
9043 commonAlignment(Align(64), StructOffset),
9044 MachineMemOperand::MODereferenceable |
9045 MachineMemOperand::MOInvariant);
9046}
9047
9048/// Return true if the value is a known valid address, such that a null check is
9049/// not necessary.
9050static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
9051 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
9052 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
9053 return true;
9054
9055 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
9056 return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
9057
9058 // TODO: Search through arithmetic, handle arguments and loads
9059 // marked nonnull.
9060 return false;
9061}
9062
9063SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
9064 SelectionDAG &DAG) const {
9065 SDLoc SL(Op);
9066
9067 const AMDGPUTargetMachine &TM =
9068 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
9069
9070 unsigned DestAS, SrcAS;
9071 SDValue Src;
9072 bool IsNonNull = false;
9073 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
9074 SrcAS = ASC->getSrcAddressSpace();
9075 Src = ASC->getOperand(0);
9076 DestAS = ASC->getDestAddressSpace();
9077 } else {
9078 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
9079 Op.getConstantOperandVal(0) ==
9080 Intrinsic::amdgcn_addrspacecast_nonnull);
9081 Src = Op->getOperand(1);
9082 SrcAS = Op->getConstantOperandVal(2);
9083 DestAS = Op->getConstantOperandVal(3);
9084 IsNonNull = true;
9085 }
9086
9087 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
9088
9089 // flat -> local/private
9090 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
9091 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
9092 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
9093 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9094
9095 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
9096 Subtarget->hasGloballyAddressableScratch()) {
9097 // flat -> private with globally addressable scratch: subtract
9098 // src_flat_scratch_base_lo.
9099 SDValue FlatScratchBaseLo(
9100 DAG.getMachineNode(
9101 AMDGPU::S_MOV_B32, SL, MVT::i32,
9102 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
9103 0);
9104 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
9105 }
9106
9107 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9108 return Ptr;
9109
9110 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
9111 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9112 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
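 // The segment null value differs from the flat null value (e.g. -1 vs 0),
 // so a null flat pointer must be remapped explicitly rather than just
 // truncated.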
9113
9114 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
9115 SegmentNullPtr);
9116 }
9117 }
9118
9119 // local/private -> flat
9120 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
9121 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
9122 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
9123 SDValue CvtPtr;
9124 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
9125 Subtarget->hasGloballyAddressableScratch()) {
9126 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
9127 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
9128 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
9129 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
9130 ThreadID = DAG.getNode(
9131 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9132 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
9133 AllOnes, ThreadID);
9134 if (Subtarget->isWave64())
9135 ThreadID = DAG.getNode(
9136 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9137 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
9138 AllOnes, ThreadID);
9139 SDValue ShAmt = DAG.getShiftAmountConstant(
9140 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
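 // The TID lands at bit 51 (wave64) or 52 (wave32) of the 64-bit address;
 // within the high 32-bit word that is bit 57 - 32 - 6 = 19, respectively
 // 57 - 32 - 5 = 20, which is exactly this shift amount.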
9141 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
9142 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
9143 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9144 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
9145 // 64-bit hi:lo value.
9146 SDValue FlatScratchBase = {
9147 DAG.getMachineNode(
9148 AMDGPU::S_MOV_B64, SL, MVT::i64,
9149 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
9150 0};
9151 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
9152 } else {
9153 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
9154 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
9155 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9156 }
9157
9158 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9159 return CvtPtr;
9160
9161 unsigned NullVal = AMDGPU::getNullPointerValue(SrcAS);
9162 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9163
9164 SDValue NonNull =
9165 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
9166
9167 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
9168 FlatNullPtr);
9169 }
9170 }
9171
9172 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9173 Op.getValueType() == MVT::i64) {
9174 const SIMachineFunctionInfo *Info =
9175 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
9176 if (Info->get32BitAddressHighBits() == 0)
9177 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
9178
9179 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
9180 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
9181 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9182 }
9183
9184 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9185 Src.getValueType() == MVT::i64)
9186 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9187
9188 // global <-> flat are no-ops and never emitted.
9189
9190 // Invalid casts are poison.
9191 return DAG.getPOISON(Op->getValueType(0));
9192}
9193
9194// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
9195// the small vector and inserting them into the big vector. That is better than
9196// the default expansion of doing it via a stack slot. Even though the use of
9197// the stack slot would be optimized away afterwards, the stack slot itself
9198// remains.
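// For example (illustrative): inserting a v2i16 subvector at index 4 of a
// v8i16 becomes a single i32 element insert at index 2 of the bitcast v4i32,
// which the 16-bit fast path below performs without touching the stack.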
9199SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
9200 SelectionDAG &DAG) const {
9201 SDValue Vec = Op.getOperand(0);
9202 SDValue Ins = Op.getOperand(1);
9203 SDValue Idx = Op.getOperand(2);
9204 EVT VecVT = Vec.getValueType();
9205 EVT InsVT = Ins.getValueType();
9206 EVT EltVT = VecVT.getVectorElementType();
9207 unsigned InsNumElts = InsVT.getVectorNumElements();
9208 unsigned IdxVal = Idx->getAsZExtVal();
9209 SDLoc SL(Op);
9210
9211 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
9212 // Insert 32-bit registers at a time.
9213 assert(InsNumElts % 2 == 0 && "expect legal vector types");
9214
9215 unsigned VecNumElts = VecVT.getVectorNumElements();
9216 EVT NewVecVT =
9217 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
9218 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9219 : EVT::getVectorVT(*DAG.getContext(),
9220 MVT::i32, InsNumElts / 2);
9221
9222 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
9223 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
9224
9225 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
9226 SDValue Elt;
9227 if (InsNumElts == 2) {
9228 Elt = Ins;
9229 } else {
9230 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
9231 DAG.getConstant(I, SL, MVT::i32));
9232 }
9233 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
9234 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
9235 }
9236
9237 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
9238 }
9239
9240 for (unsigned I = 0; I != InsNumElts; ++I) {
9241 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
9242 DAG.getConstant(I, SL, MVT::i32));
9243 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
9244 DAG.getConstant(IdxVal + I, SL, MVT::i32));
9245 }
9246 return Vec;
9247}
9248
9249SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
9250 SelectionDAG &DAG) const {
9251 SDValue Vec = Op.getOperand(0);
9252 SDValue InsVal = Op.getOperand(1);
9253 SDValue Idx = Op.getOperand(2);
9254 EVT VecVT = Vec.getValueType();
9255 EVT EltVT = VecVT.getVectorElementType();
9256 unsigned VecSize = VecVT.getSizeInBits();
9257 unsigned EltSize = EltVT.getSizeInBits();
9258 SDLoc SL(Op);
9259
9260 // Specially handle the case of v4i16 with static indexing.
9261 unsigned NumElts = VecVT.getVectorNumElements();
9262 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
9263 if (NumElts == 4 && EltSize == 16 && KIdx) {
9264 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
9265
9266 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9267 DAG.getConstant(0, SL, MVT::i32));
9268 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9269 DAG.getConstant(1, SL, MVT::i32));
9270
9271 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
9272 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
9273
9274 unsigned Idx = KIdx->getZExtValue();
9275 bool InsertLo = Idx < 2;
9276 SDValue InsHalf = DAG.getNode(
9277 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
9278 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
9279 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9280
9281 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
9282
9283 SDValue Concat =
9284 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
9285 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9286
9287 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
9288 }
9289
9290 // Static indexing does not lower to stack access, and hence there is no need
9291 // for special custom lowering to avoid stack access.
9292 if (isa<ConstantSDNode>(Idx))
9293 return SDValue();
9294
9295 // Avoid stack access for dynamic indexing by custom lowering to
9296 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
9297
9298 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
9299
9300 MVT IntVT = MVT::getIntegerVT(VecSize);
9301
9302 // Convert vector index to bit-index and get the required bit mask.
9303 assert(isPowerOf2_32(EltSize));
9304 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
9305 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9306 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9307 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
9308 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
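 // Worked example for v4i16 (VecSize = 64): EltMask = 0xffff and
 // ScaledIdx = Idx * 16, so BFM = 0xffff << (Idx * 16) covers exactly the
 // bits of the selected element.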
9309
9310 // 1. Create a congruent vector with the target value in each element.
9311 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
9312 DAG.getSplatBuildVector(VecVT, SL, InsVal));
9313
9314 // 2. Mask off all other indices except the required index within (1).
9315 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
9316
9317 // 3. Mask off the required index within the target vector.
9318 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9319 SDValue RHS =
9320 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
9321
9322 // 4. Get (2) and (3) ORed into the target vector.
9323 SDValue BFI =
9324 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
9325
9326 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
9327}
9328
9329SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
9330 SelectionDAG &DAG) const {
9331 SDLoc SL(Op);
9332
9333 EVT ResultVT = Op.getValueType();
9334 SDValue Vec = Op.getOperand(0);
9335 SDValue Idx = Op.getOperand(1);
9336 EVT VecVT = Vec.getValueType();
9337 unsigned VecSize = VecVT.getSizeInBits();
9338 EVT EltVT = VecVT.getVectorElementType();
9339
9340 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
9341
9342 // Make sure we do any optimizations that will make it easier to fold
9343 // source modifiers before obscuring it with bit operations.
9344
9345 // XXX - Why doesn't this get called when vector_shuffle is expanded?
9346 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
9347 return Combined;
9348
9349 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9350 SDValue Lo, Hi;
9351 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
9352
9353 if (VecSize == 128) {
9354 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
9355 Lo = DAG.getBitcast(LoVT,
9356 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9357 DAG.getConstant(0, SL, MVT::i32)));
9358 Hi = DAG.getBitcast(HiVT,
9359 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9360 DAG.getConstant(1, SL, MVT::i32)));
9361 } else if (VecSize == 256) {
9362 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
9363 SDValue Parts[4];
9364 for (unsigned P = 0; P < 4; ++P) {
9365 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9366 DAG.getConstant(P, SL, MVT::i32));
9367 }
9368
9369 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9370 Parts[0], Parts[1]));
9371 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9372 Parts[2], Parts[3]));
9373 } else {
9374 assert(VecSize == 512);
9375
9376 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
9377 SDValue Parts[8];
9378 for (unsigned P = 0; P < 8; ++P) {
9379 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9380 DAG.getConstant(P, SL, MVT::i32));
9381 }
9382
9383 Lo = DAG.getBitcast(LoVT,
9384 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9385 Parts[0], Parts[1], Parts[2], Parts[3]));
9386 Hi = DAG.getBitcast(HiVT,
9387 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9388 Parts[4], Parts[5], Parts[6], Parts[7]));
9389 }
9390
9391 EVT IdxVT = Idx.getValueType();
9392 unsigned NElem = VecVT.getVectorNumElements();
9393 assert(isPowerOf2_32(NElem));
9394 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
9395 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
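 // Idx > NElem/2 - 1 means the element lives in Hi; the index within the
 // selected half is Idx & (NElem/2 - 1).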
9396 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
9397 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
9398 }
9399
9400 assert(VecSize <= 64);
9401
9402 MVT IntVT = MVT::getIntegerVT(VecSize);
9403
9404 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
9405 SDValue VecBC = peekThroughBitcasts(Vec);
9406 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
9407 SDValue Src = VecBC.getOperand(0);
9408 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9409 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
9410 }
9411
9412 unsigned EltSize = EltVT.getSizeInBits();
9413 assert(isPowerOf2_32(EltSize));
9414
9415 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9416
9417 // Convert vector index to bit-index (* EltSize)
9418 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9419
9420 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9421 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
9422
9423 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9424 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
9425 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
9426 }
9427
9428 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
9429}
9430
9431static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9432 assert(Elt % 2 == 0);
9433 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9434}
9435
9436static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9437 assert(Elt % 2 == 0);
9438 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9439 !(Mask[Elt + 1] & 1);
9440}
9441
9442SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
9443 SelectionDAG &DAG) const {
9444 SDLoc SL(Op);
9445 EVT ResultVT = Op.getValueType();
9446 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
9447 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
9448 const int NewSrcNumElts = 2;
9449 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
9450 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
9451
9452 // Break up the shuffle into register-sized pieces.
9453 //
9454 // We're trying to form sub-shuffles that the register allocation pipeline
9455 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
9456 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
9457 // pair of copies into a consecutive register copy, so use the ordinary
9458 // extract_vector_elt lowering unless we can use the shuffle.
9459 //
9460 // TODO: This is a bit of a hack, and we should probably always use
9461 // extract_subvector for the largest possible subvector we can (or at least
9462 // use it for PackVT aligned pieces). However, we have worse support for
9463 // combines on them and don't directly treat extract_subvector /
9464 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
9465 // job with the extract_subvectors.
9466 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
9467
9468 // vector_shuffle <0,1,6,7> lhs, rhs
9469 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
9470 //
9471 // vector_shuffle <6,7,2,3> lhs, rhs
9472 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9473 //
9474 // vector_shuffle <6,7,0,1> lhs, rhs
9475 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9476
9477 // Avoid scalarizing when both halves are reading from consecutive elements.
9478
9479 // If we're treating 2 element shuffles as legal, also create odd-to-even
9480 // shuffles of neighboring pairs.
9481 //
9482 // vector_shuffle <3,2,7,6> lhs, rhs
9483 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9484 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
9485
9487 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
9488 if (ShouldUseConsecutiveExtract &&
9489 elementPairIsContiguous(SVN->getMask(), I)) {
9490 const int Idx = SVN->getMaskElt(I);
9491 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9492 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9493 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
9494 SVN->getOperand(VecIdx),
9495 DAG.getConstant(EltIdx, SL, MVT::i32));
9496 Pieces.push_back(SubVec);
9497 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
9498 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
9499 int Idx0 = SVN->getMaskElt(I);
9500 int Idx1 = SVN->getMaskElt(I + 1);
9501
9502 SDValue SrcOp0 = SVN->getOperand(0);
9503 SDValue SrcOp1 = SrcOp0;
9504 if (Idx0 >= SrcNumElts) {
9505 SrcOp0 = SVN->getOperand(1);
9506 Idx0 -= SrcNumElts;
9507 }
9508
9509 if (Idx1 >= SrcNumElts) {
9510 SrcOp1 = SVN->getOperand(1);
9511 Idx1 -= SrcNumElts;
9512 }
9513
9514 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9515 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9516
9517 // Extract nearest even aligned piece.
9518 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
9519 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
9520 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
9521 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
9522
9523 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9524 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9525
9526 SDValue Result0 = SubVec0;
9527 SDValue Result1 = SubVec0;
9528
9529 if (SubVec0 != SubVec1) {
9530 NewMaskIdx1 += NewSrcNumElts;
9531 Result1 = SubVec1;
9532 } else {
9533 Result1 = DAG.getPOISON(PackVT);
9534 }
9535
9536 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
9537 {NewMaskIdx0, NewMaskIdx1});
9538 Pieces.push_back(Shuf);
9539 } else {
9540 const int Idx0 = SVN->getMaskElt(I);
9541 const int Idx1 = SVN->getMaskElt(I + 1);
9542 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9543 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9544 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9545 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9546
9547 SDValue Vec0 = SVN->getOperand(VecIdx0);
9548 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
9549 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
9550
9551 SDValue Vec1 = SVN->getOperand(VecIdx1);
9552 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
9553 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
9554 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
9555 }
9556 }
9557
9558 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
9559}
9560
9561SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9562 SelectionDAG &DAG) const {
9563 SDValue SVal = Op.getOperand(0);
9564 EVT ResultVT = Op.getValueType();
9565 EVT SValVT = SVal.getValueType();
9566 SDValue UndefVal = DAG.getPOISON(SValVT);
9567 SDLoc SL(Op);
9568
9569 SmallVector<SDValue, 16> VElts;
9570 VElts.push_back(SVal);
9571 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9572 VElts.push_back(UndefVal);
9573
9574 return DAG.getBuildVector(ResultVT, SL, VElts);
9575}
9576
9577SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9578 SelectionDAG &DAG) const {
9579 SDLoc SL(Op);
9580 EVT VT = Op.getValueType();
9581
9582 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9583 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9584
9585 SDValue Lo = Op.getOperand(0);
9586 SDValue Hi = Op.getOperand(1);
9587
9588 // Avoid adding defined bits with the zero_extend.
9589 if (Hi.isUndef()) {
9590 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9591 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
9592 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
9593 }
9594
9595 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
9596 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
9597
9598 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
9599 DAG.getConstant(16, SL, MVT::i32));
9600 if (Lo.isUndef())
9601 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
9602
9603 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9604 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
9605
9606 SDValue Or =
9607 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
9608 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
9609 }
9610
9611 // Split into 2-element chunks.
9612 const unsigned NumParts = VT.getVectorNumElements() / 2;
9613 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
9614 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
9615
9617 for (unsigned P = 0; P < NumParts; ++P) {
9618 SDValue Vec = DAG.getBuildVector(
9619 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
9620 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
9621 }
9622
9623 SDValue Blend =
9624 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
9625 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
9626}
9627
9628bool SITargetLowering::isOffsetFoldingLegal(
9629 const GlobalAddressSDNode *GA) const {
9630 // OSes that use ELF REL relocations (instead of RELA) can only store a
9631 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9632 // which can create arbitrary 64-bit addends. (This is only a problem for
9633 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9634 // the high 32 bits of the addend.)
9635 //
9636 // This should be kept in sync with how HasRelocationAddend is initialized in
9637 // the constructor of ELFAMDGPUAsmBackend.
9638 if (!Subtarget->isAmdHsaOS())
9639 return false;
9640
9641 // We can fold offsets for anything that doesn't require a GOT relocation.
9642 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9643 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
9644 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9645 !shouldEmitGOTReloc(GA->getGlobal());
9646}
9647
9648static SDValue
9649buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
9650 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9651 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9652 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9653 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9654 // lowered to the following code sequence:
9655 //
9656 // For constant address space:
9657 // s_getpc_b64 s[0:1]
9658 // s_add_u32 s0, s0, $symbol
9659 // s_addc_u32 s1, s1, 0
9660 //
9661 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9662 // a fixup or relocation is emitted to replace $symbol with a literal
9663 // constant, which is a pc-relative offset from the encoding of the $symbol
9664 // operand to the global variable.
9665 //
9666 // For global address space:
9667 // s_getpc_b64 s[0:1]
9668 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9669 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9670 //
9671 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9672 // fixups or relocations are emitted to replace $symbol@*@lo and
9673 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9674 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9675 // operand to the global variable.
9676 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9677 assert(GAFlags != SIInstrInfo::MO_NONE);
9678
9679 SDValue Ptr =
9680 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9681 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9682 }
9683
9684 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9685 SDValue PtrHi;
9686 if (GAFlags == SIInstrInfo::MO_NONE)
9687 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9688 else
9689 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9690 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9691}
9692
9693SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
9694 SDValue Op,
9695 SelectionDAG &DAG) const {
9696 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9697 SDLoc DL(GSD);
9698 EVT PtrVT = Op.getValueType();
9699
9700 const GlobalValue *GV = GSD->getGlobal();
9701 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9702 shouldUseLDSConstAddress(GV)) ||
9703 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
9704 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
9705 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9706 GV->hasExternalLinkage()) {
9707 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
9708 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
9709 // zero-sized type in other languages to declare dynamic shared
9710 // memory whose size is not known at compile time. Such arrays are
9711 // allocated by the runtime and placed directly after the statically
9712 // allocated ones, so they all share the same offset.
9713 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
9714 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9715 // Adjust alignment for that dynamic shared memory array.
9716 Function &F = DAG.getMachineFunction().getFunction();
9717 MFI->setDynLDSAlign(F, GVar);
9718 MFI->setUsesDynamicLDS(true);
9719 return SDValue(
9720 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9721 }
9722 }
9723 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
9724 }
9725
9726 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
9727 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9728 SIInstrInfo::MO_ABS32_LO);
9729 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9730 }
9731
9732 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9733 if (Subtarget->has64BitLiterals()) {
9734 SDValue Addr = DAG.getTargetGlobalAddress(
9735 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9736 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9737 0);
9738 }
9739
9740 SDValue AddrLo = DAG.getTargetGlobalAddress(
9741 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9742 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9743
9744 SDValue AddrHi = DAG.getTargetGlobalAddress(
9745 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9746 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9747
9748 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9749 }
9750
9751 if (shouldEmitFixup(GV))
9752 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9753
9754 if (shouldEmitPCReloc(GV))
9755 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9756 SIInstrInfo::MO_REL32);
9757
9758 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9759 SIInstrInfo::MO_GOTPCREL32);
9760 PointerType *PtrTy =
9761 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
9762 const DataLayout &DataLayout = DAG.getDataLayout();
9763 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9764 MachinePointerInfo PtrInfo =
9765 MachinePointerInfo::getGOT(DAG.getMachineFunction());
9766
9767 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9768 MachineMemOperand::MODereferenceable |
9769 MachineMemOperand::MOInvariant);
9770}
9771
9772SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9773 SelectionDAG &DAG) const {
9774 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9775 const Function &Fn = DAG.getMachineFunction().getFunction();
9776 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9777 Fn, "unsupported external symbol", Op.getDebugLoc()));
9778 return DAG.getPOISON(Op.getValueType());
9779}
9780
9781SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9782 const SDLoc &DL, SDValue V) const {
9783 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9784 // the destination register.
9785 //
9786 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9787 // so we will end up with redundant moves to m0.
9788 //
9789 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9790
9791 // A Null SDValue creates a glue result.
9792 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9793 V, Chain);
9794 return SDValue(M0, 0);
9795}
9796
9797SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9798 MVT VT,
9799 unsigned Offset) const {
9800 SDLoc SL(Op);
9801 SDValue Param = lowerKernargMemParameter(
9802 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9803 // The local size values will have the hi 16-bits as zero.
9804 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9805 DAG.getValueType(VT));
9806}
9807
9808static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9809 EVT VT) {
9810 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9811 DAG.getMachineFunction().getFunction(),
9812 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9813 return DAG.getPOISON(VT);
9814}
9815
9816static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9817 EVT VT) {
9818 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9819 DAG.getMachineFunction().getFunction(),
9820 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9821 return DAG.getPOISON(VT);
9822}
9823
9824static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9825 ArrayRef<SDValue> Elts) {
9826 assert(!Elts.empty());
9827 MVT Type;
9828 unsigned NumElts = Elts.size();
9829
9830 if (NumElts <= 12) {
9831 Type = MVT::getVectorVT(MVT::f32, NumElts);
9832 } else {
9833 assert(Elts.size() <= 16);
9834 Type = MVT::v16f32;
9835 NumElts = 16;
9836 }
9837
9838 SmallVector<SDValue, 16> VecElts(NumElts);
9839 for (unsigned i = 0; i < Elts.size(); ++i) {
9840 SDValue Elt = Elts[i];
9841 if (Elt.getValueType() != MVT::f32)
9842 Elt = DAG.getBitcast(MVT::f32, Elt);
9843 VecElts[i] = Elt;
9844 }
9845 for (unsigned i = Elts.size(); i < NumElts; ++i)
9846 VecElts[i] = DAG.getPOISON(MVT::f32);
9847
9848 if (NumElts == 1)
9849 return VecElts[0];
9850 return DAG.getBuildVector(Type, DL, VecElts);
9851}
9852
9853static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9854 SDValue Src, int ExtraElts) {
9855 EVT SrcVT = Src.getValueType();
9856
9857 SmallVector<SDValue, 8> Elts;
9858
9859 if (SrcVT.isVector())
9860 DAG.ExtractVectorElements(Src, Elts);
9861 else
9862 Elts.push_back(Src);
9863
9864 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9865 while (ExtraElts--)
9866 Elts.push_back(Undef);
9867
9868 return DAG.getBuildVector(CastVT, DL, Elts);
9869}
9870
9871// Re-construct the required return value for an image load intrinsic.
9872// This is more complicated due to the optional use of TexFailCtrl, which
9873// means the required return type is an aggregate.
9874static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9875 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9876 bool Unpacked, bool IsD16, int DMaskPop,
9877 int NumVDataDwords, bool IsAtomicPacked16Bit,
9878 const SDLoc &DL) {
9879 // Determine the required return type. This is the same regardless of
9880 // IsTexFail flag
9881 EVT ReqRetVT = ResultTypes[0];
9882 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9883 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9884 ? (ReqRetNumElts + 1) / 2
9885 : ReqRetNumElts;
9886
9887 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9888
9889 MVT DataDwordVT =
9890 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9891
9892 MVT MaskPopVT =
9893 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9894
9895 SDValue Data(Result, 0);
9896 SDValue TexFail;
9897
9898 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9899 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9900 if (MaskPopVT.isVector()) {
9901 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9902 SDValue(Result, 0), ZeroIdx);
9903 } else {
9904 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9905 SDValue(Result, 0), ZeroIdx);
9906 }
9907 }
9908
9909 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9910 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9911 NumDataDwords - MaskPopDwords);
9912
9913 if (IsD16)
9914 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9915
9916 EVT LegalReqRetVT = ReqRetVT;
9917 if (!ReqRetVT.isVector()) {
9918 if (!Data.getValueType().isInteger())
9919 Data = DAG.getNode(ISD::BITCAST, DL,
9920 Data.getValueType().changeTypeToInteger(), Data);
9921 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9922 } else {
9923 // We need to widen the return vector to a legal type
9924 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9925 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9926 LegalReqRetVT =
9927 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9928 ReqRetVT.getVectorNumElements() + 1);
9929 }
9930 }
9931 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9932
9933 if (IsTexFail) {
9934 TexFail =
9935 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9936 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9937
9938 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9939 }
9940
9941 if (Result->getNumValues() == 1)
9942 return Data;
9943
9944 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9945}
9946
9947static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9948 SDValue *LWE, bool &IsTexFail) {
9949 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9950
9951 uint64_t Value = TexFailCtrlConst->getZExtValue();
9952 if (Value) {
9953 IsTexFail = true;
9954 }
9955
9956 SDLoc DL(TexFailCtrlConst);
9957 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9958 Value &= ~(uint64_t)0x1;
9959 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9960 Value &= ~(uint64_t)0x2;
9961
9962 return Value == 0;
9963}
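// Illustrative encodings: a texfailctrl of 1 enables TFE only, 2 enables LWE
// only, and 3 enables both; any bit outside 1:0 makes this return false, and
// the caller leaves the intrinsic unlowered.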
9964
9965static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9966 MVT PackVectorVT,
9967 SmallVectorImpl<SDValue> &PackedAddrs,
9968 unsigned DimIdx, unsigned EndIdx,
9969 unsigned NumGradients) {
9970 SDLoc DL(Op);
9971 for (unsigned I = DimIdx; I < EndIdx; I++) {
9972 SDValue Addr = Op.getOperand(I);
9973
9974 // Gradients are packed with undef for each coordinate.
9975 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9976 // 1D: undef,dx/dh; undef,dx/dv
9977 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9978 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9979 if (((I + 1) >= EndIdx) ||
9980 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9981 I == DimIdx + NumGradients - 1))) {
9982 if (Addr.getValueType() != MVT::i16)
9983 Addr = DAG.getBitcast(MVT::i16, Addr);
9984 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9985 } else {
9986 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9987 I++;
9988 }
9989 Addr = DAG.getBitcast(MVT::f32, Addr);
9990 PackedAddrs.push_back(Addr);
9991 }
9992}
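// Example (illustrative): a 2D sample with f16 derivatives has NumGradients
// == 4 (dx/dh, dy/dh, dx/dv, dy/dv); the loop above emits two v2f16
// build_vectors, each bitcast to f32, so the four 16-bit gradients occupy
// two vaddr dwords.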
9993
9994SDValue SITargetLowering::lowerImage(SDValue Op,
9995 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9996 SelectionDAG &DAG, bool WithChain) const {
9997 SDLoc DL(Op);
9998 MachineFunction &MF = DAG.getMachineFunction();
9999 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
10000 unsigned IntrOpcode = Intr->BaseOpcode;
10001 // For image atomic: use no-return opcode if result is unused.
10002 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
10003 !Op.getNode()->hasAnyUseOfValue(0))
10004 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
10005 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10006 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
10007 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
10008 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
10009 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10010 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10011
10012 SmallVector<EVT, 3> ResultTypes(Op->values());
10013 SmallVector<EVT, 3> OrigResultTypes(Op->values());
10014 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
10015 ResultTypes.erase(&ResultTypes[0]);
10016
10017 bool IsD16 = false;
10018 bool IsG16 = false;
10019 bool IsA16 = false;
10020 SDValue VData;
10021 int NumVDataDwords = 0;
10022 bool AdjustRetType = false;
10023 bool IsAtomicPacked16Bit = false;
10024
10025 // Offset of intrinsic arguments
10026 const unsigned ArgOffset = WithChain ? 2 : 1;
10027
10028 unsigned DMask;
10029 unsigned DMaskLanes = 0;
10030
10031 if (BaseOpcode->Atomic) {
10032 VData = Op.getOperand(2);
10033
10034 IsAtomicPacked16Bit =
10035 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10036 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10037 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10038 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10039
10040 bool Is64Bit = VData.getValueSizeInBits() == 64;
10041 if (BaseOpcode->AtomicX2) {
10042 SDValue VData2 = Op.getOperand(3);
10043 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
10044 {VData, VData2});
10045 if (Is64Bit)
10046 VData = DAG.getBitcast(MVT::v4i32, VData);
10047
10048 if (!BaseOpcode->NoReturn)
10049 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10050
10051 DMask = Is64Bit ? 0xf : 0x3;
10052 NumVDataDwords = Is64Bit ? 4 : 2;
10053 } else {
10054 DMask = Is64Bit ? 0x3 : 0x1;
10055 NumVDataDwords = Is64Bit ? 2 : 1;
10056 }
10057 } else {
10058 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
10059 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
10060
10061 if (BaseOpcode->Store) {
10062 VData = Op.getOperand(2);
10063
10064 MVT StoreVT = VData.getSimpleValueType();
10065 if (StoreVT.getScalarType() == MVT::f16) {
10066 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10067 return Op; // D16 is unsupported for this instruction
10068
10069 IsD16 = true;
10070 VData = handleD16VData(VData, DAG, true);
10071 }
10072
10073 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
10074 } else if (!BaseOpcode->NoReturn) {
10075 // Work out the num dwords based on the dmask popcount and underlying type
10076 // and whether packing is supported.
10077 MVT LoadVT = ResultTypes[0].getSimpleVT();
10078 if (LoadVT.getScalarType() == MVT::f16) {
10079 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10080 return Op; // D16 is unsupported for this instruction
10081
10082 IsD16 = true;
10083 }
10084
10085 // Confirm that the return type is large enough for the dmask specified
10086 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
10087 (!LoadVT.isVector() && DMaskLanes > 1))
10088 return Op;
10089
10090 // The sq block of gfx8 and gfx9 do not estimate register use correctly
10091 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
10092 // instructions.
10093 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10094 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
10095 NumVDataDwords = (DMaskLanes + 1) / 2;
10096 else
10097 NumVDataDwords = DMaskLanes;
10098
10099 AdjustRetType = true;
10100 }
10101 }
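  // Example (illustrative): a <4 x half> load with dmask 0b0111 has
  // DMaskLanes = 3; with packed d16 that is NumVDataDwords = (3 + 1) / 2 = 2,
  // while with unpacked d16 each component takes a full dword, giving
  // NumVDataDwords = 3.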
10102
10103 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
10104 SmallVector<SDValue, 4> VAddrs;
10105
10106 // Check for 16 bit addresses or derivatives and pack if true.
10107 MVT VAddrVT =
10108 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
10109 MVT VAddrScalarVT = VAddrVT.getScalarType();
10110 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10111 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10112
10113 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
10114 VAddrScalarVT = VAddrVT.getScalarType();
10115 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10116 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10117
10118 // Push back extra arguments.
10119 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
10120 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
10121 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
10122 // Special handling of bias when A16 is on. Bias is of type half but
10123 // occupies a full 32-bit dword.
10124 SDValue Bias = DAG.getBuildVector(
10125 MVT::v2f16, DL,
10126 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
10127 VAddrs.push_back(Bias);
10128 } else {
10129 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
10130 "Bias needs to be converted to 16 bit in A16 mode");
10131 VAddrs.push_back(Op.getOperand(ArgOffset + I));
10132 }
10133 }
10134
10135 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
10136 // 16 bit gradients are supported, but are tied to the A16 control
10137 // so both gradients and addresses must be 16 bit
10138 LLVM_DEBUG(
10139 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
10140 "require 16 bit args for both gradients and addresses");
10141 return Op;
10142 }
10143
10144 if (IsA16) {
10145 if (!ST->hasA16()) {
10146 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
10147 "support 16 bit addresses\n");
10148 return Op;
10149 }
10150 }
10151
10152 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
10153 // is set then we have to compress/pack operands (either address,
10154 // gradient, or both).
10155 // In the case where a16 and gradients are tied (no G16 support), we
10156 // have already verified that both IsA16 and IsG16 are true.
10157 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
10158 // Activate g16
10159 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10160 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
10161 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
10162 }
10163
10164 // Add gradients (packed or unpacked)
10165 if (IsG16) {
10166 // Pack the gradients
10167 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
10168 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
10169 ArgOffset + Intr->GradientStart,
10170 ArgOffset + Intr->CoordStart, Intr->NumGradients);
10171 } else {
10172 for (unsigned I = ArgOffset + Intr->GradientStart;
10173 I < ArgOffset + Intr->CoordStart; I++)
10174 VAddrs.push_back(Op.getOperand(I));
10175 }
10176
10177 // Add addresses (packed or unpacked)
10178 if (IsA16) {
10179 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
10180 ArgOffset + Intr->CoordStart, VAddrEnd,
10181 0 /* No gradients */);
10182 } else {
10183 // Add uncompressed address
10184 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
10185 VAddrs.push_back(Op.getOperand(I));
10186 }
10187
10188 // If the register allocator cannot place the address registers contiguously
10189 // without introducing moves, then using the non-sequential address encoding
10190 // is always preferable, since it saves VALU instructions and is usually a
10191 // wash in terms of code size or even better.
10192 //
10193 // However, we currently have no way of hinting to the register allocator that
10194 // MIMG addresses should be placed contiguously when it is possible to do so,
10195 // so force non-NSA for the common 2-address case as a heuristic.
10196 //
10197 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
10198 // allocation when possible.
10199 //
10200 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
10201 // set of the remaining addresses.
10202 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
10203 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
10204 const bool UseNSA = ST->hasNSAEncoding() &&
10205 VAddrs.size() >= ST->getNSAThreshold(MF) &&
10206 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
10207 const bool UsePartialNSA =
10208 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
10209
10210 SDValue VAddr;
10211 if (UsePartialNSA) {
10212 VAddr = getBuildDwordsVector(DAG, DL,
10213 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
10214 } else if (!UseNSA) {
10215 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
10216 }
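  // Example (illustrative): with NSAMaxSize == 5 and 7 address dwords,
  // partial NSA keeps the first NSAMaxSize - 1 == 4 dwords as separate
  // operands and merges the remaining 3 into the single contiguous vector
  // built above, which becomes the final vaddr operand.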
10217
10218 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
10219 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
10220 SDValue Unorm;
10221 if (!BaseOpcode->Sampler) {
10222 Unorm = True;
10223 } else {
10224 uint64_t UnormConst =
10225 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
10226
10227 Unorm = UnormConst ? True : False;
10228 }
10229
10230 SDValue TFE;
10231 SDValue LWE;
10232 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
10233 bool IsTexFail = false;
10234 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10235 return Op;
10236
10237 if (IsTexFail) {
10238 if (!DMaskLanes) {
10239 // Expecting to get an error flag since TFC is on and dmask is 0.
10240 // Force dmask to be at least 1, otherwise the instruction will fail.
10241 DMask = 0x1;
10242 DMaskLanes = 1;
10243 NumVDataDwords = 1;
10244 }
10245 NumVDataDwords += 1;
10246 AdjustRetType = true;
10247 }
10248
10249 // Something earlier tagged that the return type needs adjusting. This
10250 // happens if the instruction is a load or has set TexFailCtrl flags.
10251 if (AdjustRetType) {
10252 // NumVDataDwords reflects the true number of dwords required in the return
10253 // type
10254 if (DMaskLanes == 0 && !BaseOpcode->Store) {
10255 // This is a no-op load. This can be eliminated
10256 SDValue Undef = DAG.getPOISON(Op.getValueType());
10257 if (isa<MemSDNode>(Op))
10258 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
10259 return Undef;
10260 }
10261
10262 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
10263 MVT::i32, NumVDataDwords)
10264 : MVT::i32;
10265
10266 ResultTypes[0] = NewVT;
10267 if (ResultTypes.size() == 3) {
10268 // The original result was an aggregate type used for TexFailCtrl
10269 // results. The actual instruction returns as a vector type which has
10270 // now been created. Remove the aggregate result.
10271 ResultTypes.erase(&ResultTypes[1]);
10272 }
10273 }
10274
10275 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
10276 // Keep GLC only when the atomic's result is actually used.
10277 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
10278 CPol |= AMDGPU::CPol::GLC;
10279 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
10280 AMDGPU::CPol::VOLATILE))
10281 return Op;
10282
10283 SmallVector<SDValue, 26> Ops;
10284 if (BaseOpcode->Store || BaseOpcode->Atomic)
10285 Ops.push_back(VData); // vdata
10286 if (UsePartialNSA) {
10287 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
10288 Ops.push_back(VAddr);
10289 } else if (UseNSA)
10290 append_range(Ops, VAddrs);
10291 else
10292 Ops.push_back(VAddr);
10293 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
10294 EVT RsrcVT = Rsrc.getValueType();
10295 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10296 return Op;
10297 Ops.push_back(Rsrc);
10298 if (BaseOpcode->Sampler) {
10299 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
10300 if (Samp.getValueType() != MVT::v4i32)
10301 return Op;
10302 Ops.push_back(Samp);
10303 }
10304 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
10305 if (IsGFX10Plus)
10306 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
10307 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10308 Ops.push_back(Unorm);
10309 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
10310 Ops.push_back(IsA16 && // r128, a16 for gfx9
10311 ST->hasFeature(AMDGPU::FeatureR128A16)
10312 ? True
10313 : False);
10314 if (IsGFX10Plus)
10315 Ops.push_back(IsA16 ? True : False);
10316
10317 if (!Subtarget->hasGFX90AInsts())
10318 Ops.push_back(TFE); // tfe
10319 else if (TFE->getAsZExtVal()) {
10320 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10321 DAG.getMachineFunction().getFunction(),
10322 "TFE is not supported on this GPU", DL.getDebugLoc()));
10323 }
10324
10325 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10326 Ops.push_back(LWE); // lwe
10327 if (!IsGFX10Plus)
10328 Ops.push_back(DimInfo->DA ? True : False);
10329 if (BaseOpcode->HasD16)
10330 Ops.push_back(IsD16 ? True : False);
10331 if (isa<MemSDNode>(Op))
10332 Ops.push_back(Op.getOperand(0)); // chain
10333
10334 int NumVAddrDwords =
10335 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
10336 int Opcode = -1;
10337
10338 if (IsGFX12Plus) {
10339 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
10340 NumVDataDwords, NumVAddrDwords);
10341 } else if (IsGFX11Plus) {
10342 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10343 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10344 : AMDGPU::MIMGEncGfx11Default,
10345 NumVDataDwords, NumVAddrDwords);
10346 } else if (IsGFX10Plus) {
10347 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10348 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10349 : AMDGPU::MIMGEncGfx10Default,
10350 NumVDataDwords, NumVAddrDwords);
10351 } else {
10352 if (Subtarget->hasGFX90AInsts()) {
10353 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
10354 NumVDataDwords, NumVAddrDwords);
10355 if (Opcode == -1) {
10356 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10357 DAG.getMachineFunction().getFunction(),
10358 "requested image instruction is not supported on this GPU",
10359 DL.getDebugLoc()));
10360
10361 unsigned Idx = 0;
10362 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
10363 for (EVT VT : OrigResultTypes) {
10364 if (VT == MVT::Other)
10365 RetValues[Idx++] = Op.getOperand(0); // Chain
10366 else
10367 RetValues[Idx++] = DAG.getPOISON(VT);
10368 }
10369
10370 return DAG.getMergeValues(RetValues, DL);
10371 }
10372 }
10373 if (Opcode == -1 &&
10374 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10375 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
10376 NumVDataDwords, NumVAddrDwords);
10377 if (Opcode == -1)
10378 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
10379 NumVDataDwords, NumVAddrDwords);
10380 }
10381 if (Opcode == -1)
10382 return Op;
10383
10384 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
10385 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
10386 MachineMemOperand *MemRef = MemOp->getMemOperand();
10387 DAG.setNodeMemRefs(NewNode, {MemRef});
10388 }
10389
10390 if (BaseOpcode->NoReturn) {
10391 if (BaseOpcode->Atomic)
10392 return DAG.getMergeValues(
10393 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
10394
10395 return SDValue(NewNode, 0);
10396 }
10397
10398 if (BaseOpcode->AtomicX2) {
10399 SmallVector<SDValue, 1> Elt;
10400 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
10401 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
10402 }
10403
10404 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
10405 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10406 NumVDataDwords, IsAtomicPacked16Bit, DL);
10407}
10408
10409SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
10410 SDValue Offset, SDValue CachePolicy,
10411 SelectionDAG &DAG) const {
10412 MachineFunction &MF = DAG.getMachineFunction();
10413
10414 const DataLayout &DataLayout = DAG.getDataLayout();
10415 Align Alignment =
10416 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
10417
10418 MachineMemOperand *MMO = MF.getMachineMemOperand(
10419 MachinePointerInfo(),
10420 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
10421 MachineMemOperand::MOInvariant,
10422 VT.getStoreSize(), Alignment);
10423
10424 if (!Offset->isDivergent()) {
10425 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10426
10427 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10428 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
10429 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
10430 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
10431 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10432 SDValue BufferLoad =
10433 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
10434 DAG.getVTList(MVT::i32), Ops, VT, MMO);
10435 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
10436 }
10437
10438 // Widen vec3 load to vec4.
10439 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
10440 !Subtarget->hasScalarDwordx3Loads()) {
10441 EVT WidenedVT =
10442 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
10443 auto WidenedOp = DAG.getMemIntrinsicNode(
10444 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
10445 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
10446 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
10447 DAG.getVectorIdxConstant(0, DL));
10448 return Subvector;
10449 }
10450
10451 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
10452 DAG.getVTList(VT), Ops, VT, MMO);
10453 }
10454
10455 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
10456 // assume that the buffer is unswizzled.
10457 SDValue Ops[] = {
10458 DAG.getEntryNode(), // Chain
10459 Rsrc, // rsrc
10460 DAG.getConstant(0, DL, MVT::i32), // vindex
10461 {}, // voffset
10462 {}, // soffset
10463 {}, // offset
10464 CachePolicy, // cachepolicy
10465 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10466 };
10467 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10468 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
10469 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
10470 }
10471
10472 SmallVector<SDValue, 4> Loads;
10473 unsigned NumLoads = 1;
10474 MVT LoadVT = VT.getSimpleVT();
10475 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10476 assert((LoadVT.getScalarType() == MVT::i32 ||
10477 LoadVT.getScalarType() == MVT::f32));
10478
10479 if (NumElts == 8 || NumElts == 16) {
10480 NumLoads = NumElts / 4;
10481 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
10482 }
10483
10484 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
10485
10486 // Use the alignment to ensure that the required offsets will fit into the
10487 // immediate offsets.
10488 setBufferOffsets(Offset, DAG, &Ops[3],
10489 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10490
10491 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10492 unsigned LoadSize = LoadVT.getStoreSize();
10493 for (unsigned i = 0; i < NumLoads; ++i) {
10494 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
10495 MachineMemOperand *LoadMMO = MF.getMachineMemOperand(MMO, 16 * i, LoadSize);
10496 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10497 LoadVT, LoadMMO, DAG));
10498 }
10499
10500 if (NumElts == 8 || NumElts == 16)
10501 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
10502
10503 return Loads[0];
10504}
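// Example (illustrative): a divergent-offset <8 x float> s.buffer.load is
// emitted above as two v4f32 BUFFER_LOADs at immediate offsets +0 and +16,
// then rejoined with ISD::CONCAT_VECTORS.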
10505
10506SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10507 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10508 if (!Subtarget->hasArchitectedSGPRs())
10509 return {};
10510 SDLoc SL(Op);
10511 MVT VT = MVT::i32;
10512 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10513 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10514 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10515}
10516
10517SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10518 AMDGPU::Hwreg::Id HwReg,
10519 unsigned LowBit,
10520 unsigned Width) const {
10521 SDLoc SL(Op);
10522 using namespace AMDGPU::Hwreg;
10523 return {DAG.getMachineNode(
10524 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10525 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10526 SL, MVT::i32)),
10527 0};
10528}
10529
10530SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10531 unsigned Dim,
10532 const ArgDescriptor &Arg) const {
10533 SDLoc SL(Op);
10534 MachineFunction &MF = DAG.getMachineFunction();
10535 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10536 if (MaxID == 0)
10537 return DAG.getConstant(0, SL, MVT::i32);
10538
10539 // It's undefined behavior if a function marked with the amdgpu-no-*
10540 // attributes uses the corresponding intrinsic.
10541 if (!Arg)
10542 return DAG.getPOISON(Op->getValueType(0));
10543
10544 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10545 SDLoc(DAG.getEntryNode()), Arg);
10546
10547 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10548 // masking operations anyway.
10549 //
10550 // TODO: We could assert the top bit is 0 for the source copy.
10551 if (Arg.isMasked())
10552 return Val;
10553
10554 // Preserve the known bits after expansion to a copy.
10555 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10556 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10557 DAG.getValueType(SmallVT));
10558}
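// Example (illustrative): if the maximum workitem id in this dimension is
// 1023, llvm::bit_width(1023) == 10, so the AssertZext records that only the
// low 10 bits of the i32 id may be nonzero.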
10559
10560SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10561 SelectionDAG &DAG) const {
10562 MachineFunction &MF = DAG.getMachineFunction();
10563 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10564
10565 EVT VT = Op.getValueType();
10566 SDLoc DL(Op);
10567 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10568
10569 // TODO: Should this propagate fast-math-flags?
10570
10571 switch (IntrinsicID) {
10572 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10573 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10574 return emitNonHSAIntrinsicError(DAG, DL, VT);
10575 return getPreloadedValue(DAG, *MFI, VT,
10576 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
10577 }
10578 case Intrinsic::amdgcn_dispatch_ptr:
10579 case Intrinsic::amdgcn_queue_ptr: {
10580 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10581 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10582 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10583 DL.getDebugLoc()));
10584 return DAG.getPOISON(VT);
10585 }
10586
10587 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10588 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
10589 : AMDGPUFunctionArgInfo::QUEUE_PTR;
10590 return getPreloadedValue(DAG, *MFI, VT, RegID);
10591 }
10592 case Intrinsic::amdgcn_implicitarg_ptr: {
10593 if (MFI->isEntryFunction())
10594 return getImplicitArgPtr(DAG, DL);
10595 return getPreloadedValue(DAG, *MFI, VT,
10596 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
10597 }
10598 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10599 if (!AMDGPU::isKernel(MF.getFunction())) {
10600 // This only makes sense to call in a kernel, so just lower to null.
10601 return DAG.getConstant(0, DL, VT);
10602 }
10603
10604 return getPreloadedValue(DAG, *MFI, VT,
10605 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
10606 }
10607 case Intrinsic::amdgcn_dispatch_id: {
10608 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10609 }
10610 case Intrinsic::amdgcn_rcp:
10611 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10612 case Intrinsic::amdgcn_rsq:
10613 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10614 case Intrinsic::amdgcn_rsq_legacy:
10615 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10616 return emitRemovedIntrinsicError(DAG, DL, VT);
10617 return SDValue();
10618 case Intrinsic::amdgcn_rcp_legacy:
10619 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10620 return emitRemovedIntrinsicError(DAG, DL, VT);
10621 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10622 case Intrinsic::amdgcn_rsq_clamp: {
10623 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10624 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10625
10626 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10627 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10628 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10629
10630 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10631 SDValue Tmp =
10632 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10633 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10634 DAG.getConstantFP(Min, DL, VT));
10635 }
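  // For f32 (illustrative), the expansion above computes
  // fmaxnum(fminnum(rsq(x), +3.402823466e+38), -3.402823466e+38), clamping
  // the result to the largest finite floats as the pre-VI v_rsq_clamp
  // instruction did.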
10636 case Intrinsic::r600_read_ngroups_x:
10637 if (Subtarget->isAmdHsaOS())
10638 return emitNonHSAIntrinsicError(DAG, DL, VT);
10639
10640 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10641 SI::KernelInputOffsets::NGROUPS_X, Align(4),
10642 false);
10643 case Intrinsic::r600_read_ngroups_y:
10644 if (Subtarget->isAmdHsaOS())
10645 return emitNonHSAIntrinsicError(DAG, DL, VT);
10646
10647 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10648 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
10649 false);
10650 case Intrinsic::r600_read_ngroups_z:
10651 if (Subtarget->isAmdHsaOS())
10652 return emitNonHSAIntrinsicError(DAG, DL, VT);
10653
10654 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10655 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
10656 false);
10657 case Intrinsic::r600_read_local_size_x:
10658 if (Subtarget->isAmdHsaOS())
10659 return emitNonHSAIntrinsicError(DAG, DL, VT);
10660
10661 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10662 SI::KernelInputOffsets::LOCAL_SIZE_X);
10663 case Intrinsic::r600_read_local_size_y:
10664 if (Subtarget->isAmdHsaOS())
10665 return emitNonHSAIntrinsicError(DAG, DL, VT);
10666
10667 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10668 SI::KernelInputOffsets::LOCAL_SIZE_Y);
10669 case Intrinsic::r600_read_local_size_z:
10670 if (Subtarget->isAmdHsaOS())
10671 return emitNonHSAIntrinsicError(DAG, DL, VT);
10672
10673 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10674 SI::KernelInputOffsets::LOCAL_SIZE_Z);
10675 case Intrinsic::amdgcn_workgroup_id_x:
10676 return lowerWorkGroupId(DAG, *MFI, VT,
10677 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
10678 AMDGPUFunctionArgInfo::CLUSTER_ID_X,
10679 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
10680 case Intrinsic::amdgcn_workgroup_id_y:
10681 return lowerWorkGroupId(DAG, *MFI, VT,
10682 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
10683 AMDGPUFunctionArgInfo::CLUSTER_ID_Y,
10684 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
10685 case Intrinsic::amdgcn_workgroup_id_z:
10686 return lowerWorkGroupId(DAG, *MFI, VT,
10687 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
10688 AMDGPUFunctionArgInfo::CLUSTER_ID_Z,
10689 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
10690 case Intrinsic::amdgcn_cluster_id_x:
10691 return Subtarget->hasClusters()
10692 ? getPreloadedValue(DAG, *MFI, VT,
10693 AMDGPUFunctionArgInfo::CLUSTER_ID_X)
10694 : DAG.getPOISON(VT);
10695 case Intrinsic::amdgcn_cluster_id_y:
10696 return Subtarget->hasClusters()
10697 ? getPreloadedValue(DAG, *MFI, VT,
10698 AMDGPUFunctionArgInfo::CLUSTER_ID_Y)
10699 : DAG.getPOISON(VT);
10700 case Intrinsic::amdgcn_cluster_id_z:
10701 return Subtarget->hasClusters()
10702 ? getPreloadedValue(DAG, *MFI, VT,
10703 AMDGPUFunctionArgInfo::CLUSTER_ID_Z)
10704 : DAG.getPOISON(VT);
10705 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10706 return Subtarget->hasClusters()
10707 ? getPreloadedValue(
10708 DAG, *MFI, VT,
10709 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
10710 : DAG.getPOISON(VT);
10711 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10712 return Subtarget->hasClusters()
10713 ? getPreloadedValue(
10714 DAG, *MFI, VT,
10715 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
10716 : DAG.getPOISON(VT);
10717 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10718 return Subtarget->hasClusters()
10719 ? getPreloadedValue(
10720 DAG, *MFI, VT,
10721 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
10722 : DAG.getPOISON(VT);
10723 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10724 return Subtarget->hasClusters()
10725 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10726 : SDValue();
10727 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10728 return Subtarget->hasClusters()
10729 ? getPreloadedValue(
10730 DAG, *MFI, VT,
10731 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
10732 : DAG.getPOISON(VT);
10733 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10734 return Subtarget->hasClusters()
10735 ? getPreloadedValue(
10736 DAG, *MFI, VT,
10737 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
10738 : DAG.getPOISON(VT);
10739 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10740 return Subtarget->hasClusters()
10741 ? getPreloadedValue(
10742 DAG, *MFI, VT,
10743 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
10744 : DAG.getPOISON(VT);
10745 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10746 return Subtarget->hasClusters()
10747 ? getPreloadedValue(
10748 DAG, *MFI, VT,
10749 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
10750 : DAG.getPOISON(VT);
10751 case Intrinsic::amdgcn_wave_id:
10752 return lowerWaveID(DAG, Op);
10753 case Intrinsic::amdgcn_lds_kernel_id: {
10754 if (MFI->isEntryFunction())
10755 return getLDSKernelId(DAG, DL);
10756 return getPreloadedValue(DAG, *MFI, VT,
10757 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10758 }
10759 case Intrinsic::amdgcn_workitem_id_x:
10760 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10761 case Intrinsic::amdgcn_workitem_id_y:
10762 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10763 case Intrinsic::amdgcn_workitem_id_z:
10764 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10765 case Intrinsic::amdgcn_wavefrontsize:
10766 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10767 SDLoc(Op), MVT::i32);
10768 case Intrinsic::amdgcn_s_buffer_load: {
10769 unsigned CPol = Op.getConstantOperandVal(3);
10770 // s_buffer_load, because of how it's optimized, can't be volatile
10771 // so reject ones with the volatile bit set.
10772 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10773 ? AMDGPU::CPol::ALL
10774 : AMDGPU::CPol::ALL_pregfx12))
10775 return Op;
10776 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10777 Op.getOperand(3), DAG);
10778 }
10779 case Intrinsic::amdgcn_fdiv_fast:
10780 return lowerFDIV_FAST(Op, DAG);
10781 case Intrinsic::amdgcn_sin:
10782 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10783
10784 case Intrinsic::amdgcn_cos:
10785 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10786
10787 case Intrinsic::amdgcn_mul_u24:
10788 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10789 Op.getOperand(2));
10790 case Intrinsic::amdgcn_mul_i24:
10791 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10792 Op.getOperand(2));
10793
10794 case Intrinsic::amdgcn_log_clamp: {
10795 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10796 return SDValue();
10797
10798 return emitRemovedIntrinsicError(DAG, DL, VT);
10799 }
10800 case Intrinsic::amdgcn_fract:
10801 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10802
10803 case Intrinsic::amdgcn_class:
10804 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10805 Op.getOperand(2));
10806 case Intrinsic::amdgcn_div_fmas:
10807 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10808 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10809
10810 case Intrinsic::amdgcn_div_fixup:
10811 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10812 Op.getOperand(2), Op.getOperand(3));
10813
10814 case Intrinsic::amdgcn_div_scale: {
10815 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10816
10817 // Translate to the operands expected by the machine instruction. The
10818 // first parameter must be the same as the first instruction.
10819 SDValue Numerator = Op.getOperand(1);
10820 SDValue Denominator = Op.getOperand(2);
10821
10822 // Note this order is opposite of the machine instruction's operations,
10823 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10824 // intrinsic has the numerator as the first operand to match a normal
10825 // division operation.
10826
10827 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10828
10829 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10830 Denominator, Numerator);
10831 }
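  // Example (illustrative): @llvm.amdgcn.div.scale(n, d, true) selects the
  // numerator, so the node is built as DIV_SCALE(n, d, n); with false it is
  // DIV_SCALE(d, d, n).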
10832 case Intrinsic::amdgcn_icmp: {
10833 // There is a Pat that handles this variant, so return it as-is.
10834 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10835 Op.getConstantOperandVal(2) == 0 &&
10836 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10837 return Op;
10838 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10839 }
10840 case Intrinsic::amdgcn_fcmp: {
10841 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10842 }
10843 case Intrinsic::amdgcn_ballot:
10844 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10845 case Intrinsic::amdgcn_fmed3:
10846 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10847 Op.getOperand(2), Op.getOperand(3));
10848 case Intrinsic::amdgcn_fdot2:
10849 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10850 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10851 case Intrinsic::amdgcn_fmul_legacy:
10852 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10853 Op.getOperand(2));
10854 case Intrinsic::amdgcn_sbfe:
10855 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10856 Op.getOperand(2), Op.getOperand(3));
10857 case Intrinsic::amdgcn_ubfe:
10858 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10859 Op.getOperand(2), Op.getOperand(3));
10860 case Intrinsic::amdgcn_cvt_pkrtz:
10861 case Intrinsic::amdgcn_cvt_pknorm_i16:
10862 case Intrinsic::amdgcn_cvt_pknorm_u16:
10863 case Intrinsic::amdgcn_cvt_pk_i16:
10864 case Intrinsic::amdgcn_cvt_pk_u16: {
10865 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10866 EVT VT = Op.getValueType();
10867 unsigned Opcode;
10868
10869 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10870 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10871 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10872 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10873 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10874 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10875 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10876 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10877 else
10878 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10879
10880 if (isTypeLegal(VT))
10881 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10882
10883 SDValue Node =
10884 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10885 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10886 }
10887 case Intrinsic::amdgcn_fmad_ftz:
10888 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10889 Op.getOperand(2), Op.getOperand(3));
10890
10891 case Intrinsic::amdgcn_if_break:
10892 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10893 Op->getOperand(1), Op->getOperand(2)),
10894 0);
10895
10896 case Intrinsic::amdgcn_groupstaticsize: {
10897 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10898 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10899 return Op;
10900
10901 const Module *M = MF.getFunction().getParent();
10902 const GlobalValue *GV =
10903 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10904 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10905 SIInstrInfo::MO_ABS32_LO);
10906 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10907 }
10908 case Intrinsic::amdgcn_is_shared:
10909 case Intrinsic::amdgcn_is_private: {
10910 SDLoc SL(Op);
10911 SDValue SrcVec =
10912 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10913 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10914 DAG.getConstant(1, SL, MVT::i32));
10915
10916 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10917 ? AMDGPUAS::LOCAL_ADDRESS
10918 : AMDGPUAS::PRIVATE_ADDRESS;
10919 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10920 Subtarget->hasGloballyAddressableScratch()) {
10921 SDValue FlatScratchBaseHi(
10922 DAG.getMachineNode(
10923 AMDGPU::S_MOV_B32, DL, MVT::i32,
10924 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10925 0);
10926 // Test bits 63..58 against the aperture address.
10927 return DAG.getSetCC(
10928 SL, MVT::i1,
10929 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10930 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10931 }
10932
10933 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10934 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10935 }
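  // Example (illustrative): in the globally addressable scratch path above,
  // (SrcHi ^ FlatScratchBaseHi) u< (1 << 26) holds exactly when the top six
  // bits of the high dword (address bits 63..58) match the scratch base.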
10936 case Intrinsic::amdgcn_perm:
10937 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10938 Op.getOperand(2), Op.getOperand(3));
10939 case Intrinsic::amdgcn_reloc_constant: {
10940 Module *M = MF.getFunction().getParent();
10941 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10942 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10943 auto *RelocSymbol = cast<GlobalVariable>(
10944 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10945 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10946 SIInstrInfo::MO_ABS32_LO);
10947 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10948 }
10949 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10950 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10951 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10952 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10953 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10954 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10955 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10956 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10957 if (Op.getOperand(4).getValueType() == MVT::i32)
10958 return SDValue();
10959
10960 SDLoc SL(Op);
10961 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10962 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10963 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10964 Op.getOperand(3), IndexKeyi32);
10965 }
10966 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10967 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10968 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10969 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10970 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10971 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10972 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10973 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10974 if (Op.getOperand(4).getValueType() == MVT::i64)
10975 return SDValue();
10976
10977 SDLoc SL(Op);
10978 auto IndexKeyi64 =
10979 Op.getOperand(4).getValueType() == MVT::v2i32
10980 ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
10981 : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10982 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10983 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10984 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10985 Op.getOperand(6)});
10986 }
10987 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10988 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10989 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10990 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10991 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10992 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10993 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10994 ? MVT::i64
10995 : MVT::i32;
10996 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10997 return SDValue();
10998
10999 SDLoc SL(Op);
11000 auto IndexKey =
11001 Op.getOperand(6).getValueType().isVector()
11002 ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
11003 : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
11004 SmallVector<SDValue, 10> Args{
11005 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11006 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11007 IndexKey, Op.getOperand(7), Op.getOperand(8)};
11008 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11009 Args.push_back(Op.getOperand(9));
11010 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
11011 }
11012 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11013 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11014 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11015 if (Op.getOperand(6).getValueType() == MVT::i32)
11016 return SDValue();
11017
11018 SDLoc SL(Op);
11019 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
11020 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11021 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11022 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11023 IndexKeyi32, Op.getOperand(7)});
11024 }
11025 case Intrinsic::amdgcn_addrspacecast_nonnull:
11026 return lowerADDRSPACECAST(Op, DAG);
11027 case Intrinsic::amdgcn_readlane:
11028 case Intrinsic::amdgcn_readfirstlane:
11029 case Intrinsic::amdgcn_writelane:
11030 case Intrinsic::amdgcn_permlane16:
11031 case Intrinsic::amdgcn_permlanex16:
11032 case Intrinsic::amdgcn_permlane64:
11033 case Intrinsic::amdgcn_set_inactive:
11034 case Intrinsic::amdgcn_set_inactive_chain_arg:
11035 case Intrinsic::amdgcn_mov_dpp8:
11036 case Intrinsic::amdgcn_update_dpp:
11037 return lowerLaneOp(*this, Op.getNode(), DAG);
11038 case Intrinsic::amdgcn_dead: {
11039 SmallVector<SDValue, 8> Poisons;
11040 for (const EVT ValTy : Op.getNode()->values())
11041 Poisons.push_back(DAG.getPOISON(ValTy));
11042 return DAG.getMergeValues(Poisons, SDLoc(Op));
11043 }
11044 case Intrinsic::amdgcn_wave_shuffle:
11045 return lowerWaveShuffle(*this, Op.getNode(), DAG);
11046 default:
11047 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11048 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11049 return lowerImage(Op, ImageDimIntr, DAG, false);
11050
11051 return Op;
11052 }
11053}
11054
11055// On targets that do not support a constant in the soffset field, turn a
11056// zero into SGPR_NULL to avoid generating an extra s_mov with zero.
11057static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
11058 const GCNSubtarget *Subtarget) {
11059 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
11060 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
11061 return SOffset;
11062}
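// Example (illustrative): on such subtargets a zero soffset becomes the
// SGPR_NULL register operand, which reads as zero, instead of materializing
// the constant with s_mov_b32.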
11063
11064SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
11065 SelectionDAG &DAG,
11066 unsigned NewOpcode) const {
11067 SDLoc DL(Op);
11068
11069 SDValue VData = Op.getOperand(2);
11070 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11071 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11072 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11073 SDValue Ops[] = {
11074 Op.getOperand(0), // Chain
11075 VData, // vdata
11076 Rsrc, // rsrc
11077 DAG.getConstant(0, DL, MVT::i32), // vindex
11078 VOffset, // voffset
11079 SOffset, // soffset
11080 Offset, // offset
11081 Op.getOperand(6), // cachepolicy
11082 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11083 };
11084
11085 auto *M = cast<MemSDNode>(Op);
11086
11087 EVT MemVT = VData.getValueType();
11088 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11089 M->getMemOperand());
11090}
11091
11092SDValue
11093SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
11094 unsigned NewOpcode) const {
11095 SDLoc DL(Op);
11096
11097 SDValue VData = Op.getOperand(2);
11098 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11099 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11100 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11101 SDValue Ops[] = {
11102 Op.getOperand(0), // Chain
11103 VData, // vdata
11104 Rsrc, // rsrc
11105 Op.getOperand(4), // vindex
11106 VOffset, // voffset
11107 SOffset, // soffset
11108 Offset, // offset
11109 Op.getOperand(7), // cachepolicy
11110 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11111 };
11112
11113 auto *M = cast<MemSDNode>(Op);
11114
11115 EVT MemVT = VData.getValueType();
11116 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11117 M->getMemOperand());
11118}
11119
11120SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
11121 SelectionDAG &DAG) const {
11122 unsigned IntrID = Op.getConstantOperandVal(1);
11123 SDLoc DL(Op);
11124
11125 switch (IntrID) {
11126 case Intrinsic::amdgcn_ds_ordered_add:
11127 case Intrinsic::amdgcn_ds_ordered_swap: {
11128 MemSDNode *M = cast<MemSDNode>(Op);
11129 SDValue Chain = M->getOperand(0);
11130 SDValue M0 = M->getOperand(2);
11131 SDValue Value = M->getOperand(3);
11132 unsigned IndexOperand = M->getConstantOperandVal(7);
11133 unsigned WaveRelease = M->getConstantOperandVal(8);
11134 unsigned WaveDone = M->getConstantOperandVal(9);
11135
11136 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11137 IndexOperand &= ~0x3f;
11138 unsigned CountDw = 0;
11139
11140 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
11141 CountDw = (IndexOperand >> 24) & 0xf;
11142 IndexOperand &= ~(0xf << 24);
11143
11144 if (CountDw < 1 || CountDw > 4) {
11145 const Function &Fn = DAG.getMachineFunction().getFunction();
11146 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11147 Fn, "ds_ordered_count: dword count must be between 1 and 4",
11148 DL.getDebugLoc()));
11149 CountDw = 1;
11150 }
11151 }
11152
11153 if (IndexOperand) {
11154 const Function &Fn = DAG.getMachineFunction().getFunction();
11155 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11156 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
11157 }
11158
11159 if (WaveDone && !WaveRelease) {
11160 // TODO: Move this to IR verifier
11161 const Function &Fn = DAG.getMachineFunction().getFunction();
11162 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11163 Fn, "ds_ordered_count: wave_done requires wave_release",
11164 DL.getDebugLoc()));
11165 }
11166
11167 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11168 unsigned ShaderType =
11169 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
11170 unsigned Offset0 = OrderedCountIndex << 2;
11171 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11172
11173 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
11174 Offset1 |= (CountDw - 1) << 6;
11175
11176 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
11177 Offset1 |= ShaderType << 2;
11178
11179 unsigned Offset = Offset0 | (Offset1 << 8);
11180
11181 SDValue Ops[] = {
11182 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
11183 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
11184 };
11185 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
11186 M->getVTList(), Ops, M->getMemoryVT(),
11187 M->getMemOperand());
11188 }
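  // Offset encoding sketch (illustrative): index 1 with wave_release = 1 and
  // wave_done = 0 for ds_ordered_add gives Offset0 = 1 << 2 = 4 and
  // Offset1 = 1, so the immediate is 4 | (1 << 8) = 0x104 before the
  // per-generation dword-count and shader-type fields are OR'd in.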
11189 case Intrinsic::amdgcn_raw_buffer_load:
11190 case Intrinsic::amdgcn_raw_ptr_buffer_load:
11191 case Intrinsic::amdgcn_raw_atomic_buffer_load:
11192 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
11193 case Intrinsic::amdgcn_raw_buffer_load_format:
11194 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
11195 const bool IsFormat =
11196 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
11197 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
11198
11199 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11200 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11201 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11202 SDValue Ops[] = {
11203 Op.getOperand(0), // Chain
11204 Rsrc, // rsrc
11205 DAG.getConstant(0, DL, MVT::i32), // vindex
11206 VOffset, // voffset
11207 SOffset, // soffset
11208 Offset, // offset
11209 Op.getOperand(5), // cachepolicy, swizzled buffer
11210 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11211 };
11212
11213 auto *M = cast<MemSDNode>(Op);
11214 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
11215 }
11216 case Intrinsic::amdgcn_struct_buffer_load:
11217 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11218 case Intrinsic::amdgcn_struct_buffer_load_format:
11219 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11220 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11221 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11222 const bool IsFormat =
11223 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11224 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11225
11226 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11227 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11228 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11229 SDValue Ops[] = {
11230 Op.getOperand(0), // Chain
11231 Rsrc, // rsrc
11232 Op.getOperand(3), // vindex
11233 VOffset, // voffset
11234 SOffset, // soffset
11235 Offset, // offset
11236 Op.getOperand(6), // cachepolicy, swizzled buffer
11237 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11238 };
11239
11240 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
11241 }
11242 case Intrinsic::amdgcn_raw_tbuffer_load:
11243 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11244 MemSDNode *M = cast<MemSDNode>(Op);
11245 EVT LoadVT = Op.getValueType();
11246 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11247 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11248 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11249
11250 SDValue Ops[] = {
11251 Op.getOperand(0), // Chain
11252 Rsrc, // rsrc
11253 DAG.getConstant(0, DL, MVT::i32), // vindex
11254 VOffset, // voffset
11255 SOffset, // soffset
11256 Offset, // offset
11257 Op.getOperand(5), // format
11258 Op.getOperand(6), // cachepolicy, swizzled buffer
11259 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11260 };
11261
11262 if (LoadVT.getScalarType() == MVT::f16)
11263 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11264 Ops);
11265 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11266 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11267 DAG);
11268 }
11269 case Intrinsic::amdgcn_struct_tbuffer_load:
11270 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11271 MemSDNode *M = cast<MemSDNode>(Op);
11272 EVT LoadVT = Op.getValueType();
11273 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11274 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11275 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11276
11277 SDValue Ops[] = {
11278 Op.getOperand(0), // Chain
11279 Rsrc, // rsrc
11280 Op.getOperand(3), // vindex
11281 VOffset, // voffset
11282 SOffset, // soffset
11283 Offset, // offset
11284 Op.getOperand(6), // format
11285 Op.getOperand(7), // cachepolicy, swizzled buffer
11286 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11287 };
11288
11289 if (LoadVT.getScalarType() == MVT::f16)
11290 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11291 Ops);
11292 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11293 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11294 DAG);
11295 }
11296 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11298 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11299 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11300 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11301 return lowerStructBufferAtomicIntrin(Op, DAG,
11302 AMDGPUISD::BUFFER_ATOMIC_FADD);
11303 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11305 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11306 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11308 return lowerStructBufferAtomicIntrin(Op, DAG,
11309 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11310 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11311 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11312 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11313 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11314 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11315 return lowerStructBufferAtomicIntrin(Op, DAG,
11316 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11317 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11318 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11319 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11320 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11322 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11323 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11324 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11325 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11326 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11327 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11328 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11329 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11330 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11331 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11332 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11334 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11335 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11336 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11337 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11338 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11339 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11340 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11341 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11342 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11343 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11344 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11346 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11347 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11348 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11349 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11350 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11351 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11352 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11353 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11354 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11355 return lowerStructBufferAtomicIntrin(Op, DAG,
11356 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11357 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11358 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11359 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11360 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11361 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11362 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11363 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11364 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11365 return lowerStructBufferAtomicIntrin(Op, DAG,
11366 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11367 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11368 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11369 return lowerStructBufferAtomicIntrin(Op, DAG,
11370 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11371 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11372 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11373 return lowerStructBufferAtomicIntrin(Op, DAG,
11374 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11375 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11376 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11377 return lowerStructBufferAtomicIntrin(Op, DAG,
11378 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11379 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11380 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11381 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11382 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11384 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11385 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11386 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11387 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11388 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11389 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11390 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11391 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11392 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11393 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11394 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11395 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11396 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11397 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11398 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11399 return lowerStructBufferAtomicIntrin(Op, DAG,
11400 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11401 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11402 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11403 return lowerRawBufferAtomicIntrin(Op, DAG,
11404 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11405 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11406 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11407 return lowerStructBufferAtomicIntrin(Op, DAG,
11408 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11409 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11410 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11411 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
11412 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11413 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11414 SDValue Ops[] = {
11415 Op.getOperand(0), // Chain
11416 Op.getOperand(2), // src
11417 Op.getOperand(3), // cmp
11418 Rsrc, // rsrc
11419 DAG.getConstant(0, DL, MVT::i32), // vindex
11420 VOffset, // voffset
11421 SOffset, // soffset
11422 Offset, // offset
11423 Op.getOperand(7), // cachepolicy
11424 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11425 };
11426 EVT VT = Op.getValueType();
11427 auto *M = cast<MemSDNode>(Op);
11428
11429 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11430 Op->getVTList(), Ops, VT,
11431 M->getMemOperand());
11432 }
11433 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11434 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11435 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11436 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11437 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11438 SDValue Ops[] = {
11439 Op.getOperand(0), // Chain
11440 Op.getOperand(2), // src
11441 Op.getOperand(3), // cmp
11442 Rsrc, // rsrc
11443 Op.getOperand(5), // vindex
11444 VOffset, // voffset
11445 SOffset, // soffset
11446 Offset, // offset
11447 Op.getOperand(8), // cachepolicy
11448 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11449 };
11450 EVT VT = Op.getValueType();
11451 auto *M = cast<MemSDNode>(Op);
11452
11453 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11454 Op->getVTList(), Ops, VT,
11455 M->getMemOperand());
11456 }
11457 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11458 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11459 MemSDNode *M = cast<MemSDNode>(Op);
11460 SDValue NodePtr = M->getOperand(2);
11461 SDValue RayExtent = M->getOperand(3);
11462 SDValue InstanceMask = M->getOperand(4);
11463 SDValue RayOrigin = M->getOperand(5);
11464 SDValue RayDir = M->getOperand(6);
11465 SDValue Offsets = M->getOperand(7);
11466 SDValue TDescr = M->getOperand(8);
11467
11468 assert(NodePtr.getValueType() == MVT::i64);
11469 assert(RayDir.getValueType() == MVT::v3f32);
11470
11471 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11472 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11473 return SDValue();
11474 }
11475
11476 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11477 const unsigned NumVDataDwords = 10;
11478 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11479 int Opcode = AMDGPU::getMIMGOpcode(
11480 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11481 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11482 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11483 assert(Opcode != -1);
11484
11485 SmallVector<SDValue, 16> Ops;
11486 Ops.push_back(NodePtr);
11487 Ops.push_back(DAG.getBuildVector(
11488 MVT::v2i32, DL,
11489 {DAG.getBitcast(MVT::i32, RayExtent),
11490 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11491 Ops.push_back(RayOrigin);
11492 Ops.push_back(RayDir);
11493 Ops.push_back(Offsets);
11494 Ops.push_back(TDescr);
11495 Ops.push_back(M->getChain());
11496
11497 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11498 MachineMemOperand *MemRef = M->getMemOperand();
11499 DAG.setNodeMemRefs(NewNode, {MemRef});
11500 return SDValue(NewNode, 0);
11501 }
11502 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11503 MemSDNode *M = cast<MemSDNode>(Op);
11504 SDValue NodePtr = M->getOperand(2);
11505 SDValue RayExtent = M->getOperand(3);
11506 SDValue RayOrigin = M->getOperand(4);
11507 SDValue RayDir = M->getOperand(5);
11508 SDValue RayInvDir = M->getOperand(6);
11509 SDValue TDescr = M->getOperand(7);
11510
11511 assert(NodePtr.getValueType() == MVT::i32 ||
11512 NodePtr.getValueType() == MVT::i64);
11513 assert(RayDir.getValueType() == MVT::v3f16 ||
11514 RayDir.getValueType() == MVT::v3f32);
11515
11516 if (!Subtarget->hasGFX10_AEncoding()) {
11517 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11518 return SDValue();
11519 }
11520
11521 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11522 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11523 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11524 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11525 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11526 const unsigned NumVDataDwords = 4;
11527 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11528 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11529 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11530 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11531 IsGFX12Plus;
11532 const unsigned BaseOpcodes[2][2] = {
11533 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11534 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11535 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11536 int Opcode;
11537 if (UseNSA) {
11538 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11539 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11540 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11541 : AMDGPU::MIMGEncGfx10NSA,
11542 NumVDataDwords, NumVAddrDwords);
11543 } else {
11544 assert(!IsGFX12Plus);
11545 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11546 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11547 : AMDGPU::MIMGEncGfx10Default,
11548 NumVDataDwords, NumVAddrDwords);
11549 }
11550 assert(Opcode != -1);
11551
11552 SmallVector<SDValue, 16> Ops;
11553
11554 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11555 SmallVector<SDValue, 3> Lanes;
11556 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11557 if (Lanes[0].getValueSizeInBits() == 32) {
11558 for (unsigned I = 0; I < 3; ++I)
11559 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11560 } else {
11561 if (IsAligned) {
11562 Ops.push_back(DAG.getBitcast(
11563 MVT::i32,
11564 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11565 Ops.push_back(Lanes[2]);
11566 } else {
11567 SDValue Elt0 = Ops.pop_back_val();
11568 Ops.push_back(DAG.getBitcast(
11569 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11570 Ops.push_back(DAG.getBitcast(
11571 MVT::i32,
11572 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11573 }
11574 }
11575 };
11576
11577 if (UseNSA && IsGFX11Plus) {
11578 Ops.push_back(NodePtr);
11579 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11580 Ops.push_back(RayOrigin);
11581 if (IsA16) {
11582 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11583 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11584 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11585 for (unsigned I = 0; I < 3; ++I) {
11586 MergedLanes.push_back(DAG.getBitcast(
11587 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11588 {DirLanes[I], InvDirLanes[I]})));
11589 }
11590 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11591 } else {
11592 Ops.push_back(RayDir);
11593 Ops.push_back(RayInvDir);
11594 }
11595 } else {
11596 if (Is64)
11597 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11598 2);
11599 else
11600 Ops.push_back(NodePtr);
11601
11602 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11603 packLanes(RayOrigin, true);
11604 packLanes(RayDir, true);
11605 packLanes(RayInvDir, false);
11606 }
11607
11608 if (!UseNSA) {
11609 // Build a single vector containing all the operands so far prepared.
11610 if (NumVAddrDwords > 12) {
11611 SDValue Undef = DAG.getPOISON(MVT::i32);
11612 Ops.append(16 - Ops.size(), Undef);
11613 }
11614 assert(Ops.size() >= 8 && Ops.size() <= 12);
11615 SDValue MergedOps =
11616 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11617 Ops.clear();
11618 Ops.push_back(MergedOps);
11619 }
11620
11621 Ops.push_back(TDescr);
11622 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11623 Ops.push_back(M->getChain());
11624
11625 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11626 MachineMemOperand *MemRef = M->getMemOperand();
11627 DAG.setNodeMemRefs(NewNode, {MemRef});
11628 return SDValue(NewNode, 0);
11629 }
11630 case Intrinsic::amdgcn_global_atomic_fmin_num:
11631 case Intrinsic::amdgcn_global_atomic_fmax_num:
11632 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11633 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11634 MemSDNode *M = cast<MemSDNode>(Op);
11635 SDValue Ops[] = {
11636 M->getOperand(0), // Chain
11637 M->getOperand(2), // Ptr
11638 M->getOperand(3) // Value
11639 };
11640 unsigned Opcode = 0;
11641 switch (IntrID) {
11642 case Intrinsic::amdgcn_global_atomic_fmin_num:
11643 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11644 Opcode = ISD::ATOMIC_LOAD_FMIN;
11645 break;
11646 }
11647 case Intrinsic::amdgcn_global_atomic_fmax_num:
11648 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11649 Opcode = ISD::ATOMIC_LOAD_FMAX;
11650 break;
11651 }
11652 default:
11653 llvm_unreachable("unhandled atomic opcode");
11654 }
11655 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11656 Ops, M->getMemOperand());
11657 }
11658 case Intrinsic::amdgcn_s_alloc_vgpr: {
11659 SDValue NumVGPRs = Op.getOperand(2);
11660 if (!NumVGPRs->isDivergent())
11661 return Op;
11662
11663 SDValue ReadFirstLaneID =
11664 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11665 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11666 ReadFirstLaneID, NumVGPRs);
11667
11668 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11669 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11670 }
11671 case Intrinsic::amdgcn_s_get_barrier_state:
11672 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11673 SDValue Chain = Op->getOperand(0);
11674 SmallVector<SDValue, 2> Ops;
11675 unsigned Opc;
11676
11677 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11678 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11679 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11680 BarID = (BarID >> 4) & 0x3F;
11681 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11682 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11683 Ops.push_back(K);
11684 Ops.push_back(Chain);
11685 } else {
11686 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11687 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11688 SDValue M0Val;
11689 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11690 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11691 M0Val = SDValue(
11692 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11693 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11694 0);
11695 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11696 } else
11697 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11698 }
11699
11700 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11701 return SDValue(NewMI, 0);
11702 }
11703 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11704 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11705 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11706 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11707 SDValue Chain = Op->getOperand(0);
11708 SDValue Ptr = Op->getOperand(2);
11709 EVT VT = Op->getValueType(0);
11710 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11711 Chain, Ptr, MII->getMemOperand());
11712 }
11713 case Intrinsic::amdgcn_flat_load_monitor_b32:
11714 case Intrinsic::amdgcn_flat_load_monitor_b64:
11715 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11716 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11717 SDValue Chain = Op->getOperand(0);
11718 SDValue Ptr = Op->getOperand(2);
11719 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
11720 Op->getVTList(), {Chain, Ptr},
11721 MII->getMemoryVT(), MII->getMemOperand());
11722 }
11723 case Intrinsic::amdgcn_global_load_monitor_b32:
11724 case Intrinsic::amdgcn_global_load_monitor_b64:
11725 case Intrinsic::amdgcn_global_load_monitor_b128: {
11726 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11727 SDValue Chain = Op->getOperand(0);
11728 SDValue Ptr = Op->getOperand(2);
11729 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
11730 Op->getVTList(), {Chain, Ptr},
11731 MII->getMemoryVT(), MII->getMemOperand());
11732 }
11733 default:
11734
11735 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11736 AMDGPU::getImageDimIntrinsicInfo(IntrID))
11737 return lowerImage(Op, ImageDimIntr, DAG, true);
11738
11739 return SDValue();
11740 }
11741}
11742
11743// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11744// dwordx4 if on SI and handle TFE loads.
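// Illustrative sketch of the TFE path below (example values, not upstream
// text): for an f32 TFE load, VTList is {f32, i32, Other}. The value needs
// one dword, so the node is rebuilt to return v2i32 (data dword + status
// dword); element 1 becomes the status result and element 0 is bitcast back
// to f32 before the results are merged.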
11745SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11746 SDVTList VTList,
11747 ArrayRef<SDValue> Ops, EVT MemVT,
11748 MachineMemOperand *MMO,
11749 SelectionDAG &DAG) const {
11750 LLVMContext &C = *DAG.getContext();
11751 MachineFunction &MF = DAG.getMachineFunction();
11752 EVT VT = VTList.VTs[0];
11753
11754 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11755 bool IsTFE = VTList.NumVTs == 3;
11756 if (IsTFE) {
11757 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11758 unsigned NumOpDWords = NumValueDWords + 1;
11759 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11760 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11761 MachineMemOperand *OpDWordsMMO =
11762 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11763 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11764 OpDWordsVT, OpDWordsMMO, DAG);
11765 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11766 DAG.getVectorIdxConstant(NumValueDWords, DL));
11767 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11768 SDValue ValueDWords =
11769 NumValueDWords == 1
11770 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11771 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
11772 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11773 ZeroIdx);
11774 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11775 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11776 }
11777
11778 if (!Subtarget->hasDwordx3LoadStores() &&
11779 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11780 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11781 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11782 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11783 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11784 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11785 WidenedMemVT, WidenedMMO);
11786 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
11787 DAG.getVectorIdxConstant(0, DL));
11788 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11789 }
11790
11791 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11792}
11793
11794SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11795 bool ImageStore) const {
11796 EVT StoreVT = VData.getValueType();
11797
11798 // No change for f16 and legal vector D16 types.
11799 if (!StoreVT.isVector())
11800 return VData;
11801
11802 SDLoc DL(VData);
11803 unsigned NumElements = StoreVT.getVectorNumElements();
11804
11805 if (Subtarget->hasUnpackedD16VMem()) {
11806 // We need to unpack the packed data to store.
11807 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11808 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11809
11810 EVT EquivStoreVT =
11811 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11812 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11813 return DAG.UnrollVectorOp(ZExt.getNode());
11814 }
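// Illustrative sketch (not upstream text): with unpacked D16 memory, a v2f16
// value <a, b> becomes one element per dword rather than one packed dword:
//   v2f16 --bitcast--> v2i16 --zext--> v2i32 {zext(a), zext(b)}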
11815
11816 // The sq block of gfx8.1 does not estimate register use correctly for d16
11817 // image store instructions. The data operand is computed as if it were not a
11818 // d16 image instruction.
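// Illustrative example (assumed, not upstream text): a v4f16 image store
// normally passes 2 VGPRs of packed data, but the gfx8.1 SQ counts registers
// as if the store were unpacked. The workaround below emits the packed dwords
// and pads with poison up to the unpacked element count (4 dwords for v4f16)
// so the hardware's register-use estimate is satisfied.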
11819 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11820 // Bitcast to i16
11821 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11822 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11823
11824 // Decompose into scalars
11825 SmallVector<SDValue, 4> Elts;
11826 DAG.ExtractVectorElements(IntVData, Elts);
11827
11828 // Group pairs of i16 into v2i16 and bitcast to i32
11829 SmallVector<SDValue, 4> PackedElts;
11830 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11831 SDValue Pair =
11832 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11833 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11834 PackedElts.push_back(IntPair);
11835 }
11836 if ((NumElements % 2) == 1) {
11837 // Handle v3i16
11838 unsigned I = Elts.size() / 2;
11839 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11840 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11841 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11842 PackedElts.push_back(IntPair);
11843 }
11844
11845 // Pad with poison values
11846 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11847
11848 // Build final vector
11849 EVT VecVT =
11850 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11851 return DAG.getBuildVector(VecVT, DL, PackedElts);
11852 }
11853
11854 if (NumElements == 3) {
11855 EVT IntStoreVT =
11856 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11857 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11858
11859 EVT WidenedStoreVT = EVT::getVectorVT(
11860 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11861 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11862 WidenedStoreVT.getStoreSizeInBits());
11863 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11864 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11865 }
11866
11867 assert(isTypeLegal(StoreVT));
11868 return VData;
11869}
11870
11871static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11872 switch (Intr) {
11873 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11874 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11875 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11876 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11877 case Intrinsic::amdgcn_load_async_to_lds:
11878 case Intrinsic::amdgcn_global_load_async_lds:
11879 return true;
11880 }
11881 return false;
11882}
11883
11884SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11885 SelectionDAG &DAG) const {
11886 SDLoc DL(Op);
11887 SDValue Chain = Op.getOperand(0);
11888 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11889
11890 switch (IntrinsicID) {
11891 case Intrinsic::amdgcn_exp_compr: {
11892 if (!Subtarget->hasCompressedExport()) {
11893 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11894 DAG.getMachineFunction().getFunction(),
11895 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11896 }
11897 SDValue Src0 = Op.getOperand(4);
11898 SDValue Src1 = Op.getOperand(5);
11899 // Hack around illegal type on SI by directly selecting it.
11900 if (isTypeLegal(Src0.getValueType()))
11901 return SDValue();
11902
11903 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11904 SDValue Undef = DAG.getPOISON(MVT::f32);
11905 const SDValue Ops[] = {
11906 Op.getOperand(2), // tgt
11907 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11908 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11909 Undef, // src2
11910 Undef, // src3
11911 Op.getOperand(7), // vm
11912 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11913 Op.getOperand(3), // en
11914 Op.getOperand(0) // Chain
11915 };
11916
11917 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11918 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11919 }
11920
11921 case Intrinsic::amdgcn_struct_tbuffer_store:
11922 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11923 SDValue VData = Op.getOperand(2);
11924 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11925 if (IsD16)
11926 VData = handleD16VData(VData, DAG);
11927 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11928 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11929 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11930 SDValue Ops[] = {
11931 Chain,
11932 VData, // vdata
11933 Rsrc, // rsrc
11934 Op.getOperand(4), // vindex
11935 VOffset, // voffset
11936 SOffset, // soffset
11937 Offset, // offset
11938 Op.getOperand(7), // format
11939 Op.getOperand(8), // cachepolicy, swizzled buffer
11940 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11941 };
11942 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11943 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11944 MemSDNode *M = cast<MemSDNode>(Op);
11945 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11946 M->getMemoryVT(), M->getMemOperand());
11947 }
11948
11949 case Intrinsic::amdgcn_raw_tbuffer_store:
11950 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11951 SDValue VData = Op.getOperand(2);
11952 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11953 if (IsD16)
11954 VData = handleD16VData(VData, DAG);
11955 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11956 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11957 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11958 SDValue Ops[] = {
11959 Chain,
11960 VData, // vdata
11961 Rsrc, // rsrc
11962 DAG.getConstant(0, DL, MVT::i32), // vindex
11963 VOffset, // voffset
11964 SOffset, // soffset
11965 Offset, // offset
11966 Op.getOperand(6), // format
11967 Op.getOperand(7), // cachepolicy, swizzled buffer
11968 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11969 };
11970 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11971 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11972 MemSDNode *M = cast<MemSDNode>(Op);
11973 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11974 M->getMemoryVT(), M->getMemOperand());
11975 }
11976
11977 case Intrinsic::amdgcn_raw_buffer_store:
11978 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11979 case Intrinsic::amdgcn_raw_buffer_store_format:
11980 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11981 const bool IsFormat =
11982 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11983 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11984
11985 SDValue VData = Op.getOperand(2);
11986 EVT VDataVT = VData.getValueType();
11987 EVT EltType = VDataVT.getScalarType();
11988 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11989 if (IsD16) {
11990 VData = handleD16VData(VData, DAG);
11991 VDataVT = VData.getValueType();
11992 }
11993
11994 if (!isTypeLegal(VDataVT)) {
11995 VData =
11996 DAG.getNode(ISD::BITCAST, DL,
11997 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11998 }
11999
12000 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12001 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12002 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12003 SDValue Ops[] = {
12004 Chain,
12005 VData,
12006 Rsrc,
12007 DAG.getConstant(0, DL, MVT::i32), // vindex
12008 VOffset, // voffset
12009 SOffset, // soffset
12010 Offset, // offset
12011 Op.getOperand(6), // cachepolicy, swizzled buffer
12012 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12013 };
12014 unsigned Opc =
12015 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12016 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12017 MemSDNode *M = cast<MemSDNode>(Op);
12018
12019 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12020 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12021 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
12022
12023 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12024 M->getMemoryVT(), M->getMemOperand());
12025 }
12026
12027 case Intrinsic::amdgcn_struct_buffer_store:
12028 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12029 case Intrinsic::amdgcn_struct_buffer_store_format:
12030 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12031 const bool IsFormat =
12032 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12033 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12034
12035 SDValue VData = Op.getOperand(2);
12036 EVT VDataVT = VData.getValueType();
12037 EVT EltType = VDataVT.getScalarType();
12038 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12039
12040 if (IsD16) {
12041 VData = handleD16VData(VData, DAG);
12042 VDataVT = VData.getValueType();
12043 }
12044
12045 if (!isTypeLegal(VDataVT)) {
12046 VData =
12047 DAG.getNode(ISD::BITCAST, DL,
12048 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12049 }
12050
12051 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12052 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12053 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12054 SDValue Ops[] = {
12055 Chain,
12056 VData,
12057 Rsrc,
12058 Op.getOperand(4), // vindex
12059 VOffset, // voffset
12060 SOffset, // soffset
12061 Offset, // offset
12062 Op.getOperand(7), // cachepolicy, swizzled buffer
12063 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12064 };
12065 unsigned Opc =
12066 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12067 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12068 MemSDNode *M = cast<MemSDNode>(Op);
12069
12070 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12071 EVT VDataType = VData.getValueType().getScalarType();
12072 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12073 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
12074
12075 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12076 M->getMemoryVT(), M->getMemOperand());
12077 }
12078 case Intrinsic::amdgcn_raw_buffer_load_lds:
12079 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12080 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12081 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12082 case Intrinsic::amdgcn_struct_buffer_load_lds:
12083 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12084 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12085 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12086 if (!Subtarget->hasVMemToLDSLoad())
12087 return SDValue();
12088 unsigned Opc;
12089 bool HasVIndex =
12090 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12091 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12092 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12093 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12094 unsigned OpOffset = HasVIndex ? 1 : 0;
12095 SDValue VOffset = Op.getOperand(5 + OpOffset);
12096 bool HasVOffset = !isNullConstant(VOffset);
12097 unsigned Size = Op->getConstantOperandVal(4);
12098
12099 switch (Size) {
12100 default:
12101 return SDValue();
12102 case 1:
12103 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12104 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12105 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12106 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12107 break;
12108 case 2:
12109 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12110 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12111 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12112 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12113 break;
12114 case 4:
12115 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12116 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12117 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12118 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12119 break;
12120 case 12:
12121 if (!Subtarget->hasLDSLoadB96_B128())
12122 return SDValue();
12123 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12124 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12125 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12126 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12127 break;
12128 case 16:
12129 if (!Subtarget->hasLDSLoadB96_B128())
12130 return SDValue();
12131 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12132 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12133 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12134 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12135 break;
12136 }
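// Example (illustrative): Size == 4 with both a vindex and a nonzero voffset
// selects BUFFER_LOAD_DWORD_LDS_BOTHEN; with neither, the plain-offset form
// BUFFER_LOAD_DWORD_LDS_OFFSET is used.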
12137
12138 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12139
12140 SmallVector<SDValue, 8> Ops;
12141
12142 if (HasVIndex && HasVOffset)
12143 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
12144 {Op.getOperand(5), // VIndex
12145 VOffset}));
12146 else if (HasVIndex)
12147 Ops.push_back(Op.getOperand(5));
12148 else if (HasVOffset)
12149 Ops.push_back(VOffset);
12150
12151 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
12152 Ops.push_back(Rsrc);
12153 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
12154 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
12155 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
12156 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
12157 Ops.push_back(DAG.getTargetConstant(
12158 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
12159 DL, MVT::i8)); // cpol
12160 Ops.push_back(DAG.getTargetConstant(
12161 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
12162 ? 1
12163 : 0,
12164 DL, MVT::i8)); // swz
12165 Ops.push_back(
12166 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12167 Ops.push_back(M0Val.getValue(0)); // Chain
12168 Ops.push_back(M0Val.getValue(1)); // Glue
12169
12170 auto *M = cast<MemSDNode>(Op);
12171 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
12172 DAG.setNodeMemRefs(Load, M->memoperands());
12173
12174 return SDValue(Load, 0);
12175 }
12176 // Buffers are handled by LowerBufferFatPointers, and we're going to go
12177 // for "trust me" that the remaining cases are global pointers until
12178 // such time as we can put two mem operands on an intrinsic.
12179 case Intrinsic::amdgcn_load_to_lds:
12180 case Intrinsic::amdgcn_load_async_to_lds:
12181 case Intrinsic::amdgcn_global_load_lds:
12182 case Intrinsic::amdgcn_global_load_async_lds: {
12183 if (!Subtarget->hasVMemToLDSLoad())
12184 return SDValue();
12185
12186 unsigned Opc;
12187 unsigned Size = Op->getConstantOperandVal(4);
12188 switch (Size) {
12189 default:
12190 return SDValue();
12191 case 1:
12192 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12193 break;
12194 case 2:
12195 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12196 break;
12197 case 4:
12198 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12199 break;
12200 case 12:
12201 if (!Subtarget->hasLDSLoadB96_B128())
12202 return SDValue();
12203 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12204 break;
12205 case 16:
12206 if (!Subtarget->hasLDSLoadB96_B128())
12207 return SDValue();
12208 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12209 break;
12210 }
12211
12212 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12213
12214 SmallVector<SDValue, 6> Ops;
12215
12216 SDValue Addr = Op.getOperand(2); // Global ptr
12217 SDValue VOffset;
12218 // Try to split SAddr and VOffset. Global and LDS pointers share the same
12219 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
12220 if (Addr->isDivergent() && Addr->isAnyAdd()) {
12221 SDValue LHS = Addr.getOperand(0);
12222 SDValue RHS = Addr.getOperand(1);
12223
12224 if (LHS->isDivergent())
12225 std::swap(LHS, RHS);
12226
12227 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
12228 RHS.getOperand(0).getValueType() == MVT::i32) {
12229 // add (i64 sgpr), (zero_extend (i32 vgpr))
12230 Addr = LHS;
12231 VOffset = RHS.getOperand(0);
12232 }
12233 }
12234
12235 Ops.push_back(Addr);
12236 if (!Addr->isDivergent()) {
12237 Opc = AMDGPU::getGlobalSaddrOp(Opc);
12238 if (!VOffset)
12239 VOffset =
12240 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
12241 DAG.getTargetConstant(0, DL, MVT::i32)),
12242 0);
12243 Ops.push_back(VOffset);
12244 }
12245
12246 Ops.push_back(Op.getOperand(5)); // Offset
12247
12248 unsigned Aux = Op.getConstantOperandVal(6);
12249 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
12250 MVT::i32)); // CPol
12251 Ops.push_back(
12252 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12253
12254 Ops.push_back(M0Val.getValue(0)); // Chain
12255 Ops.push_back(M0Val.getValue(1)); // Glue
12256
12257 auto *M = cast<MemSDNode>(Op);
12258 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12259 DAG.setNodeMemRefs(Load, M->memoperands());
12260
12261 return SDValue(Load, 0);
12262 }
12263 case Intrinsic::amdgcn_end_cf:
12264 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
12265 Op->getOperand(2), Chain),
12266 0);
12267 case Intrinsic::amdgcn_s_barrier_signal_var: {
12268 // A member count of 0 means to re-use a previous member count,
12269 // which, if the named barrier is statically chosen, means we can use
12270 // the immarg form. Otherwise, fall through to constructing M0 as for
12271 // s_barrier_init.
12272 SDValue CntOp = Op->getOperand(3);
12273 auto *CntC = dyn_cast<ConstantSDNode>(CntOp);
12274 if (CntC && CntC->isZero()) {
12275 SDValue Chain = Op->getOperand(0);
12276 SDValue BarOp = Op->getOperand(2);
12277 SmallVector<SDValue, 2> Ops;
12278
12279 std::optional<uint64_t> BarVal;
12280 if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
12281 BarVal = C->getZExtValue();
12282 else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
12283 if (std::optional<uint32_t> Addr = AMDGPUMachineFunction::getLDSAbsoluteAddress(
12284 *GA->getGlobal()))
12285 BarVal = *Addr + GA->getOffset();
12286
12287 if (BarVal) {
12288 unsigned BarID = (*BarVal >> 4) & 0x3F;
12289 Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
12290 Ops.push_back(Chain);
12291 auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
12292 Op->getVTList(), Ops);
12293 return SDValue(NewMI, 0);
12294 }
12295 }
12296 [[fallthrough]];
12297 }
12298 case Intrinsic::amdgcn_s_barrier_init: {
12299 // These two intrinsics have two operands: the barrier pointer and the member count
12300 SDValue Chain = Op->getOperand(0);
12301 SmallVector<SDValue, 2> Ops;
12302 SDValue BarOp = Op->getOperand(2);
12303 SDValue CntOp = Op->getOperand(3);
12304 SDValue M0Val;
12305 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12306 ? AMDGPU::S_BARRIER_INIT_M0
12307 : AMDGPU::S_BARRIER_SIGNAL_M0;
12308 // extract the BarrierID from bits 4-9 of BarOp
12309 SDValue BarID;
12310 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12311 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12312 BarID =
12313 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
12314 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12315 0);
12316 // Member count should be put into M0[ShAmt+5:ShAmt]
12317 // Barrier ID should be put into M0[5:0]
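// For example (illustrative values only), with ShAmt == 16, a member count
// of 12 and barrier ID 3 produce M0 = (12 << 16) | 3 = 0xC0003.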
12318 M0Val =
12319 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
12320 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12321 0);
12322 constexpr unsigned ShAmt = 16;
12323 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, M0Val,
12324 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
12325
12326 M0Val = SDValue(
12327 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
12328
12329 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12330
12331 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12332 return SDValue(NewMI, 0);
12333 }
12334 case Intrinsic::amdgcn_s_wakeup_barrier: {
12335 if (!Subtarget->hasSWakeupBarrier())
12336 return SDValue();
12337 [[fallthrough]];
12338 }
12339 case Intrinsic::amdgcn_s_barrier_join: {
12340 // These intrinsics have one operand: the barrier pointer
12341 SDValue Chain = Op->getOperand(0);
12342 SmallVector<SDValue, 2> Ops;
12343 SDValue BarOp = Op->getOperand(2);
12344 unsigned Opc;
12345
12346 if (isa<ConstantSDNode>(BarOp)) {
12347 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
12348 switch (IntrinsicID) {
12349 default:
12350 return SDValue();
12351 case Intrinsic::amdgcn_s_barrier_join:
12352 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12353 break;
12354 case Intrinsic::amdgcn_s_wakeup_barrier:
12355 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12356 break;
12357 }
12358 // extract the BarrierID from bits 4-9 of the immediate
12359 unsigned BarID = (BarVal >> 4) & 0x3F;
12360 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
12361 Ops.push_back(K);
12362 Ops.push_back(Chain);
12363 } else {
12364 switch (IntrinsicID) {
12365 default:
12366 return SDValue();
12367 case Intrinsic::amdgcn_s_barrier_join:
12368 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12369 break;
12370 case Intrinsic::amdgcn_s_wakeup_barrier:
12371 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12372 break;
12373 }
12374 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
12375 SDValue M0Val;
12376 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12377 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12378 M0Val =
12379 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
12380 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12381 0);
12382 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12383 }
12384
12385 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12386 return SDValue(NewMI, 0);
12387 }
12388 case Intrinsic::amdgcn_s_prefetch_data: {
12389 // For non-global address space preserve the chain and remove the call.
12390 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
12391 return Op.getOperand(0);
12392 return Op;
12393 }
12394 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12395 SDValue Ops[] = {
12396 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
12397 Op.getOperand(3), // offset
12398 Op.getOperand(4), // length
12399 };
12400
12401 MemSDNode *M = cast<MemSDNode>(Op);
12402 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
12403 Op->getVTList(), Ops, M->getMemoryVT(),
12404 M->getMemOperand());
12405 }
12406 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12407 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12408 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12409 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12410 SDValue Chain = Op->getOperand(0);
12411 SDValue Ptr = Op->getOperand(2);
12412 SDValue Val = Op->getOperand(3);
12413 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
12414 Ptr, MII->getMemOperand());
12415 }
12416 default: {
12417 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12418 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
12419 return lowerImage(Op, ImageDimIntr, DAG, true);
12420
12421 return Op;
12422 }
12423 }
12424}
12425
12426 // Return whether the operation has the NoUnsignedWrap property; an ISD::OR of disjoint bits acts as an add that can never wrap unsigned.
12427static bool isNoUnsignedWrap(SDValue Addr) {
12428 return (Addr.getOpcode() == ISD::ADD &&
12429 Addr->getFlags().hasNoUnsignedWrap()) ||
12430 Addr->getOpcode() == ISD::OR;
12431}
12432
12433 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
12434 EVT PtrVT) const {
12435 return PtrVT == MVT::i64;
12436}
12437
12438 bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
12439 EVT PtrVT) const {
12440 return true;
12441}
12442
12443// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
12444// offset (the offset that is included in bounds checking and swizzling, to be
12445// split between the instruction's voffset and immoffset fields) and soffset
12446// (the offset that is excluded from bounds checking and swizzling, to go in
12447// the instruction's soffset field). This function takes the first kind of
12448// offset and figures out how to split it between voffset and immoffset.
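// Worked example (illustrative, assuming a maximum immediate of 4095): an
// offset of 4100 splits into voffset = 4096 and immoffset = 4, because
// 4100 & ~4095 == 4096 (the CSE-friendly power-of-2 overflow) and
// 4100 & 4095 == 4.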
12449std::pair<SDValue, SDValue>
12450SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
12451 SDLoc DL(Offset);
12452 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
12453 SDValue N0 = Offset;
12454 ConstantSDNode *C1 = nullptr;
12455
12456 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
12457 N0 = SDValue();
12458 else if (DAG.isBaseWithConstantOffset(N0)) {
12459 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12460 // being added, so we can only safely match a 32-bit addition with no
12461 // unsigned overflow.
12462 bool CheckNUW = Subtarget->hasGFX1250Insts();
12463 if (!CheckNUW || isNoUnsignedWrap(N0)) {
12464 C1 = cast<ConstantSDNode>(N0.getOperand(1));
12465 N0 = N0.getOperand(0);
12466 }
12467 }
12468
12469 if (C1) {
12470 unsigned ImmOffset = C1->getZExtValue();
12471 // If the immediate value is too big for the immoffset field, put only bits
12472 // that would normally fit in the immoffset field. The remaining value that
12473 // is copied/added for the voffset field is a large power of 2, and it
12474 // stands a better chance of being CSEd with the copy/add for another similar
12475 // load/store.
12476 // However, do not do that rounding down if the remaining value would be
12477 // negative, as it appears to be illegal to have a negative offset in the
12478 // vgpr, even if adding the immediate offset makes it positive.
12479 unsigned Overflow = ImmOffset & ~MaxImm;
12480 ImmOffset -= Overflow;
12481 if ((int32_t)Overflow < 0) {
12482 Overflow += ImmOffset;
12483 ImmOffset = 0;
12484 }
12485 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
12486 if (Overflow) {
12487 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
12488 if (!N0)
12489 N0 = OverflowVal;
12490 else {
12491 SDValue Ops[] = {N0, OverflowVal};
12492 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
12493 }
12494 }
12495 }
12496 if (!N0)
12497 N0 = DAG.getConstant(0, DL, MVT::i32);
12498 if (!C1)
12499 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
12500 return {N0, SDValue(C1, 0)};
12501}
12502
12503// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12504// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12505// pointed to by Offsets.
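// Illustrative summary (not upstream text): a fully constant combined offset
// may be split entirely into soffset/instoffset with voffset = 0; a
// base-plus-constant offset keeps the base in voffset; anything else falls
// back to voffset = CombinedOffset with a zero soffset.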
12506void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12507 SelectionDAG &DAG, SDValue *Offsets,
12508 Align Alignment) const {
12509 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12510 SDLoc DL(CombinedOffset);
12511 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
12512 uint32_t Imm = C->getZExtValue();
12513 uint32_t SOffset, ImmOffset;
12514 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12515 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
12516 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12517 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12518 return;
12519 }
12520 }
12521 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
12522 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12523 // being added, so we can only safely match a 32-bit addition with no
12524 // unsigned overflow.
12525 bool CheckNUW = Subtarget->hasGFX1250Insts();
12526 SDValue N0 = CombinedOffset.getOperand(0);
12527 SDValue N1 = CombinedOffset.getOperand(1);
12528 uint32_t SOffset, ImmOffset;
12529 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
12530 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
12531 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12532 Offsets[0] = N0;
12533 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12534 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12535 return;
12536 }
12537 }
12538
12539 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12540 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
12541 : DAG.getConstant(0, DL, MVT::i32);
12542
12543 Offsets[0] = CombinedOffset;
12544 Offsets[1] = SOffsetZero;
12545 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
12546}
12547
12548SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12549 SelectionDAG &DAG) const {
12550 if (!MaybePointer.getValueType().isScalarInteger())
12551 return MaybePointer;
12552
12553 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12554 return Rsrc;
12555}
12556
12557// Wrap a global or flat pointer into a buffer intrinsic using the flags
12558// specified in the intrinsic.
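// Illustrative layout for the legacy (non-45-bit num_records) path, inferred
// from the code below:
//   word0 = base[31:0]
//   word1 = base[47:32] | (stride << 16)
//   word2 = num_records
//   word3 = flags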
12559SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12560 SelectionDAG &DAG) const {
12561 SDLoc Loc(Op);
12562
12563 SDValue Pointer = Op->getOperand(1);
12564 SDValue Stride = Op->getOperand(2);
12565 SDValue NumRecords = Op->getOperand(3);
12566 SDValue Flags = Op->getOperand(4);
12567
12568 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12569 SDValue Rsrc;
12570
12571 if (Subtarget->has45BitNumRecordsBufferResource()) {
12572 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12573 // Build the lower 64-bit value, which holds the 57-bit base address and the
12574 // low 7 bits of num_records.
12575 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12576 SDValue NumRecordsLHS =
12577 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12578 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12579 SDValue LowHalf =
12580 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12581
12582 // Build the upper 64-bit value, which holds the upper 38 bits of num_records,
12583 // 6 zero bits (omitted), the 16-bit stride and scale, and the 4-bit flags.
12584 SDValue NumRecordsRHS =
12585 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12586 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12587 SDValue ShiftedStride =
12588 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12589 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12590 SDValue ExtShiftedStrideVec =
12591 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12592 SDValue ExtShiftedStride =
12593 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12594 SDValue ShiftedFlags =
12595 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12596 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12597 SDValue ExtShiftedFlagsVec =
12598 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12599 SDValue ExtShiftedFlags =
12600 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12601 SDValue CombinedFields =
12602 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12603 SDValue HighHalf =
12604 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12605
12606 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12607 } else {
12608 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12609 auto [LowHalf, HighHalf] =
12610 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12611 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12612 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12613 SDValue ShiftedStride =
12614 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12615 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12616 SDValue NewHighHalf =
12617 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12618
12619 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12620 NumRecords, Flags);
12621 }
12622
12623 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12624 return RsrcPtr;
12625}
12626
12627 // Handle 8-bit and 16-bit buffer loads.
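// Illustrative sketch (not upstream text): an i8 load becomes
// BUFFER_LOAD_UBYTE yielding an i32 result, which is then truncated to i8
// (and bitcast back when the requested type is a 16-bit float).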
12628SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12629 EVT LoadVT, SDLoc DL,
12630 ArrayRef<SDValue> Ops,
12631 MachineMemOperand *MMO,
12632 bool IsTFE) const {
12633 EVT IntVT = LoadVT.changeTypeToInteger();
12634
12635 if (IsTFE) {
12636 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12637 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12638 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12639 MachineFunction &MF = DAG.getMachineFunction();
12640 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
12641 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12642 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
12643 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12644 DAG.getConstant(1, DL, MVT::i32));
12645 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12646 DAG.getConstant(0, DL, MVT::i32));
12647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
12648 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
12649 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
12650 }
12651
12652 unsigned Opc = LoadVT.getScalarType() == MVT::i8
12653 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12654 : AMDGPUISD::BUFFER_LOAD_USHORT;
12655
12656 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
12657 SDValue BufferLoad =
12658 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
12659 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
12660 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
12661
12662 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
12663}
12664
12665 // Handle 8-bit and 16-bit buffer stores.
12666SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12667 EVT VDataType, SDLoc DL,
12668 SDValue Ops[],
12669 MemSDNode *M) const {
12670 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12671 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12672
12673 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12674 Ops[1] = BufferStoreExt;
12675 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12676 : AMDGPUISD::BUFFER_STORE_SHORT;
12677 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12678 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12679 M->getMemOperand());
12680}
12681
12682 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12683 SDValue Op, const SDLoc &SL, EVT VT) {
12684 if (VT.bitsLT(Op.getValueType()))
12685 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
12686
12687 switch (ExtType) {
12688 case ISD::SEXTLOAD:
12689 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
12690 case ISD::ZEXTLOAD:
12691 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
12692 case ISD::EXTLOAD:
12693 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
12694 case ISD::NON_EXTLOAD:
12695 return Op;
12696 }
12697
12698 llvm_unreachable("invalid ext type");
12699}
12700
12701 // Try to turn 8- and 16-bit scalar loads into SMEM-eligible 32-bit loads.
12702// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
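// Illustrative sketch (not upstream text): a uniform, 4-byte-aligned
//   %v = load i8, ptr addrspace(4) %p
// is widened to a 32-bit scalar load and then narrowed in-register, roughly:
//   %w = load i32, ptr addrspace(4) %p   ; selects to s_load_dword
//   %v = trunc (and i32 %w, 255) to i8   ; zero-extend-in-reg + truncate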
12703SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
12704 DAGCombinerInfo &DCI) const {
12705 SelectionDAG &DAG = DCI.DAG;
12706 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
12707 return SDValue();
12708
12709 // FIXME: Constant loads should all be marked invariant.
12710 unsigned AS = Ld->getAddressSpace();
12711 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
12712 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
12713 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
12714 return SDValue();
12715
12716 // Don't do this early, since it may interfere with adjacent load merging for
12717 // illegal types. We can avoid losing alignment information for exotic types
12718 // pre-legalize.
12719 EVT MemVT = Ld->getMemoryVT();
12720 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12721 MemVT.getSizeInBits() >= 32)
12722 return SDValue();
12723
12724 SDLoc SL(Ld);
12725
12726 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
12727 "unexpected vector extload");
12728
12729 // TODO: Drop only high part of range.
12730 SDValue Ptr = Ld->getBasePtr();
12731 SDValue NewLoad = DAG.getLoad(
12732 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
12733 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
12734 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
12735 nullptr); // Drop ranges
12736
12737 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12738 if (MemVT.isFloatingPoint()) {
12739 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
12740 "unexpected fp extload");
12741 TruncVT = MemVT.changeTypeToInteger();
12742 }
12743
12744 SDValue Cvt = NewLoad;
12745 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12746 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12747 DAG.getValueType(TruncVT));
12748 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12749 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
12750 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12751 } else {
12752 assert(Ld->getExtensionType() == ISD::EXTLOAD);
12753 }
12754
12755 EVT VT = Ld->getValueType(0);
12756 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12757
12758 DCI.AddToWorklist(Cvt.getNode());
12759
12760 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12761 // the appropriate extension from the 32-bit load.
12762 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12763 DCI.AddToWorklist(Cvt.getNode());
12764
12765 // Handle conversion back to floating point if necessary.
12766 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12767
12768 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12769}
12770
12771 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12772 const SIMachineFunctionInfo &Info) {
12773 // TODO: Should check if the address can definitely not access stack.
12774 if (Info.isEntryFunction())
12775 return Info.getUserSGPRInfo().hasFlatScratchInit();
12776 return true;
12777}
12778
12779SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12780 SDLoc DL(Op);
12781 LoadSDNode *Load = cast<LoadSDNode>(Op);
12782 ISD::LoadExtType ExtType = Load->getExtensionType();
12783 EVT MemVT = Load->getMemoryVT();
12784 MachineMemOperand *MMO = Load->getMemOperand();
12785
12786 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12787 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12788 return SDValue();
12789
12790 // FIXME: Copied from PPC
12791 // First, load into 32 bits, then truncate to 1 bit.
12792
12793 SDValue Chain = Load->getChain();
12794 SDValue BasePtr = Load->getBasePtr();
12795
12796 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12797
12798 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12799 RealMemVT, MMO);
12800
12801 if (!MemVT.isVector()) {
12802 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12803 NewLD.getValue(1)};
12804
12805 return DAG.getMergeValues(Ops, DL);
12806 }
12807
12808 SmallVector<SDValue, 3> Elts;
12809 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12810 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12811 DAG.getConstant(I, DL, MVT::i32));
12812
12813 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12814 }
12815
12816 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12817
12818 return DAG.getMergeValues(Ops, DL);
12819 }
12820
12821 if (!MemVT.isVector())
12822 return SDValue();
12823
12824 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12825 "Custom lowering for non-i32 vectors hasn't been implemented.");
12826
12827 Align Alignment = Load->getAlign();
12828 unsigned AS = Load->getAddressSpace();
12829 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12830 AS == AMDGPUAS::FLAT_ADDRESS &&
12831 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12832 return SplitVectorLoad(Op, DAG);
12833 }
12834
12835 MachineFunction &MF = DAG.getMachineFunction();
12836 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12837 // If there is a possibility that flat instructions access scratch memory
12838 // then we need to use the same legalization rules we use for private.
12839 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12840 !Subtarget->hasMultiDwordFlatScratchAddressing())
12841 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12842 ? AMDGPUAS::PRIVATE_ADDRESS
12843 : AMDGPUAS::GLOBAL_ADDRESS;
12844
12845 unsigned NumElements = MemVT.getVectorNumElements();
12846
12847 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12848 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12849 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12850 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12851 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12852 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12853 Alignment >= Align(4) && NumElements < 32) {
12854 if (MemVT.isPow2VectorType() ||
12855 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12856 return SDValue();
12857 return WidenOrSplitVectorLoad(Op, DAG);
12858 }
12859 // Non-uniform loads will be selected to MUBUF instructions, so they
12860 // have the same legalization requirements as global and private
12861 // loads.
12862 //
12863 }
12864 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12865 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12866 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12867 if (NumElements > 4)
12868 return SplitVectorLoad(Op, DAG);
12869 // v3 loads not supported on SI.
12870 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12871 return WidenOrSplitVectorLoad(Op, DAG);
12872
12873 // v3 and v4 loads are supported for private and global memory.
12874 return SDValue();
12875 }
12876 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12877 // Depending on the setting of the private_element_size field in the
12878 // resource descriptor, we can only make private accesses up to a certain
12879 // size.
12880 switch (Subtarget->getMaxPrivateElementSize()) {
12881 case 4: {
12882 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12883 return DAG.getMergeValues({Op0, Op1}, DL);
12884 }
12885 case 8:
12886 if (NumElements > 2)
12887 return SplitVectorLoad(Op, DAG);
12888 return SDValue();
12889 case 16:
12890 // Same as global/flat
12891 if (NumElements > 4)
12892 return SplitVectorLoad(Op, DAG);
12893 // v3 loads not supported on SI.
12894 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12895 return WidenOrSplitVectorLoad(Op, DAG);
12896
12897 return SDValue();
12898 default:
12899 llvm_unreachable("unsupported private_element_size");
12900 }
12901 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12902 unsigned Fast = 0;
12903 auto Flags = Load->getMemOperand()->getFlags();
12904 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
12905 Load->getAlign(), Flags, &Fast) &&
12906 Fast > 1)
12907 return SDValue();
12908
12909 if (MemVT.isVector())
12910 return SplitVectorLoad(Op, DAG);
12911 }
12912
12913 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12914 MemVT, *Load->getMemOperand())) {
12915 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12916 return DAG.getMergeValues({Op0, Op1}, DL);
12917 }
12918
12919 return SDValue();
12920}
12921
12922SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12923 EVT VT = Op.getValueType();
12924 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12925 VT.getSizeInBits() == 512)
12926 return splitTernaryVectorOp(Op, DAG);
12927
12928 assert(VT.getSizeInBits() == 64);
12929
12930 SDLoc DL(Op);
12931 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12932
12933 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12934 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12935
12936 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12937 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12938
12939 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12940 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12941
12942 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12943
12944 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12945 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12946
12947 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12948
12949 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12950 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12951}
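// Illustrative example: a 64-bit select
//   %r = select i1 %c, i64 %a, i64 %b
// is decomposed above into two 32-bit selects over the bitcast halves,
//   %r.lo = select %c, %a.lo, %b.lo
//   %r.hi = select %c, %a.hi, %b.hi
// each of which maps onto a single v_cndmask_b32. The condition is frozen
// first so both halves observe the same value even if %c was poison/undef.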
12952
12953// Catch division cases where we can use shortcuts with rcp and rsq
12954// instructions.
12955SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12956 SelectionDAG &DAG) const {
12957 SDLoc SL(Op);
12958 SDValue LHS = Op.getOperand(0);
12959 SDValue RHS = Op.getOperand(1);
12960 EVT VT = Op.getValueType();
12961 const SDNodeFlags Flags = Op->getFlags();
12962
12963 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12964
12965 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12966 // Without !fpmath accuracy information, we can't do more because we don't
12967 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
12968 // f16 is always accurate enough.
12969 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12970 return SDValue();
12971
12972 if (CLHS->isExactlyValue(1.0)) {
12973 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12974 // the CI documentation have a worst case error of 1 ulp.
12975 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12976 // use it as long as we aren't trying to use denormals.
12977 //
12978 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
12979
12980 // 1.0 / sqrt(x) -> rsq(x)
12981
12982 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12983 // error seems really high at 2^29 ULP.
12984 // 1.0 / x -> rcp(x)
12985 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12986 }
12987
12988 // Same as for 1.0, but expand the sign out of the constant.
12989 if (CLHS->isExactlyValue(-1.0)) {
12990 // -1.0 / x -> rcp (fneg x)
12991 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12992 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12993 }
12994 }
12995
12996 // For f16 and bf16 require afn or arcp.
12997 // For f32 require afn.
12998 if (!AllowInaccurateRcp &&
12999 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
13000 return SDValue();
13001
13002 // Turn into multiply by the reciprocal.
13003 // x / y -> x * (1.0 / y)
13004 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13005 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
13006}
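// Illustrative summary of the shortcuts above when approximate functions are
// allowed:
//   x / y    -> x * rcp(y)
//   1.0 / y  -> rcp(y)
//   -1.0 / y -> rcp(fneg y)
// trading correctly rounded division for a single hardware reciprocal.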
13007
13008SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
13009 SelectionDAG &DAG) const {
13010 SDLoc SL(Op);
13011 SDValue X = Op.getOperand(0);
13012 SDValue Y = Op.getOperand(1);
13013 EVT VT = Op.getValueType();
13014 const SDNodeFlags Flags = Op->getFlags();
13015
13016 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
13017 if (!AllowInaccurateDiv)
13018 return SDValue();
13019
13020 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
13021 SDValue One = DAG.getConstantFP(1.0, SL, VT);
13022
13023 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
13024 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13025
13026 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
13027 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13028 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
13029 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
13030 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
13031 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
13032}
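// In exact arithmetic the sequence above is two Newton-Raphson refinements
// of r ~= 1/y plus a residual correction of the quotient:
//   r1 = r0 + r0*(1 - y*r0)     // Tmp0, then R
//   r2 = r1 + r1*(1 - y*r1)     // Tmp1, then R
//   q  = x * r2                 // Ret
//   result = q + r2*(x - y*q)   // Tmp2 and the final FMA
// Each refinement roughly doubles the number of correct bits in r.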
13033
13034static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13035 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
13036 SDNodeFlags Flags) {
13037 if (GlueChain->getNumValues() <= 1) {
13038 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
13039 }
13040
13041 assert(GlueChain->getNumValues() == 3);
13042
13043 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13044 switch (Opcode) {
13045 default:
13046 llvm_unreachable("no chain equivalent for opcode");
13047 case ISD::FMUL:
13048 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13049 break;
13050 }
13051
13052 return DAG.getNode(Opcode, SL, VTList,
13053 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
13054 Flags);
13055}
13056
13057static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13058 EVT VT, SDValue A, SDValue B, SDValue C,
13059 SDValue GlueChain, SDNodeFlags Flags) {
13060 if (GlueChain->getNumValues() <= 1) {
13061 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
13062 }
13063
13064 assert(GlueChain->getNumValues() == 3);
13065
13066 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13067 switch (Opcode) {
13068 default:
13069 llvm_unreachable("no chain equivalent for opcode");
13070 case ISD::FMA:
13071 Opcode = AMDGPUISD::FMA_W_CHAIN;
13072 break;
13073 }
13074
13075 return DAG.getNode(Opcode, SL, VTList,
13076 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
13077 Flags);
13078}
13079
13080SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
13081 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13082 return FastLowered;
13083
13084 SDLoc SL(Op);
13085 EVT VT = Op.getValueType();
13086 SDValue LHS = Op.getOperand(0);
13087 SDValue RHS = Op.getOperand(1);
13088
13089 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
13090 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
13091
13092 if (VT == MVT::bf16) {
13093 SDValue ExtDiv =
13094 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
13095 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
13096 DAG.getTargetConstant(0, SL, MVT::i32));
13097 }
13098
13099 assert(VT == MVT::f16);
13100
13101 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
13102 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
13103 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
13104 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
13105 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13106 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
13107 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13108 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
13109 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
13110 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
13111 // q16.u = opx(V_CVT_F16_F32, q32.u);
13112 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
13113
13114 // We will use ISD::FMA on targets that don't support ISD::FMAD.
13115 unsigned FMADOpCode =
13116 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f32) ? ISD::FMA : ISD::FMAD;
13117 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
13118 SDValue Rcp =
13119 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
13120 SDValue Quot =
13121 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
13122 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13123 Op->getFlags());
13124 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
13125 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13126 Op->getFlags());
13127 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
13128 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
13129 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
13130 DAG.getConstant(0xff800000, SL, MVT::i32));
13131 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
13132 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
13133 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
13134 DAG.getTargetConstant(0, SL, MVT::i32));
13135 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
13136 Op->getFlags());
13137}
13138
13139// Faster 2.5 ULP division that does not support denormals.
13140SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
13141 SDNodeFlags Flags = Op->getFlags();
13142 SDLoc SL(Op);
13143 SDValue LHS = Op.getOperand(1);
13144 SDValue RHS = Op.getOperand(2);
13145
13146 // TODO: The combiner should probably handle elimination of redundant fabs.
13148 ? RHS
13149 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
13150
13151 const APFloat K0Val(0x1p+96f);
13152 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
13153
13154 const APFloat K1Val(0x1p-32f);
13155 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
13156
13157 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13158
13159 EVT SetCCVT =
13160 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
13161
13162 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
13163
13164 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
13165
13166 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
13167
13168 // rcp does not support denormals.
13169 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
13170
13171 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
13172
13173 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
13174}
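// The pre/post scaling above keeps rcp out of the denormal range. For
// illustration, when |y| > 2^96 the reciprocal 1/|y| < 2^-96 could flush to
// zero, so instead we evaluate
//   x / y = (x * rcp(y * 2^-32)) * 2^-32
// with r3 = 2^-32 acting as both the denominator pre-scale and the result
// post-scale; otherwise r3 = 1.0 and the expression degenerates to x * rcp(y).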
13175
13176// Returns immediate value for setting the F32 denorm mode when using the
13177// S_DENORM_MODE instruction.
13178 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
13179 const SIMachineFunctionInfo *Info,
13180 const GCNSubtarget *ST) {
13181 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
13182 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
13183 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13184 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
13185}
13186
13187SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
13188 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13189 return FastLowered;
13190
13191 // The selection matcher assumes anything with a chain selects to a
13192 // mayRaiseFPException machine instruction. Since we're introducing a chain
13193 // here, we need to explicitly report nofpexcept for the regular fdiv
13194 // lowering.
13195 SDNodeFlags Flags = Op->getFlags();
13196 Flags.setNoFPExcept(true);
13197
13198 SDLoc SL(Op);
13199 SDValue LHS = Op.getOperand(0);
13200 SDValue RHS = Op.getOperand(1);
13201
13202 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13203
13204 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
13205
13206 SDValue DenominatorScaled =
13207 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
13208 SDValue NumeratorScaled =
13209 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
13210
13211 // Denominator is scaled to not be denormal, so using rcp is ok.
13212 SDValue ApproxRcp =
13213 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
13214 SDValue NegDivScale0 =
13215 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
13216
13217 using namespace AMDGPU::Hwreg;
13218 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
13219 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
13220
13221 const MachineFunction &MF = DAG.getMachineFunction();
13222 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13223 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
13224
13225 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
13226 const bool HasDynamicDenormals =
13227 (DenormMode.Input == DenormalMode::Dynamic) ||
13228 (DenormMode.Output == DenormalMode::Dynamic);
13229
13230 SDValue SavedDenormMode;
13231
13232 if (!PreservesDenormals) {
13233 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
13234 // lowering. The chain dependence is insufficient, and we need glue. We do
13235 // not need the glue variants in a strictfp function.
13236
13237 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13238
13239 SDValue Glue = DAG.getEntryNode();
13240 if (HasDynamicDenormals) {
13241 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
13242 DAG.getVTList(MVT::i32, MVT::Glue),
13243 {BitField, Glue});
13244 SavedDenormMode = SDValue(GetReg, 0);
13245
13246 Glue = DAG.getMergeValues(
13247 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
13248 }
13249
13250 SDNode *EnableDenorm;
13251 if (Subtarget->hasDenormModeInst()) {
13252 const SDValue EnableDenormValue =
13253 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
13254
13255 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
13256 EnableDenormValue)
13257 .getNode();
13258 } else {
13259 const SDValue EnableDenormValue =
13260 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
13261 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
13262 {EnableDenormValue, BitField, Glue});
13263 }
13264
13265 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
13266 SDValue(EnableDenorm, 1)};
13267
13268 NegDivScale0 = DAG.getMergeValues(Ops, SL);
13269 }
13270
13271 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
13272 ApproxRcp, One, NegDivScale0, Flags);
13273
13274 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
13275 ApproxRcp, Fma0, Flags);
13276
13277 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
13278 Fma1, Flags);
13279
13280 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
13281 NumeratorScaled, Mul, Flags);
13282
13283 SDValue Fma3 =
13284 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
13285
13286 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
13287 NumeratorScaled, Fma3, Flags);
13288
13289 if (!PreservesDenormals) {
13290 SDNode *DisableDenorm;
13291 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13292 const SDValue DisableDenormValue = getSPDenormModeValue(
13293 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
13294
13295 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13296 DisableDenorm =
13297 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13298 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
13299 .getNode();
13300 } else {
13301 assert(HasDynamicDenormals == (bool)SavedDenormMode);
13302 const SDValue DisableDenormValue =
13303 HasDynamicDenormals
13304 ? SavedDenormMode
13305 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
13306
13307 DisableDenorm = DAG.getMachineNode(
13308 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13309 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
13310 }
13311
13312 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
13313 SDValue(DisableDenorm, 0), DAG.getRoot());
13314 DAG.setRoot(OutputChain);
13315 }
13316
13317 SDValue Scale = NumeratorScaled.getValue(1);
13318 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
13319 {Fma4, Fma1, Fma3, Scale}, Flags);
13320
13321 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
13322}
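// Roughly, the structure above is: scale both operands with DIV_SCALE,
// refine rcp(denominator) and then the quotient with fused multiply-adds
// (toggling the denormal mode so those FMAs do not flush), apply the final
// correction with DIV_FMAS using the scale bit produced by DIV_SCALE, and
// let DIV_FIXUP restore IEEE behavior for special inputs such as zeros,
// infinities, and NaNs.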
13323
13324SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
13325 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
13326 return FastLowered;
13327
13328 SDLoc SL(Op);
13329 SDValue X = Op.getOperand(0);
13330 SDValue Y = Op.getOperand(1);
13331
13332 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
13333
13334 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
13335
13336 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
13337
13338 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
13339
13340 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13341
13342 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
13343
13344 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
13345
13346 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
13347
13348 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
13349
13350 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
13351 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
13352
13353 SDValue Fma4 =
13354 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
13355
13356 SDValue Scale;
13357
13358 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13359 // Work around a hardware bug on SI where the condition output from div_scale
13360 // is not usable.
13361
13362 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
13363
13364 // Figure out the scale to use for div_fmas.
13365 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
13366 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
13367 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
13368 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
13369
13370 SDValue NumHi =
13371 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
13372 SDValue DenHi =
13373 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
13374
13375 SDValue Scale0Hi =
13376 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
13377 SDValue Scale1Hi =
13378 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
13379
13380 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
13381 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
13382 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
13383 } else {
13384 Scale = DivScale1.getValue(1);
13385 }
13386
13387 SDValue Fmas =
13388 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
13389
13390 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
13391}
13392
13393SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
13394 EVT VT = Op.getValueType();
13395
13396 if (VT == MVT::f32)
13397 return LowerFDIV32(Op, DAG);
13398
13399 if (VT == MVT::f64)
13400 return LowerFDIV64(Op, DAG);
13401
13402 if (VT == MVT::f16 || VT == MVT::bf16)
13403 return LowerFDIV16(Op, DAG);
13404
13405 llvm_unreachable("Unexpected type for fdiv");
13406}
13407
13408SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
13409 SDLoc dl(Op);
13410 SDValue Val = Op.getOperand(0);
13411 EVT VT = Val.getValueType();
13412 EVT ResultExpVT = Op->getValueType(1);
13413 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13414
13415 SDValue Mant = DAG.getNode(
13416 ISD::INTRINSIC_WO_CHAIN, dl, VT,
13417 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
13418
13419 SDValue Exp = DAG.getNode(
13420 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
13421 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
13422
13423 if (Subtarget->hasFractBug()) {
13424 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
13425 SDValue Inf =
13426 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
13427
13428 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
13429 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
13430 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
13431 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
13432 }
13433
13434 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
13435 return DAG.getMergeValues({Mant, CastExp}, dl);
13436}
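// For illustration: frexp splits a value into a mantissa with magnitude in
// [0.5, 1.0) and an integral exponent, e.g.
//   frexp(8.0)  -> (0.5, 4)      since 8.0 = 0.5 * 2^4
//   frexp(-3.0) -> (-0.75, 2)    since -3.0 = -0.75 * 2^2
// On subtargets with the fract bug the instructions misbehave on inf/nan
// inputs, which is why the code above patches in exp = 0 and mant = Val for
// non-finite inputs.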
13437
13438SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
13439 SDLoc DL(Op);
13440 StoreSDNode *Store = cast<StoreSDNode>(Op);
13441 EVT VT = Store->getMemoryVT();
13442
13443 if (VT == MVT::i1) {
13444 return DAG.getTruncStore(
13445 Store->getChain(), DL,
13446 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
13447 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
13448 }
13449
13450 assert(VT.isVector() &&
13451 Store->getValue().getValueType().getScalarType() == MVT::i32);
13452
13453 unsigned AS = Store->getAddressSpace();
13454 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13455 AS == AMDGPUAS::FLAT_ADDRESS &&
13456 Store->getAlign().value() < VT.getStoreSize() &&
13457 VT.getSizeInBits() > 32) {
13458 return SplitVectorStore(Op, DAG);
13459 }
13460
13461 MachineFunction &MF = DAG.getMachineFunction();
13462 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13463 // If there is a possibility that flat instructions access scratch memory
13464 // then we need to use the same legalization rules we use for private.
13465 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13466 !Subtarget->hasMultiDwordFlatScratchAddressing())
13467 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
13470
13471 unsigned NumElements = VT.getVectorNumElements();
13472 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
13473 if (NumElements > 4)
13474 return SplitVectorStore(Op, DAG);
13475 // v3 stores not supported on SI.
13476 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13477 return SplitVectorStore(Op, DAG);
13478
13479 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
13480 VT, *Store->getMemOperand()))
13481 return expandUnalignedStore(Store, DAG);
13482
13483 return SDValue();
13484 }
13485 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13486 switch (Subtarget->getMaxPrivateElementSize()) {
13487 case 4:
13488 return scalarizeVectorStore(Store, DAG);
13489 case 8:
13490 if (NumElements > 2)
13491 return SplitVectorStore(Op, DAG);
13492 return SDValue();
13493 case 16:
13494 if (NumElements > 4 ||
13495 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13496 return SplitVectorStore(Op, DAG);
13497 return SDValue();
13498 default:
13499 llvm_unreachable("unsupported private_element_size");
13500 }
13501 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13502 unsigned Fast = 0;
13503 auto Flags = Store->getMemOperand()->getFlags();
13504 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
13505 Store->getAlign(), Flags, &Fast) &&
13506 Fast > 1)
13507 return SDValue();
13508
13509 if (VT.isVector())
13510 return SplitVectorStore(Op, DAG);
13511
13512 return expandUnalignedStore(Store, DAG);
13513 }
13514
13515 // Probably an invalid store. If so we'll end up emitting a selection error.
13516 return SDValue();
13517}
13518
13519// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13520SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13521 SDLoc SL(Op);
13522 assert(!Subtarget->has16BitInsts());
13523 SDNodeFlags Flags = Op->getFlags();
13524 SDValue Ext =
13525 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13526
13527 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13528 SDValue Sqrt =
13529 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13530
13531 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13532 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13533}
13534
13535SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13536 SDLoc DL(Op);
13537 SDNodeFlags Flags = Op->getFlags();
13538 MVT VT = Op.getValueType().getSimpleVT();
13539 const SDValue X = Op.getOperand(0);
13540
13541 if (allowApproxFunc(DAG, Flags)) {
13542 // Instruction is 1ulp but ignores denormals.
13543 return DAG.getNode(
13544 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13545 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
13546 }
13547
13548 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
13549 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
13550
13551 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
13552
13553 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
13554
13555 SDValue SqrtX =
13556 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
13557
13558 SDValue SqrtS;
13559 if (needsDenormHandlingF32(DAG, X, Flags)) {
13560 SDValue SqrtID =
13561 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
13562 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
13563
13564 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
13565 SDValue SqrtSNextDownInt =
13566 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13567 DAG.getAllOnesConstant(DL, MVT::i32));
13568 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
13569
13570 SDValue NegSqrtSNextDown =
13571 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
13572
13573 SDValue SqrtVP =
13574 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
13575
13576 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13577 DAG.getConstant(1, DL, MVT::i32));
13578 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
13579
13580 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
13581 SDValue SqrtVS =
13582 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
13583
13584 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
13585 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
13586
13587 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
13588 Flags);
13589
13590 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
13591 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
13592 Flags);
13593 } else {
13594 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
13595
13596 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
13597
13598 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
13599 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
13600 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
13601
13602 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
13603 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
13604 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
13605
13606 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
13607 SDValue SqrtD =
13608 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
13609 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
13610 }
13611
13612 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
13613
13614 SDValue ScaledDown =
13615 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
13616
13617 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
13618 SDValue IsZeroOrInf =
13619 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13620 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13621
13622 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
13623}
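// A sketch of the denormal-path correction above: with s the hardware sqrt
// estimate of the (possibly scaled) input x, the neighbouring values
// s - 1ulp and s + 1ulp are formed by integer-adding -1/+1 to the bit
// pattern of s. The exact residuals x - (s-1ulp)*s and x - (s+1ulp)*s are
// then computed with fma; their signs tell whether s is one ulp too high or
// too low, and the matching neighbour is selected to tighten the result.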
13624
13625SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13626 // For double type, the SQRT and RSQ instructions don't have the required
13627 // precision, so we apply Goldschmidt's algorithm to improve the result:
13628 //
13629 // y0 = rsq(x)
13630 // g0 = x * y0
13631 // h0 = 0.5 * y0
13632 //
13633 // r0 = 0.5 - h0 * g0
13634 // g1 = g0 * r0 + g0
13635 // h1 = h0 * r0 + h0
13636 //
13637 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13638 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13639 // h2 = h1 * r1 + h1
13640 //
13641 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13642 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13643 //
13644 // sqrt(x) = g3
13645
13646 SDNodeFlags Flags = Op->getFlags();
13647
13648 SDLoc DL(Op);
13649
13650 SDValue X = Op.getOperand(0);
13651 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
13652
13653 SDValue SqrtX = X;
13654 SDValue Scaling;
13655 if (!Flags.hasApproximateFuncs()) {
13656 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
13657 Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
13658
13659 // Scale up input if it is too small.
13660 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
13661 SDValue ScaleUp =
13662 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
13663 SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
13664 }
13665
13666 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
13667
13668 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
13669
13670 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
13671 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
13672
13673 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
13674 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
13675
13676 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
13677
13678 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
13679
13680 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
13681 SDValue SqrtD0 =
13682 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
13683
13684 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
13685
13686 SDValue SqrtRet = SqrtS2;
13687 if (!Flags.hasApproximateFuncs()) {
13688 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
13689 SDValue SqrtD1 =
13690 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
13691
13692 SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
13693
13694 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
13695 SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling,
13696 ScaleDownFactor, ZeroInt);
13697 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
13698 }
13699
13700 // TODO: Check for DAZ and expand to subnormals
13701
13702 SDValue IsZeroOrInf;
13703 if (Flags.hasNoInfs()) {
13704 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
13705 IsZeroOrInf = DAG.getSetCC(DL, MVT::i1, SqrtX, Zero, ISD::SETOEQ);
13706 } else {
13707 IsZeroOrInf =
13708 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13709 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13710 }
13711
13712 // If x is +INF, +0, or -0, use its original value
13713 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
13714 Flags);
13715}
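// For illustration of the scaling: inputs below 2^-767 would lose accuracy
// in the rsq-based iteration, so they are first multiplied by 2^256 via
// ldexp, and the final result is rescaled by 2^-128, using the identity
//   sqrt(x * 2^256) = sqrt(x) * 2^128.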
13716
13717SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13718 SDLoc DL(Op);
13719 EVT VT = Op.getValueType();
13720 SDValue Arg = Op.getOperand(0);
13721 SDValue TrigVal;
13722
13723 // Propagate fast-math flags so that the multiply we introduce can be folded
13724 // if Arg is already the result of a multiply by constant.
13725 auto Flags = Op->getFlags();
13726
13727 // AMDGPUISD nodes of vector type must be unrolled here since
13728 // they will not be expanded elsewhere.
13729 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13730 if (!V.getValueType().isVector())
13731 return V;
13732
13733 return DAG.UnrollVectorOp(V.getNode());
13734 };
13735
13736 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13737
13738 if (Subtarget->hasTrigReducedRange()) {
13739 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13740 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13741 } else {
13742 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13743 }
13744
13745 switch (Op.getOpcode()) {
13746 case ISD::FCOS:
13747 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13748 break;
13749 case ISD::FSIN:
13750 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13751 break;
13752 default:
13753 llvm_unreachable("Wrong trig opcode");
13754 }
13755
13756 return UnrollIfVec(TrigVal);
13757}
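// Illustrative example: the hardware SIN/COS units consume an argument in
// units of full periods, so
//   sin(x) -> SIN_HW(fract(x * 1/(2*pi)))   // subtargets with reduced range
//   sin(x) -> SIN_HW(x * 1/(2*pi))          // otherwise
// where the explicit FRACT performs the range reduction that full-range
// units do internally.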
13758
13759SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13760 SelectionDAG &DAG) const {
13761 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
13762 assert(AtomicNode->isCompareAndSwap());
13763 unsigned AS = AtomicNode->getAddressSpace();
13764
13765 // No custom lowering required for local address space
13766 if (AS == AMDGPUAS::LOCAL_ADDRESS)
13767 return Op;
13768
13769 // Non-local address spaces require custom lowering for atomic compare and
13770 // swap; the cmp and swap values are packed into a v2i32 (v2i64 for _X2)
13771 SDLoc DL(Op);
13772 SDValue ChainIn = Op.getOperand(0);
13773 SDValue Addr = Op.getOperand(1);
13774 SDValue Old = Op.getOperand(2);
13775 SDValue New = Op.getOperand(3);
13776 EVT VT = Op.getValueType();
13777 MVT SimpleVT = VT.getSimpleVT();
13778 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
13779
13780 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
13781 SDValue Ops[] = {ChainIn, Addr, NewOld};
13782
13783 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
13784 Op->getVTList(), Ops, VT,
13785 AtomicNode->getMemOperand());
13786}
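// For illustration: a global i32 cmpxchg therefore becomes an
// ATOMIC_CMP_SWAP node whose data operand is the v2i32 vector <new, old>,
// matching the hardware cmpswap instructions, which read the swap data and
// the compare value from consecutive registers.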
13787
13788//===----------------------------------------------------------------------===//
13789// Custom DAG optimizations
13790//===----------------------------------------------------------------------===//
13791
13792SDValue
13793SITargetLowering::performUCharToFloatCombine(SDNode *N,
13794 DAGCombinerInfo &DCI) const {
13795 EVT VT = N->getValueType(0);
13796 EVT ScalarVT = VT.getScalarType();
13797 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13798 return SDValue();
13799
13800 SelectionDAG &DAG = DCI.DAG;
13801 SDLoc DL(N);
13802
13803 SDValue Src = N->getOperand(0);
13804 EVT SrcVT = Src.getValueType();
13805
13806 // TODO: We could try to match extracting the higher bytes, which would be
13807 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13808 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13809 // about in practice.
13810 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13811 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13812 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13813 DCI.AddToWorklist(Cvt.getNode());
13814
13815 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13816 if (ScalarVT != MVT::f32) {
13817 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13818 DAG.getTargetConstant(0, DL, MVT::i32));
13819 }
13820 return Cvt;
13821 }
13822 }
13823
13824 return SDValue();
13825}
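// For example, when the high 24 bits of the source are provably zero:
//   (f32 (uint_to_fp (and x, 0xff))) -> (CVT_F32_UBYTE0 (and x, 0xff))
// which selects to v_cvt_f32_ubyte0; the mask is then redundant because the
// conversion only reads byte 0.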
13826
13827SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13828 DAGCombinerInfo &DCI) const {
13829 SDValue MagnitudeOp = N->getOperand(0);
13830 SDValue SignOp = N->getOperand(1);
13831
13832 // The generic combine for fcopysign + fp cast is too conservative with
13833 // vectors, and also gets confused by the splitting we will perform here, so
13834 // peek through FP casts.
13835 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13836 SignOp.getOpcode() == ISD::FP_ROUND)
13837 SignOp = SignOp.getOperand(0);
13838
13839 SelectionDAG &DAG = DCI.DAG;
13840 SDLoc DL(N);
13841 EVT SignVT = SignOp.getValueType();
13842
13843 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13844 // lower half with a copy.
13845 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13846 EVT MagVT = MagnitudeOp.getValueType();
13847
13848 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13849
13850 if (MagVT.getScalarType() == MVT::f64) {
13851 EVT F32VT = MagVT.isVector()
13852 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13853 : MVT::v2f32;
13854
13855 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13856
13857 SmallVector<SDValue, 8> NewElts;
13858 for (unsigned I = 0; I != NumElts; ++I) {
13859 SDValue MagLo =
13860 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13861 DAG.getConstant(2 * I, DL, MVT::i32));
13862 SDValue MagHi =
13863 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13864 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13865
13866 SDValue SignOpElt =
13867 MagVT.isVector()
13868 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
13869 SignOp, DAG.getConstant(I, DL, MVT::i32))
13870 : SignOp;
13871
13872 SDValue HiOp =
13873 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13874
13875 SDValue Vector =
13876 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13877
13878 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13879 NewElts.push_back(NewElt);
13880 }
13881
13882 if (NewElts.size() == 1)
13883 return NewElts[0];
13884
13885 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13886 }
13887
13888 if (SignVT.getScalarType() != MVT::f64)
13889 return SDValue();
13890
13891 // Reduce the width of the sign operand; we only need the highest bit.
13892 //
13893 // fcopysign f64:x, f64:y ->
13894 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13895 // TODO: In some cases it might make sense to go all the way to f16.
13896
13897 EVT F32VT = MagVT.isVector()
13898 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13899 : MVT::v2f32;
13900
13901 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13902
13903 SmallVector<SDValue, 8> F32Signs;
13904 for (unsigned I = 0; I != NumElts; ++I) {
13905 // Take sign from odd elements of cast vector
13906 SDValue SignAsF32 =
13907 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13908 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13909 F32Signs.push_back(SignAsF32);
13910 }
13911
13912 SDValue NewSign =
13913 NumElts == 1
13914 ? F32Signs.back()
13915 : DAG.getNode(ISD::BUILD_VECTOR, DL,
13916 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13917 F32Signs);
13918
13919 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13920 NewSign);
13921}
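// Illustrative scalar result of the two rewrites above:
//   fcopysign f64:x, f64:y
//     -> (bitcast (build_vector x.lo32,
//                               (fcopysign (f32 x.hi32),
//                                          (extract_elt (bitcast y), 1))))
// Only the high dword of an f64 holds the sign bit, so the low half is a
// plain copy and the sign source can be narrowed to f32.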
13922
13923// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13924// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13925// bits
13926
13927// This is a variant of
13928// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13929//
13930 // The normal DAG combiner will do this, but only if the add has one use,
13931 // since otherwise it would increase the number of instructions.
13932//
13933// This prevents us from seeing a constant offset that can be folded into a
13934// memory instruction's addressing mode. If we know the resulting add offset of
13935// a pointer can be folded into an addressing offset, we can replace the pointer
13936 // operand with the add of the new constant offset. This eliminates one of the
13937// and may allow the remaining use to also be simplified.
13938//
13939SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13940 EVT MemVT,
13941 DAGCombinerInfo &DCI) const {
13942 SDValue N0 = N->getOperand(0);
13943 SDValue N1 = N->getOperand(1);
13944
13945 // We only do this to handle cases where it's profitable when there are
13946 // multiple uses of the add, so defer to the standard combine.
13947 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13948 return SDValue();
13949
13950 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13951 if (!CN1)
13952 return SDValue();
13953
13954 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13955 if (!CAdd)
13956 return SDValue();
13957
13958 SelectionDAG &DAG = DCI.DAG;
13959
13960 if (N0->getOpcode() == ISD::OR &&
13961 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13962 return SDValue();
13963
13964 // If the resulting offset is too large, we can't fold it into the
13965 // addressing mode offset.
13966 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13967 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13968
13969 AddrMode AM;
13970 AM.HasBaseReg = true;
13971 AM.BaseOffs = Offset.getSExtValue();
13972 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13973 return SDValue();
13974
13975 SDLoc SL(N);
13976 EVT VT = N->getValueType(0);
13977
13978 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13979 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13980
13981 SDNodeFlags Flags;
13982 Flags.setNoUnsignedWrap(
13983 N->getFlags().hasNoUnsignedWrap() &&
13984 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13985
13986 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13987 // be sure that the new left operand is a proper base pointer.
13988 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13989}
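// Worked example of the combine above (assuming the target addressing mode
// can absorb a 64-byte immediate offset):
//   (shl (add x, 16), 2)  ->  (add (shl x, 2), 64)
// The shifted base becomes the pointer operand and 64 folds into the
// instruction's offset field, so the multi-use add no longer stands between
// the memory instruction and its constant offset.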
13990
13991 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to offset
13992/// by the chain and intrinsic ID. Theoretically we would also need to check the
13993/// specific intrinsic, but they all place the pointer operand first.
13994static unsigned getBasePtrIndex(const MemSDNode *N) {
13995 switch (N->getOpcode()) {
13996 case ISD::STORE:
13997 case ISD::INTRINSIC_W_CHAIN:
13998 case ISD::INTRINSIC_VOID:
13999 return 2;
14000 default:
14001 return 1;
14002 }
14003}
14004
14005SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
14006 DAGCombinerInfo &DCI) const {
14007 SelectionDAG &DAG = DCI.DAG;
14008
14009 unsigned PtrIdx = getBasePtrIndex(N);
14010 SDValue Ptr = N->getOperand(PtrIdx);
14011
14012 // TODO: We could also do this for multiplies.
14013 if (Ptr.getOpcode() == ISD::SHL) {
14014 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
14015 N->getMemoryVT(), DCI);
14016 if (NewPtr) {
14017 SmallVector<SDValue, 8> NewOps(N->ops());
14018
14019 NewOps[PtrIdx] = NewPtr;
14020 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
14021 }
14022 }
14023
14024 return SDValue();
14025}
14026
14027static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
14028 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14029 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14030 (Opc == ISD::XOR && Val == 0);
14031}
14032
14033// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
14034// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
14035// integer combine opportunities since most 64-bit operations are decomposed
14036// this way. TODO: We won't want this for SALU especially if it is an inline
14037// immediate.
14038SDValue SITargetLowering::splitBinaryBitConstantOp(
14039 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
14040 const ConstantSDNode *CRHS) const {
14041 uint64_t Val = CRHS->getZExtValue();
14042 uint32_t ValLo = Lo_32(Val);
14043 uint32_t ValHi = Hi_32(Val);
14044 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14045
14046 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
14047 bitOpWithConstantIsReducible(Opc, ValHi)) ||
14048 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
14049 // We have 64-bit scalar and/or/xor, but do not have vector forms.
14050 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
14051 !CRHS->user_begin()->isDivergent())
14052 return SDValue();
14053
14054 // If we need to materialize a 64-bit immediate, it will be split up later
14055 // anyway. Avoid creating the harder to understand 64-bit immediate
14056 // materialization.
14057 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
14058 }
14059
14060 return SDValue();
14061}
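// For example, with a mask whose high half is all ones the whole 64-bit op
// reduces to a single 32-bit operation on the low half:
//   and x:i64, 0xffffffff00001234
//     -> roughly (build_pair (and x.lo, 0x1234), x.hi)
// because (and x.hi, 0xffffffff) is reducible to x.hi itself.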
14062
14063 bool llvm::isBoolSGPR(SDValue V) {
14064 if (V.getValueType() != MVT::i1)
14065 return false;
14066 switch (V.getOpcode()) {
14067 default:
14068 break;
14069 case ISD::SETCC:
14070 case ISD::IS_FPCLASS:
14071 case AMDGPUISD::FP_CLASS:
14072 return true;
14073 case ISD::AND:
14074 case ISD::OR:
14075 case ISD::XOR:
14076 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
14077 case ISD::SADDO:
14078 case ISD::UADDO:
14079 case ISD::SSUBO:
14080 case ISD::USUBO:
14081 case ISD::SMULO:
14082 case ISD::UMULO:
14083 return V.getResNo() == 1;
14084 case ISD::INTRINSIC_WO_CHAIN: {
14085 unsigned IntrinsicID = V.getConstantOperandVal(0);
14086 switch (IntrinsicID) {
14087 case Intrinsic::amdgcn_is_shared:
14088 case Intrinsic::amdgcn_is_private:
14089 return true;
14090 default:
14091 return false;
14092 }
14093
14094 return false;
14095 }
14096 }
14097 return false;
14098}
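// For illustration: this predicate holds for condition-like values such as
// (setcc ...), bitwise combinations of such conditions, and the overflow
// result of the *ADDO/*SUBO/*MULO nodes. These are all selected as lane
// masks held in SGPRs (a 64-bit pair in wave64, a single SGPR in wave32)
// rather than as 0/1 values in a VGPR.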
14099
14100// If a constant has all zeroes or all ones within each byte return it.
14101// Otherwise return 0.
14102 static uint32_t getConstantPermuteMask(uint32_t C) {
14103 // 0xff for any zero byte in the mask
14104 uint32_t ZeroByteMask = 0;
14105 if (!(C & 0x000000ff))
14106 ZeroByteMask |= 0x000000ff;
14107 if (!(C & 0x0000ff00))
14108 ZeroByteMask |= 0x0000ff00;
14109 if (!(C & 0x00ff0000))
14110 ZeroByteMask |= 0x00ff0000;
14111 if (!(C & 0xff000000))
14112 ZeroByteMask |= 0xff000000;
14113 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
14114 if ((NonZeroByteMask & C) != NonZeroByteMask)
14115 return 0; // Partial bytes selected.
14116 return C;
14117}
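// For example: 0x00ff00ff is returned unchanged, since every byte is all
// zeroes or all ones, while 0x00f0ffff yields 0 because byte 2 (0xf0)
// selects only part of a byte.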
14118
14119// Check if a node selects whole bytes from its operand 0 starting at a byte
14120 // boundary while masking the rest. Returns the select mask as used by
14121 // v_perm_b32, or ~0 if it did not succeed.
14122// Note byte select encoding:
14123// value 0-3 selects corresponding source byte;
14124// value 0xc selects zero;
14125// value 0xff selects 0xff.
14126 static uint32_t getPermuteMask(SDValue V) {
14127 assert(V.getValueSizeInBits() == 32);
14128
14129 if (V.getNumOperands() != 2)
14130 return ~0;
14131
14132 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
14133 if (!N1)
14134 return ~0;
14135
14136 uint32_t C = N1->getZExtValue();
14137
14138 switch (V.getOpcode()) {
14139 default:
14140 break;
14141 case ISD::AND:
14142 if (uint32_t ConstMask = getConstantPermuteMask(C))
14143 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14144 break;
14145
14146 case ISD::OR:
14147 if (uint32_t ConstMask = getConstantPermuteMask(C))
14148 return (0x03020100 & ~ConstMask) | ConstMask;
14149 break;
14150
14151 case ISD::SHL:
14152 if (C % 8)
14153 return ~0;
14154
14155 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
14156
14157 case ISD::SRL:
14158 if (C % 8)
14159 return ~0;
14160
14161 return uint32_t(0x0c0c0c0c03020100ull >> C);
14162 }
14163
14164 return ~0;
14165}
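// Worked examples of the resulting byte-select masks:
//   and x, 0x0000ffff -> 0x0c0c0100  (upper bytes zero, bytes 0-1 kept)
//   shl x, 16         -> 0x01000c0c  (bytes 0-1 move to bytes 2-3)
//   srl x, 8          -> 0x0c030201  (bytes 1-3 move down one byte)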
14166
14167SDValue SITargetLowering::performAndCombine(SDNode *N,
14168 DAGCombinerInfo &DCI) const {
14169 if (DCI.isBeforeLegalize())
14170 return SDValue();
14171
14172 SelectionDAG &DAG = DCI.DAG;
14173 EVT VT = N->getValueType(0);
14174 SDValue LHS = N->getOperand(0);
14175 SDValue RHS = N->getOperand(1);
14176
14177 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
14178 if (VT == MVT::i64 && CRHS) {
14179 if (SDValue Split =
14180 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
14181 return Split;
14182 }
14183
14184 if (CRHS && VT == MVT::i32) {
14185 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
14186 // nb = number of trailing zeroes in mask
14187 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
14188 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
14189 uint64_t Mask = CRHS->getZExtValue();
14190 unsigned Bits = llvm::popcount(Mask);
14191 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
14192 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
14193 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
14194 unsigned Shift = CShift->getZExtValue();
14195 unsigned NB = CRHS->getAPIntValue().countr_zero();
14196 unsigned Offset = NB + Shift;
14197 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
14198 SDLoc SL(N);
14199 SDValue BFE =
14200 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
14201 DAG.getConstant(Offset, SL, MVT::i32),
14202 DAG.getConstant(Bits, SL, MVT::i32));
14203 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
14204 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
14205 DAG.getValueType(NarrowVT));
14206 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
14207 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
14208 return Shl;
14209 }
14210 }
14211 }
14212
14213 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14214 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
14215 isa<ConstantSDNode>(LHS.getOperand(2))) {
14216 uint32_t Sel = getConstantPermuteMask(Mask);
14217 if (!Sel)
14218 return SDValue();
14219
14220 // Select 0xc for all zero bytes
14221 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
14222 SDLoc DL(N);
14223 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14224 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14225 }
14226 }
14227
14228 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
14229 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
14230 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
14231 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14232 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
14233
14234 SDValue X = LHS.getOperand(0);
14235 SDValue Y = RHS.getOperand(0);
14236 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
14237 !isTypeLegal(X.getValueType()))
14238 return SDValue();
14239
14240 if (LCC == ISD::SETO) {
14241 if (X != LHS.getOperand(1))
14242 return SDValue();
14243
14244 if (RCC == ISD::SETUNE) {
14245 const ConstantFPSDNode *C1 =
14246 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
14247 if (!C1 || !C1->isInfinity() || C1->isNegative())
14248 return SDValue();
14249
14250 const uint32_t Mask = SIInstrFlags::N_NORMAL |
14251 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
14252 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
14253 SIInstrFlags::P_NORMAL;
14254
14255 static_assert(
14256 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
14257 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
14258 0x3ff) == Mask,
14259 "mask not equal");
14260
14261 SDLoc DL(N);
14262 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
14263 DAG.getConstant(Mask, DL, MVT::i32));
14264 }
14265 }
14266 }
14267
14268 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14269 std::swap(LHS, RHS);
14270
14271 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14272 RHS.hasOneUse()) {
14273 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14274 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
14275 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
14276 // | n_nan)
14277 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14278 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
14279 (RHS.getOperand(0) == LHS.getOperand(0) &&
14280 LHS.getOperand(0) == LHS.getOperand(1))) {
14281 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
14282 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
14283 : Mask->getZExtValue() & OrdMask;
14284
14285 SDLoc DL(N);
14286 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
14287 DAG.getConstant(NewMask, DL, MVT::i32));
14288 }
14289 }
14290
14291 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
14292 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
14293 // and x, (sext cc from i1) => select cc, x, 0
14294 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
14295 std::swap(LHS, RHS);
14296 if (isBoolSGPR(RHS.getOperand(0)))
14297 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
14298 DAG.getConstant(0, SDLoc(N), MVT::i32));
14299 }
14300
14301 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14302 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14303 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14304 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14305 uint32_t LHSMask = getPermuteMask(LHS);
14306 uint32_t RHSMask = getPermuteMask(RHS);
14307 if (LHSMask != ~0u && RHSMask != ~0u) {
14308 // Canonicalize the expression in an attempt to have fewer unique masks
14309 // and therefore fewer registers used to hold the masks.
14310 if (LHSMask > RHSMask) {
14311 std::swap(LHSMask, RHSMask);
14312 std::swap(LHS, RHS);
14313 }
14314
14315 // Select 0xc for each lane used from the source operand. Zero has the 0xc
14316 // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
14317 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14318 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14319
14320 // Check if we need to combine values from two sources within a byte.
14321 if (!(LHSUsedLanes & RHSUsedLanes) &&
14322 // If we select the high and low words, keep it for SDWA.
14323 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14324 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14325 // Each byte in each mask is either a selector mask 0-3, or has higher
14326 // bits set in either of the masks: 0xff selects 0xff and 0x0c selects
14327 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise the
14328 // mask which is not 0xff wins. By anding both masks we have a correct
14329 // result, except that 0x0c shall be corrected to give 0x0c only.
14330 uint32_t Mask = LHSMask & RHSMask;
14331 for (unsigned I = 0; I < 32; I += 8) {
14332 uint32_t ByteSel = 0xff << I;
14333 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14334 Mask &= (0x0c << I) | 0xffffffff ^ ByteSel;
14335 }
14336
14337 // Add 4 to each active LHS lane. It will not affect any existing 0xff
14338 // or 0x0c.
14339 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
14340 SDLoc DL(N);
14341
14342 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14343 RHS.getOperand(0),
14344 DAG.getConstant(Sel, DL, MVT::i32));
14345 }
14346 }
14347 }
14348
14349 return SDValue();
14350}
14351
14352// A key component of v_perm is a mapping between byte position of the src
14353 // operands, and the byte position of the dest. To provide such, we need:
14354 // 1. the node that provides byte x of the dest of the OR, and 2. the byte
14355 // of that node used to provide byte x. calculateByteProvider finds which
14356 // node provides a certain byte of the dest of the OR, and calculateSrcByte
14357 // takes that node and finds the ultimate src and byte position. For
14358 // example, the supported LoadCombine pattern for vector loads is as follows:
14359// t1
14360// or
14361// / \
14362// t2 t3
14363// zext shl
14364// | | \
14365// t4 t5 16
14366// or anyext
14367// / \ |
14368// t6 t7 t8
14369// srl shl or
14370// / | / \ / \
14371// t9 t10 t11 t12 t13 t14
14372// trunc* 8 trunc* 8 and and
14373// | | / | | \
14374// t15 t16 t17 t18 t19 t20
14375// trunc* 255 srl -256
14376// | / \
14377// t15 t15 16
14378//
14379// *In this example, the truncs are from i32->i16
14380//
14381// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
14382// respectively. calculateSrcByte would find (given node) -> ultimate src &
14383 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
14384// After finding the mapping, we can combine the tree into vperm t15, t16,
14385// 0x05000407
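// (Reading the written mask with the selector for dest byte 0 leftmost:
// selectors 4-7 address bytes 0-3 of the first source (t15) and selectors
// 0-3 address bytes 0-3 of the second (t16), so 0x05 picks t15 byte 1,
// 0x00 picks t16 byte 0, 0x04 picks t15 byte 0, and 0x07 picks t15 byte 3,
// matching the mapping above.)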
14386
14387// Find the source and byte position from a node.
14388// \p DestByte is the byte position of the dest of the or that the src
14389 // ultimately provides. \p SrcIndex is the byte of the src that maps to that
14390 // dest byte of the or. \p Depth tracks how many recursive iterations we have
14391// performed.
14392static const std::optional<ByteProvider<SDValue>>
14393calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
14394 unsigned Depth = 0) {
14395 // We may need to recursively traverse a series of SRLs
14396 if (Depth >= 6)
14397 return std::nullopt;
14398
14399 if (Op.getValueSizeInBits() < 8)
14400 return std::nullopt;
14401
14402 if (Op.getValueType().isVector())
14403 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14404
14405 switch (Op->getOpcode()) {
14406 case ISD::TRUNCATE: {
14407 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14408 }
14409
14410 case ISD::ANY_EXTEND:
14411 case ISD::SIGN_EXTEND:
14412 case ISD::ZERO_EXTEND:
14413 case ISD::SIGN_EXTEND_INREG: {
14414 SDValue NarrowOp = Op->getOperand(0);
14415 auto NarrowVT = NarrowOp.getValueType();
14416 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
14417 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14418 NarrowVT = VTSign->getVT();
14419 }
14420 if (!NarrowVT.isByteSized())
14421 return std::nullopt;
14422 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
14423
14424 if (SrcIndex >= NarrowByteWidth)
14425 return std::nullopt;
14426 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14427 }
14428
14429 case ISD::SRA:
14430 case ISD::SRL: {
14431 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14432 if (!ShiftOp)
14433 return std::nullopt;
14434
14435 uint64_t BitShift = ShiftOp->getZExtValue();
14436
14437 if (BitShift % 8 != 0)
14438 return std::nullopt;
14439
14440 SrcIndex += BitShift / 8;
14441
14442 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14443 }
14444
14445 default: {
14446 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14447 }
14448 }
14449 llvm_unreachable("fully handled switch");
14450}
14451
14452// For a byte position in the result of an Or, traverse the tree and find the
14453// node (and the byte of the node) which ultimately provides this {Or,
14454// BytePosition}. \p Op is the operand we are currently examining. \p Index is
14455// the byte position of the Op that corresponds with the originally requested
14456// byte of the Or \p Depth tracks how many recursive iterations we have
14457// performed. \p StartingIndex is the originally requested byte of the Or
14458static const std::optional<ByteProvider<SDValue>>
14459calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
14460 unsigned StartingIndex = 0) {
14461 // Finding Src tree of RHS of or typically requires at least 1 additional
14462 // depth
14463 if (Depth > 6)
14464 return std::nullopt;
14465
14466 unsigned BitWidth = Op.getScalarValueSizeInBits();
14467 if (BitWidth % 8 != 0)
14468 return std::nullopt;
14469 if (Index > BitWidth / 8 - 1)
14470 return std::nullopt;
14471
14472 bool IsVec = Op.getValueType().isVector();
14473 switch (Op.getOpcode()) {
14474 case ISD::OR: {
14475 if (IsVec)
14476 return std::nullopt;
14477
14478 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
14479 StartingIndex);
14480 if (!RHS)
14481 return std::nullopt;
14482 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14483 StartingIndex);
14484 if (!LHS)
14485 return std::nullopt;
14486 // A well formed Or will have two ByteProviders for each byte, one of which
14487 // is constant zero
14488 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14489 return std::nullopt;
14490 if (!LHS || LHS->isConstantZero())
14491 return RHS;
14492 if (!RHS || RHS->isConstantZero())
14493 return LHS;
14494 return std::nullopt;
14495 }
14496
14497 case ISD::AND: {
14498 if (IsVec)
14499 return std::nullopt;
14500
14501 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14502 if (!BitMaskOp)
14503 return std::nullopt;
14504
14505 uint32_t BitMask = BitMaskOp->getZExtValue();
14506 // Bits we expect for our StartingIndex
14507 uint32_t IndexMask = 0xFF << (Index * 8);
14508
14509 if ((IndexMask & BitMask) != IndexMask) {
14510 // If the result of the and partially provides the byte, then it
14511 // is not well formatted
14512 if (IndexMask & BitMask)
14513 return std::nullopt;
14514 return ByteProvider<SDValue>::getConstantZero();
14515 }
14516
14517 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
14518 }
14519
14520 case ISD::FSHR: {
14521 if (IsVec)
14522 return std::nullopt;
14523
14524 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14525 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14526 if (!ShiftOp || Op.getValueType().isVector())
14527 return std::nullopt;
14528
14529 uint64_t BitsProvided = Op.getValueSizeInBits();
14530 if (BitsProvided % 8 != 0)
14531 return std::nullopt;
14532
14533 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14534 if (BitShift % 8)
14535 return std::nullopt;
14536
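// X and Y concatenate to 2 * BitsProvided bits, i.e. BitsProvided / 4 bytes.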
14537 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14538 uint64_t ByteShift = BitShift / 8;
14539
14540 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14541 uint64_t BytesProvided = BitsProvided / 8;
14542 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14543 NewIndex %= BytesProvided;
14544 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
14545 }
14546
14547 case ISD::SRA:
14548 case ISD::SRL: {
14549 if (IsVec)
14550 return std::nullopt;
14551
14552 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14553 if (!ShiftOp)
14554 return std::nullopt;
14555
14556 uint64_t BitShift = ShiftOp->getZExtValue();
14557 if (BitShift % 8)
14558 return std::nullopt;
14559
14560 auto BitsProvided = Op.getScalarValueSizeInBits();
14561 if (BitsProvided % 8 != 0)
14562 return std::nullopt;
14563
14564 uint64_t BytesProvided = BitsProvided / 8;
14565 uint64_t ByteShift = BitShift / 8;
14566 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14567 // If the byte we are trying to provide (as tracked by index) falls in this
14568 // range, then the SRL provides the byte. The byte of interest of the src of
14569 // the SRL is Index + ByteShift
14570 return BytesProvided - ByteShift > Index
14571 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
14572 Index + ByteShift)
14573 : std::nullopt;
14574 }
14575
14576 case ISD::SHL: {
14577 if (IsVec)
14578 return std::nullopt;
14579
14580 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14581 if (!ShiftOp)
14582 return std::nullopt;
14583
14584 uint64_t BitShift = ShiftOp->getZExtValue();
14585 if (BitShift % 8 != 0)
14586 return std::nullopt;
14587 uint64_t ByteShift = BitShift / 8;
14588
14589 // If we are shifting by an amount greater than (or equal to)
14590 // the index we are trying to provide, then it provides 0s. If not,
14591 // then the bytes are not definitively 0s, and the corresponding byte
14592 // of interest is Index - ByteShift of the src
14593 return Index < ByteShift
14594 ? ByteProvider<SDValue>::getConstantZero()
14595 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14596 Depth + 1, StartingIndex);
14597 }
14598 case ISD::ANY_EXTEND:
14599 case ISD::SIGN_EXTEND:
14600 case ISD::ZERO_EXTEND:
14601 case ISD::SIGN_EXTEND_INREG:
14602 case ISD::AssertZext:
14603 case ISD::AssertSext: {
14604 if (IsVec)
14605 return std::nullopt;
14606
14607 SDValue NarrowOp = Op->getOperand(0);
14608 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14609 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14610 Op->getOpcode() == ISD::AssertZext ||
14611 Op->getOpcode() == ISD::AssertSext) {
14612 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14613 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14614 }
14615 if (NarrowBitWidth % 8 != 0)
14616 return std::nullopt;
14617 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14618
14619 if (Index >= NarrowByteWidth)
14620 return Op.getOpcode() == ISD::ZERO_EXTEND
14621 ? std::optional<ByteProvider<SDValue>>(
14622 ByteProvider<SDValue>::getConstantZero())
14623 : std::nullopt;
14624 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14625 }
14626
14627 case ISD::TRUNCATE: {
14628 if (IsVec)
14629 return std::nullopt;
14630
14631 uint64_t NarrowByteWidth = BitWidth / 8;
14632
14633 if (NarrowByteWidth >= Index) {
14634 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14635 StartingIndex);
14636 }
14637
14638 return std::nullopt;
14639 }
14640
14641 case ISD::CopyFromReg: {
14642 if (BitWidth / 8 > Index)
14643 return calculateSrcByte(Op, StartingIndex, Index);
14644
14645 return std::nullopt;
14646 }
14647
14648 case ISD::LOAD: {
14649 auto *L = cast<LoadSDNode>(Op.getNode());
14650
14651 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14652 if (NarrowBitWidth % 8 != 0)
14653 return std::nullopt;
14654 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14655
14656 // If the width of the load does not reach the byte we are trying to provide
14657 // for and it is not a ZEXTLOAD, then the load does not provide for the byte
14658 // in question
14659 if (Index >= NarrowByteWidth) {
14660 return L->getExtensionType() == ISD::ZEXTLOAD
14661 ? std::optional<ByteProvider<SDValue>>(
14662 ByteProvider<SDValue>::getConstantZero())
14663 : std::nullopt;
14664 }
14665
14666 if (NarrowByteWidth > Index) {
14667 return calculateSrcByte(Op, StartingIndex, Index);
14668 }
14669
14670 return std::nullopt;
14671 }
14672
14673 case ISD::BSWAP: {
14674 if (IsVec)
14675 return std::nullopt;
14676
14677 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
14678 Depth + 1, StartingIndex);
14679 }
14680
14681 case ISD::EXTRACT_VECTOR_ELT: {
14682 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14683 if (!IdxOp)
14684 return std::nullopt;
14685 auto VecIdx = IdxOp->getZExtValue();
14686 auto ScalarSize = Op.getScalarValueSizeInBits();
14687 if (ScalarSize < 32)
14688 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14689 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
14690 StartingIndex, Index);
14691 }
14692
14693 case AMDGPUISD::PERM: {
14694 if (IsVec)
14695 return std::nullopt;
14696
14697 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14698 if (!PermMask)
14699 return std::nullopt;
14700
14701 auto IdxMask =
14702 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14703 if (IdxMask > 0x07 && IdxMask != 0x0c)
14704 return std::nullopt;
14705
14706 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14707 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14708
14709 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14710 : std::optional<ByteProvider<SDValue>>(
14711 ByteProvider<SDValue>::getConstantZero());
14712 }
14713
14714 default: {
14715 return std::nullopt;
14716 }
14717 }
14718
14719 llvm_unreachable("fully handled switch");
14720}
14721
14722 // Returns true if the Operand is a scalar extended or loaded from 16 bits
14723static bool isExtendedFrom16Bits(SDValue &Operand) {
14724
14725 switch (Operand.getOpcode()) {
14726 case ISD::ANY_EXTEND:
14727 case ISD::SIGN_EXTEND:
14728 case ISD::ZERO_EXTEND: {
14729 auto OpVT = Operand.getOperand(0).getValueType();
14730 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14731 }
14732 case ISD::LOAD: {
14733 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14734 auto ExtType = L->getExtensionType();
14735 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14736 ExtType == ISD::EXTLOAD) {
14737 auto MemVT = L->getMemoryVT();
14738 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14739 }
14740 return L->getMemoryVT().getSizeInBits() == 16;
14741 }
14742 default:
14743 return false;
14744 }
14745}
14746
14747// Returns true if the mask matches consecutive bytes, and the first byte
14748 // begins at an even (16-bit aligned) byte offset from the 0th byte
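// E.g. Mask = 0x0504 (bytes 4 and 5, starting at an even offset) matches,
// while Mask = 0x0201 (bytes 1 and 2, starting at an odd offset) does not.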
14749static bool addresses16Bits(int Mask) {
14750 int Low8 = Mask & 0xff;
14751 int Hi8 = (Mask & 0xff00) >> 8;
14752
14753 assert(Low8 < 8 && Hi8 < 8);
14754 // Are the bytes contiguous in the order of increasing addresses.
14755 bool IsConsecutive = (Hi8 - Low8 == 1);
14756 // Is the first byte at a location that is aligned for 16 bit instructions.
14757 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
14758 // In this case, we still need code to extract the 16 bit operand, so it
14759 // is better to use i8 v_perm
14760 bool Is16Aligned = !(Low8 % 2);
14761
14762 return IsConsecutive && Is16Aligned;
14763}
14764
14765// Do not lower into v_perm if the operands are actually 16 bit
14766// and the selected bits (based on PermMask) correspond with two
14767// easily addressable 16 bit operands.
14768 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14769 SDValue &OtherOp) {
14770 int Low16 = PermMask & 0xffff;
14771 int Hi16 = (PermMask & 0xffff0000) >> 16;
14772
14773 auto TempOp = peekThroughBitcasts(Op);
14774 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14775
14776 auto OpIs16Bit =
14777 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14778 if (!OpIs16Bit)
14779 return true;
14780
14781 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14782 isExtendedFrom16Bits(TempOtherOp);
14783 if (!OtherOpIs16Bit)
14784 return true;
14785
14786 // Do we cleanly address both
14787 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14788}
14789
14790 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14791 unsigned DWordOffset) {
14792 SDValue Ret;
14793
14794 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14795 // ByteProvider must be at least 8 bits
14796 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14797
14798 if (TypeSize <= 32)
14799 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14800
14801 if (Src.getValueType().isVector()) {
14802 auto ScalarTySize = Src.getScalarValueSizeInBits();
14803 auto ScalarTy = Src.getValueType().getScalarType();
14804 if (ScalarTySize == 32) {
14805 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14806 DAG.getConstant(DWordOffset, SL, MVT::i32));
14807 }
14808 if (ScalarTySize > 32) {
14809 Ret = DAG.getNode(
14810 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14811 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14812 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14813 if (ShiftVal)
14814 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14815 DAG.getConstant(ShiftVal, SL, MVT::i32));
14816 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14817 }
14818
14819 assert(ScalarTySize < 32);
14820 auto NumElements = TypeSize / ScalarTySize;
14821 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14822 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14823 auto NumElementsIn32 = 32 / ScalarTySize;
14824 auto NumAvailElements = DWordOffset < Trunc32Elements
14825 ? NumElementsIn32
14826 : NumElements - NormalizedTrunc;
14827
14828 SmallVector<SDValue, 4> VecSrcs;
14829 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14830 NumAvailElements);
14831
14832 Ret = DAG.getBuildVector(
14833 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14834 VecSrcs);
14835 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14836 }
14837
14838 /// Scalar Type
14839 auto ShiftVal = 32 * DWordOffset;
14840 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14841 DAG.getConstant(ShiftVal, SL, MVT::i32));
14842 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14843}
14844
14845 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14846 SelectionDAG &DAG = DCI.DAG;
14847 [[maybe_unused]] EVT VT = N->getValueType(0);
14848 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14849
14850 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14851 assert(VT == MVT::i32);
14852 for (int i = 0; i < 4; i++) {
14853 // Find the ByteProvider that provides the ith byte of the result of OR
14854 std::optional<ByteProvider<SDValue>> P =
14855 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14856 // TODO support constantZero
14857 if (!P || P->isConstantZero())
14858 return SDValue();
14859
14860 PermNodes.push_back(*P);
14861 }
14862 if (PermNodes.size() != 4)
14863 return SDValue();
14864
14865 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14866 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14867 uint64_t PermMask = 0x00000000;
14868 for (size_t i = 0; i < PermNodes.size(); i++) {
14869 auto PermOp = PermNodes[i];
14870 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14871 // by sizeof(Src2) = 4
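// E.g. byte 2 of the first source is addressed by selector 0x06 (2 + 4),
// while byte 2 of the second source keeps selector 0x02.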
14872 int SrcByteAdjust = 4;
14873
14874 // If the Src uses a byte from a different DWORD, then it corresponds
14875 // with a different source
14876 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14877 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14878 if (SecondSrc)
14879 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14880 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14881 return SDValue();
14882
14883 // Set the index of the second distinct Src node
14884 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14885 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14886 SrcByteAdjust = 0;
14887 }
14888 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14890 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14891 }
14892 SDLoc DL(N);
14893 SDValue Op = *PermNodes[FirstSrc.first].Src;
14894 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14895 assert(Op.getValueSizeInBits() == 32);
14896
14897 // Check that we are not just extracting the bytes in order from an op
14898 if (!SecondSrc) {
14899 int Low16 = PermMask & 0xffff;
14900 int Hi16 = (PermMask & 0xffff0000) >> 16;
14901
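// With a single source, extracting the bytes in order appears as selectors
// 4,5 for the low half (0x0504) and 6,7 for the high half (0x0706); the
// 0x0100 / 0x0302 forms are the same bytes addressed as the second operand.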
14902 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14903 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14904
14905 // The perm op would really just produce Op. So combine into Op
14906 if (WellFormedLow && WellFormedHi)
14907 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14908 }
14909
14910 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14911
14912 if (SecondSrc) {
14913 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14914 assert(OtherOp.getValueSizeInBits() == 32);
14915 }
14916
14917 // Check that we haven't just recreated the same FSHR node.
14918 if (N->getOpcode() == ISD::FSHR &&
14919 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14920 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14921 return SDValue();
14922
14923 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14924
14925 assert(Op.getValueType().isByteSized() &&
14926 OtherOp.getValueType().isByteSized());
14927
14928 // If the ultimate src is less than 32 bits, then we will only be
14929 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14930 // CalculateByteProvider would not have returned Op as source if we
14931 // used a byte that is outside its ValueType. Thus, we are free to
14932 // ANY_EXTEND as the extended bits are don't-cares.
14933 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14934 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14935
14936 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14937 DAG.getConstant(PermMask, DL, MVT::i32));
14938 }
14939 return SDValue();
14940}
14941
14942SDValue SITargetLowering::performOrCombine(SDNode *N,
14943 DAGCombinerInfo &DCI) const {
14944 SelectionDAG &DAG = DCI.DAG;
14945 SDValue LHS = N->getOperand(0);
14946 SDValue RHS = N->getOperand(1);
14947
14948 EVT VT = N->getValueType(0);
14949 if (VT == MVT::i1) {
14950 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14951 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14952 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14953 SDValue Src = LHS.getOperand(0);
14954 if (Src != RHS.getOperand(0))
14955 return SDValue();
14956
14957 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14958 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14959 if (!CLHS || !CRHS)
14960 return SDValue();
14961
14962 // Only 10 bits are used.
14963 static const uint32_t MaxMask = 0x3ff;
14964
14965 uint32_t NewMask =
14966 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14967 SDLoc DL(N);
14968 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14969 DAG.getConstant(NewMask, DL, MVT::i32));
14970 }
14971
14972 return SDValue();
14973 }
14974
14975 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14976 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14977 LHS.getOpcode() == AMDGPUISD::PERM &&
14978 isa<ConstantSDNode>(LHS.getOperand(2))) {
14979 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14980 if (!Sel)
14981 return SDValue();
14982
14983 Sel |= LHS.getConstantOperandVal(2);
14984 SDLoc DL(N);
14985 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14986 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14987 }
14988
14989 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14990 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14991 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14992 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14993
14994 // If all the uses of an or need to extract the individual elements, do not
14995 // attempt to lower into v_perm
14996 auto usesCombinedOperand = [](SDNode *OrUse) {
14997 // If we have any non-vectorized use, then it is a candidate for v_perm
14998 if (OrUse->getOpcode() != ISD::BITCAST ||
14999 !OrUse->getValueType(0).isVector())
15000 return true;
15001
15002 // If we have any non-vectorized use, then it is a candidate for v_perm
15003 for (auto *VUser : OrUse->users()) {
15004 if (!VUser->getValueType(0).isVector())
15005 return true;
15006
15007 // If the use of a vector is a store, then combining via a v_perm
15008 // is beneficial.
15009 // TODO -- whitelist more uses
15010 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
15011 if (VUser->getOpcode() == VectorwiseOp)
15012 return true;
15013 }
15014 return false;
15015 };
15016
15017 if (!any_of(N->users(), usesCombinedOperand))
15018 return SDValue();
15019
15020 uint32_t LHSMask = getPermuteMask(LHS);
15021 uint32_t RHSMask = getPermuteMask(RHS);
15022
15023 if (LHSMask != ~0u && RHSMask != ~0u) {
15024 // Canonicalize the expression in an attempt to have fewer unique masks
15025 // and therefore fewer registers used to hold the masks.
15026 if (LHSMask > RHSMask) {
15027 std::swap(LHSMask, RHSMask);
15028 std::swap(LHS, RHS);
15029 }
15030
15031 // Select 0xc for each lane used from the source operand. A zero byte has
15032 // 0xc in the mask, a 0xff byte has 0xff, and actual lanes are in the 0-3 range.
15033 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15034 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15035
15036 // Check if we need to combine values from two sources within a byte.
15037 if (!(LHSUsedLanes & RHSUsedLanes) &&
15038 // If we select high and lower word keep it for SDWA.
15039 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
15040 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15041 // Kill zero bytes selected by other mask. Zero value is 0xc.
15042 LHSMask &= ~RHSUsedLanes;
15043 RHSMask &= ~LHSUsedLanes;
15044 // Add 4 to each active LHS lane
15045 LHSMask |= LHSUsedLanes & 0x04040404;
15046 // Combine masks
15047 uint32_t Sel = LHSMask | RHSMask;
15048 SDLoc DL(N);
15049
15050 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15051 RHS.getOperand(0),
15052 DAG.getConstant(Sel, DL, MVT::i32));
15053 }
15054 }
15055 if (LHSMask == ~0u || RHSMask == ~0u) {
15056 if (SDValue Perm = matchPERM(N, DCI))
15057 return Perm;
15058 }
15059 }
15060
15061 // Detect identity v2i32 OR and replace with identity source node.
15062 // Specifically an Or that has operands constructed from the same source node
15063 // via extract_vector_elt and build_vector, i.e.
15064 // v2i32 or(
15065 // v2i32 build_vector(
15066 // i32 extract_elt(%IdentitySrc, 0),
15067 // i32 0
15068 // ),
15069 // v2i32 build_vector(
15070 // i32 0,
15071 // i32 extract_elt(%IdentitySrc, 1)
15072 // ) )
15073 // =>
15074 // v2i32 %IdentitySrc
15075
15076 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
15077 RHS->getOpcode() == ISD::BUILD_VECTOR) {
15078
15079 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
15080 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
15081
15082 // Test for and normalise build vectors.
15083 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
15084
15085 // Get the extract_vector_element operands.
15086 SDValue LEVE = LHS->getOperand(0);
15087 SDValue REVE = RHS->getOperand(1);
15088
15089 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15090 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15091 // Check that different elements from the same vector are
15092 // extracted.
15093 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
15094 LEVE->getOperand(1) != REVE->getOperand(1)) {
15095 SDValue IdentitySrc = LEVE.getOperand(0);
15096 return IdentitySrc;
15097 }
15098 }
15099 }
15100 }
15101
15102 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15103 return SDValue();
15104
15105 // TODO: This could be a generic combine with a predicate for extracting the
15106 // high half of an integer being free.
15107
15108 // (or i64:x, (zero_extend i32:y)) ->
15109 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
15110 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
15111 RHS.getOpcode() != ISD::ZERO_EXTEND)
15112 std::swap(LHS, RHS);
15113
15114 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
15115 SDValue ExtSrc = RHS.getOperand(0);
15116 EVT SrcVT = ExtSrc.getValueType();
15117 if (SrcVT == MVT::i32) {
15118 SDLoc SL(N);
15119 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
15120 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
15121
15122 DCI.AddToWorklist(LowOr.getNode());
15123 DCI.AddToWorklist(HiBits.getNode());
15124
15125 SDValue Vec =
15126 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
15127 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
15128 }
15129 }
15130
15131 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
15132 if (CRHS) {
15133 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
15134 N->getOperand(0), CRHS))
15135 return Split;
15136 }
15137
15138 return SDValue();
15139}
15140
15141SDValue SITargetLowering::performXorCombine(SDNode *N,
15142 DAGCombinerInfo &DCI) const {
15143 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
15144 return RV;
15145
15146 SDValue LHS = N->getOperand(0);
15147 SDValue RHS = N->getOperand(1);
15148
15149 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
15150 SelectionDAG &DAG = DCI.DAG;
15151
15152 EVT VT = N->getValueType(0);
15153 if (CRHS && VT == MVT::i64) {
15154 if (SDValue Split =
15155 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
15156 return Split;
15157 }
15158
15159 // v2i32 (xor (vselect cc, x, y), K) ->
15160 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
15161 // replaced with source modifiers when the select is lowered to CNDMASK.
15162 unsigned Opc = LHS.getOpcode();
15163 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
15164 (Opc == ISD::SELECT && VT == MVT::i64)) &&
15165 CRHS && CRHS->getAPIntValue().isSignMask()) {
15166 SDValue CC = LHS->getOperand(0);
15167 SDValue TRUE = LHS->getOperand(1);
15168 SDValue FALSE = LHS->getOperand(2);
15169 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
15170 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
15171 SDValue XSelect =
15172 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
15173 return XSelect;
15174 }
15175
15176 // Make sure to apply the 64-bit constant splitting fold before trying to fold
15177 // fneg-like xors into 64-bit select.
15178 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
15179 // This looks like an fneg, try to fold as a source modifier.
15180 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
15181 shouldFoldFNegIntoSrc(N, LHS)) {
15182 // xor (select c, a, b), 0x80000000 ->
15183 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
15184 SDLoc DL(N);
15185 SDValue CastLHS =
15186 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
15187 SDValue CastRHS =
15188 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
15189 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
15190 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
15191 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
15192 LHS->getOperand(0), FNegLHS, FNegRHS);
15193 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
15194 }
15195 }
15196
15197 return SDValue();
15198}
15199
15200SDValue
15201SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
15202 DAGCombinerInfo &DCI) const {
15203 if (!Subtarget->has16BitInsts() ||
15204 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
15205 return SDValue();
15206
15207 EVT VT = N->getValueType(0);
15208 if (VT != MVT::i32)
15209 return SDValue();
15210
15211 SDValue Src = N->getOperand(0);
15212 if (Src.getValueType() != MVT::i16)
15213 return SDValue();
15214
15215 if (!Src->hasOneUse())
15216 return SDValue();
15217
15218 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
15219 // possible we're missing out on some combine opportunities, but we'd need to
15220 // weigh the cost of extracting the byte from the upper dwords.
15221
15222 std::optional<ByteProvider<SDValue>> BP0 =
15223 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
15224 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15225 return SDValue();
15226 SDValue V0 = *BP0->Src;
15227
15228 std::optional<ByteProvider<SDValue>> BP1 =
15229 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
15230 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15231 return SDValue();
15232
15233 SDValue V1 = *BP1->Src;
15234
15235 if (V0 == V1)
15236 return SDValue();
15237
15238 SelectionDAG &DAG = DCI.DAG;
15239 SDLoc DL(N);
15240 uint32_t PermMask = 0x0c0c0c0c;
15241 if (V0) {
15242 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
15243 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15244 }
15245
15246 if (V1) {
15247 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
15248 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15249 }
15250
15251 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
15252 DAG.getConstant(PermMask, DL, MVT::i32));
15253}
15254
15255SDValue
15256SITargetLowering::performSignExtendInRegCombine(SDNode *N,
15257 DAGCombinerInfo &DCI) const {
15258 SDValue Src = N->getOperand(0);
15259 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
15260
15261 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
15262 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
15263 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15264 VTSign->getVT() == MVT::i8) ||
15265 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15266 VTSign->getVT() == MVT::i16))) {
15267 assert(Subtarget->hasScalarSubwordLoads() &&
15268 "s_buffer_load_{u8, i8} are supported "
15269 "in GFX12 (or newer) architectures.");
15270 EVT VT = Src.getValueType();
15271 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15272 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15273 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15274 SDLoc DL(N);
15275 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15276 SDValue Ops[] = {
15277 Src.getOperand(0), // source register
15278 Src.getOperand(1), // offset
15279 Src.getOperand(2) // cachePolicy
15280 };
15281 auto *M = cast<MemSDNode>(Src);
15282 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15283 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15284 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
15285 return LoadVal;
15286 }
15287 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15288 VTSign->getVT() == MVT::i8) ||
15289 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15290 VTSign->getVT() == MVT::i16)) &&
15291 Src.hasOneUse()) {
15292 auto *M = cast<MemSDNode>(Src);
15293 SDValue Ops[] = {Src.getOperand(0), // Chain
15294 Src.getOperand(1), // rsrc
15295 Src.getOperand(2), // vindex
15296 Src.getOperand(3), // voffset
15297 Src.getOperand(4), // soffset
15298 Src.getOperand(5), // offset
15299 Src.getOperand(6), Src.getOperand(7)};
15300 // replace with BUFFER_LOAD_BYTE/SHORT
15301 SDVTList ResList =
15302 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15303 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15304 ? AMDGPUISD::BUFFER_LOAD_BYTE
15305 : AMDGPUISD::BUFFER_LOAD_SHORT;
15306 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15307 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15308 return DCI.DAG.getMergeValues(
15309 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
15310 }
15311 return SDValue();
15312}
15313
15314SDValue SITargetLowering::performClassCombine(SDNode *N,
15315 DAGCombinerInfo &DCI) const {
15316 SelectionDAG &DAG = DCI.DAG;
15317 SDValue Mask = N->getOperand(1);
15318
15319 // fp_class x, 0 -> false
15320 if (isNullConstant(Mask))
15321 return DAG.getConstant(0, SDLoc(N), MVT::i1);
15322
15323 if (N->getOperand(0).isUndef())
15324 return DAG.getUNDEF(MVT::i1);
15325
15326 return SDValue();
15327}
15328
15329SDValue SITargetLowering::performRcpCombine(SDNode *N,
15330 DAGCombinerInfo &DCI) const {
15331 EVT VT = N->getValueType(0);
15332 SDValue N0 = N->getOperand(0);
15333
15334 if (N0.isUndef()) {
15335 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
15336 SDLoc(N), VT);
15337 }
15338
15339 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
15340 N0.getOpcode() == ISD::SINT_TO_FP)) {
15341 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
15342 N->getFlags());
15343 }
15344
15345 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
15346 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
15347 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
15348 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
15349 N->getFlags());
15350 }
15351
15352 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
15353 }
15354
15355 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
15356 SDNodeFlags UserFlags,
15357 unsigned MaxDepth) const {
15358 unsigned Opcode = Op.getOpcode();
15359 if (Opcode == ISD::FCANONICALIZE)
15360 return true;
15361
15362 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15363 const auto &F = CFP->getValueAPF();
15364 if (F.isNaN() && F.isSignaling())
15365 return false;
15366 if (!F.isDenormal())
15367 return true;
15368
15369 DenormalMode Mode =
15370 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
15371 return Mode == DenormalMode::getIEEE();
15372 }
15373
15374 // If source is a result of another standard FP operation it is already in
15375 // canonical form.
15376 if (MaxDepth == 0)
15377 return false;
15378
15379 switch (Opcode) {
15380 // These will flush denorms if required.
15381 case ISD::FADD:
15382 case ISD::FSUB:
15383 case ISD::FMUL:
15384 case ISD::FCEIL:
15385 case ISD::FFLOOR:
15386 case ISD::FMA:
15387 case ISD::FMAD:
15388 case ISD::FSQRT:
15389 case ISD::FDIV:
15390 case ISD::FREM:
15391 case ISD::FP_ROUND:
15392 case ISD::FP_EXTEND:
15393 case ISD::FP16_TO_FP:
15394 case ISD::FP_TO_FP16:
15395 case ISD::BF16_TO_FP:
15396 case ISD::FP_TO_BF16:
15397 case ISD::FLDEXP:
15398 case AMDGPUISD::FMUL_LEGACY:
15399 case AMDGPUISD::FMAD_FTZ:
15400 case AMDGPUISD::RCP:
15401 case AMDGPUISD::RSQ:
15402 case AMDGPUISD::RSQ_CLAMP:
15403 case AMDGPUISD::RCP_LEGACY:
15404 case AMDGPUISD::RCP_IFLAG:
15405 case AMDGPUISD::LOG:
15406 case AMDGPUISD::EXP:
15407 case AMDGPUISD::DIV_SCALE:
15408 case AMDGPUISD::DIV_FMAS:
15409 case AMDGPUISD::DIV_FIXUP:
15410 case AMDGPUISD::FRACT:
15411 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15412 case AMDGPUISD::CVT_F32_UBYTE0:
15413 case AMDGPUISD::CVT_F32_UBYTE1:
15414 case AMDGPUISD::CVT_F32_UBYTE2:
15415 case AMDGPUISD::CVT_F32_UBYTE3:
15416 case AMDGPUISD::FP_TO_FP16:
15417 case AMDGPUISD::SIN_HW:
15418 case AMDGPUISD::COS_HW:
15419 return true;
15420
15421 // It can/will be lowered or combined as a bit operation.
15422 // Need to check their input recursively to handle.
15423 case ISD::FNEG:
15424 case ISD::FABS:
15425 case ISD::FCOPYSIGN:
15426 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15427
15428 case ISD::AND:
15429 if (Op.getValueType() == MVT::i32) {
15430 // Be careful as we only know it is a bitcast floating point type. It
15431 // could be f32, v2f16, we have no way of knowing. Luckily the constant
15432 // value that we optimize for, which comes up in fp32 to bf16 conversions,
15433 // is valid to optimize for all types.
15434 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15435 if (RHS->getZExtValue() == 0xffff0000) {
15436 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15437 }
15438 }
15439 }
15440 break;
15441
15442 case ISD::FSIN:
15443 case ISD::FCOS:
15444 case ISD::FSINCOS:
15445 return Op.getValueType().getScalarType() != MVT::f16;
15446
15447 case ISD::FMINNUM:
15448 case ISD::FMAXNUM:
15449 case ISD::FMINNUM_IEEE:
15450 case ISD::FMAXNUM_IEEE:
15451 case ISD::FMINIMUM:
15452 case ISD::FMAXIMUM:
15453 case ISD::FMINIMUMNUM:
15454 case ISD::FMAXIMUMNUM:
15455 case AMDGPUISD::CLAMP:
15456 case AMDGPUISD::FMED3:
15457 case AMDGPUISD::FMAX3:
15458 case AMDGPUISD::FMIN3:
15459 case AMDGPUISD::FMAXIMUM3:
15460 case AMDGPUISD::FMINIMUM3: {
15461 // FIXME: Shouldn't treat the generic operations differently based on these.
15462 // However, we aren't really required to flush the result from
15463 // minnum/maxnum..
15464
15465 // snans will be quieted, so we only need to worry about denormals.
15466 if (Subtarget->supportsMinMaxDenormModes() ||
15467 // FIXME: denormalsEnabledForType is broken for dynamic
15468 denormalsEnabledForType(DAG, Op.getValueType()))
15469 return true;
15470
15471 // Flushing may be required.
15472 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15473 // targets we need to check their inputs recursively.
15474
15475 // FIXME: Does this apply with clamp? It's implemented with max.
15476 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15477 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
15478 return false;
15479 }
15480
15481 return true;
15482 }
15483 case ISD::SELECT: {
15484 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
15485 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
15486 }
15487 case ISD::BUILD_VECTOR: {
15488 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15489 SDValue SrcOp = Op.getOperand(i);
15490 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
15491 return false;
15492 }
15493
15494 return true;
15495 }
15496 case ISD::EXTRACT_VECTOR_ELT:
15497 case ISD::EXTRACT_SUBVECTOR: {
15498 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15499 }
15500 case ISD::INSERT_VECTOR_ELT: {
15501 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
15502 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
15503 }
15504 case ISD::UNDEF:
15505 // Could be anything.
15506 return false;
15507
15508 case ISD::BITCAST:
15509 // TODO: This is incorrect as it loses track of the operand's type. We may
15510 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15511 // same bits that are canonicalized in one type need not be in the other.
15512 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15513 case ISD::TRUNCATE: {
15514 // Hack round the mess we make when legalizing extract_vector_elt
15515 if (Op.getValueType() == MVT::i16) {
15516 SDValue TruncSrc = Op.getOperand(0);
15517 if (TruncSrc.getValueType() == MVT::i32 &&
15518 TruncSrc.getOpcode() == ISD::BITCAST &&
15519 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
15520 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
15521 }
15522 }
15523 return false;
15524 }
15525 case ISD::INTRINSIC_WO_CHAIN: {
15526 unsigned IntrinsicID = Op.getConstantOperandVal(0);
15527 // TODO: Handle more intrinsics
15528 switch (IntrinsicID) {
15529 case Intrinsic::amdgcn_cvt_pkrtz:
15530 case Intrinsic::amdgcn_cubeid:
15531 case Intrinsic::amdgcn_frexp_mant:
15532 case Intrinsic::amdgcn_fdot2:
15533 case Intrinsic::amdgcn_rcp:
15534 case Intrinsic::amdgcn_rsq:
15535 case Intrinsic::amdgcn_rsq_clamp:
15536 case Intrinsic::amdgcn_rcp_legacy:
15537 case Intrinsic::amdgcn_rsq_legacy:
15538 case Intrinsic::amdgcn_trig_preop:
15539 case Intrinsic::amdgcn_tanh:
15540 case Intrinsic::amdgcn_log:
15541 case Intrinsic::amdgcn_exp2:
15542 case Intrinsic::amdgcn_sqrt:
15543 return true;
15544 default:
15545 break;
15546 }
15547
15548 break;
15549 }
15550 default:
15551 break;
15552 }
15553
15554 // FIXME: denormalsEnabledForType is broken for dynamic
15555 return denormalsEnabledForType(DAG, Op.getValueType()) &&
15556 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15557}
15558
15559 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
15560 unsigned MaxDepth) const {
15561 const MachineRegisterInfo &MRI = MF.getRegInfo();
15562 MachineInstr *MI = MRI.getVRegDef(Reg);
15563 unsigned Opcode = MI->getOpcode();
15564
15565 if (Opcode == AMDGPU::G_FCANONICALIZE)
15566 return true;
15567
15568 std::optional<FPValueAndVReg> FCR;
15569 // Constant splat (can be padded with undef) or scalar constant.
15570 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
15571 if (FCR->Value.isSignaling())
15572 return false;
15573 if (!FCR->Value.isDenormal())
15574 return true;
15575
15576 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
15577 return Mode == DenormalMode::getIEEE();
15578 }
15579
15580 if (MaxDepth == 0)
15581 return false;
15582
15583 switch (Opcode) {
15584 case AMDGPU::G_FADD:
15585 case AMDGPU::G_FSUB:
15586 case AMDGPU::G_FMUL:
15587 case AMDGPU::G_FCEIL:
15588 case AMDGPU::G_FFLOOR:
15589 case AMDGPU::G_FRINT:
15590 case AMDGPU::G_FNEARBYINT:
15591 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15592 case AMDGPU::G_INTRINSIC_TRUNC:
15593 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15594 case AMDGPU::G_FMA:
15595 case AMDGPU::G_FMAD:
15596 case AMDGPU::G_FSQRT:
15597 case AMDGPU::G_FDIV:
15598 case AMDGPU::G_FREM:
15599 case AMDGPU::G_FPOW:
15600 case AMDGPU::G_FPEXT:
15601 case AMDGPU::G_FLOG:
15602 case AMDGPU::G_FLOG2:
15603 case AMDGPU::G_FLOG10:
15604 case AMDGPU::G_FPTRUNC:
15605 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15606 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15607 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15608 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15609 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15610 return true;
15611 case AMDGPU::G_FNEG:
15612 case AMDGPU::G_FABS:
15613 case AMDGPU::G_FCOPYSIGN:
15614 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
15615 case AMDGPU::G_FMINNUM:
15616 case AMDGPU::G_FMAXNUM:
15617 case AMDGPU::G_FMINNUM_IEEE:
15618 case AMDGPU::G_FMAXNUM_IEEE:
15619 case AMDGPU::G_FMINIMUM:
15620 case AMDGPU::G_FMAXIMUM:
15621 case AMDGPU::G_FMINIMUMNUM:
15622 case AMDGPU::G_FMAXIMUMNUM: {
15623 if (Subtarget->supportsMinMaxDenormModes() ||
15624 // FIXME: denormalsEnabledForType is broken for dynamic
15625 denormalsEnabledForType(MRI.getType(Reg), MF))
15626 return true;
15627
15628 [[fallthrough]];
15629 }
15630 case AMDGPU::G_BUILD_VECTOR:
15631 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
15632 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
15633 return false;
15634 return true;
15635 case AMDGPU::G_INTRINSIC:
15636 case AMDGPU::G_INTRINSIC_CONVERGENT:
15637 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15638 case Intrinsic::amdgcn_fmul_legacy:
15639 case Intrinsic::amdgcn_fmad_ftz:
15640 case Intrinsic::amdgcn_sqrt:
15641 case Intrinsic::amdgcn_fmed3:
15642 case Intrinsic::amdgcn_sin:
15643 case Intrinsic::amdgcn_cos:
15644 case Intrinsic::amdgcn_log:
15645 case Intrinsic::amdgcn_exp2:
15646 case Intrinsic::amdgcn_log_clamp:
15647 case Intrinsic::amdgcn_rcp:
15648 case Intrinsic::amdgcn_rcp_legacy:
15649 case Intrinsic::amdgcn_rsq:
15650 case Intrinsic::amdgcn_rsq_clamp:
15651 case Intrinsic::amdgcn_rsq_legacy:
15652 case Intrinsic::amdgcn_div_scale:
15653 case Intrinsic::amdgcn_div_fmas:
15654 case Intrinsic::amdgcn_div_fixup:
15655 case Intrinsic::amdgcn_fract:
15656 case Intrinsic::amdgcn_cvt_pkrtz:
15657 case Intrinsic::amdgcn_cubeid:
15658 case Intrinsic::amdgcn_cubema:
15659 case Intrinsic::amdgcn_cubesc:
15660 case Intrinsic::amdgcn_cubetc:
15661 case Intrinsic::amdgcn_frexp_mant:
15662 case Intrinsic::amdgcn_fdot2:
15663 case Intrinsic::amdgcn_trig_preop:
15664 case Intrinsic::amdgcn_tanh:
15665 return true;
15666 default:
15667 break;
15668 }
15669
15670 [[fallthrough]];
15671 default:
15672 return false;
15673 }
15674
15675 llvm_unreachable("invalid operation");
15676}
15677
15678// Constant fold canonicalize.
15679SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15680 const SDLoc &SL, EVT VT,
15681 const APFloat &C) const {
15682 // Flush denormals to 0 if not enabled.
15683 if (C.isDenormal()) {
15684 DenormalMode Mode =
15685 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15686 if (Mode == DenormalMode::getPreserveSign()) {
15687 return DAG.getConstantFP(
15688 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15689 }
15690
15691 if (Mode != DenormalMode::getIEEE())
15692 return SDValue();
15693 }
15694
15695 if (C.isNaN()) {
15696 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15697 if (C.isSignaling()) {
15698 // Quiet a signaling NaN.
15699 // FIXME: Is this supposed to preserve payload bits?
15700 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15701 }
15702
15703 // Make sure it is the canonical NaN bitpattern.
15704 //
15705 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15706 // immediate?
15707 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15708 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15709 }
15710
15711 // Already canonical.
15712 return DAG.getConstantFP(C, SL, VT);
15713}
15714
15715 static bool vectorEltWillFoldAway(SDValue Op) {
15716 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
15717}
15718
15719SDValue
15720SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15721 DAGCombinerInfo &DCI) const {
15722 SelectionDAG &DAG = DCI.DAG;
15723 SDValue N0 = N->getOperand(0);
15724 EVT VT = N->getValueType(0);
15725
15726 // fcanonicalize undef -> qnan
15727 if (N0.isUndef()) {
15728 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
15729 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
15730 }
15731
15732 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
15733 EVT VT = N->getValueType(0);
15734 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15735 }
15736
15737 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15738 // (fcanonicalize k)
15739 //
15740 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15741
15742 // TODO: This could be better with wider vectors that will be split to v2f16,
15743 // and to consider uses since there aren't that many packed operations.
15744 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15745 isTypeLegal(MVT::v2f16)) {
15746 SDLoc SL(N);
15747 SDValue NewElts[2];
15748 SDValue Lo = N0.getOperand(0);
15749 SDValue Hi = N0.getOperand(1);
15750 EVT EltVT = Lo.getValueType();
15751
15753 for (unsigned I = 0; I != 2; ++I) {
15754 SDValue Op = N0.getOperand(I);
15755 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15756 NewElts[I] =
15757 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15758 } else if (Op.isUndef()) {
15759 // Handled below based on what the other operand is.
15760 NewElts[I] = Op;
15761 } else {
15762 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
15763 }
15764 }
15765
15766 // If one half is undef, and one is constant, prefer a splat vector rather
15767 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15768 // cheaper to use and may be free with a packed operation.
15769 if (NewElts[0].isUndef()) {
15770 if (isa<ConstantFPSDNode>(NewElts[1]))
15771 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
15772 ? NewElts[1]
15773 : DAG.getConstantFP(0.0f, SL, EltVT);
15774 }
15775
15776 if (NewElts[1].isUndef()) {
15777 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
15778 ? NewElts[0]
15779 : DAG.getConstantFP(0.0f, SL, EltVT);
15780 }
15781
15782 return DAG.getBuildVector(VT, SL, NewElts);
15783 }
15784 }
15785
15786 return SDValue();
15787}
15788
15789static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15790 switch (Opc) {
15791 case ISD::FMAXNUM:
15792 case ISD::FMAXNUM_IEEE:
15793 case ISD::FMAXIMUMNUM:
15794 return AMDGPUISD::FMAX3;
15795 case ISD::FMAXIMUM:
15796 return AMDGPUISD::FMAXIMUM3;
15797 case ISD::SMAX:
15798 return AMDGPUISD::SMAX3;
15799 case ISD::UMAX:
15800 return AMDGPUISD::UMAX3;
15801 case ISD::FMINNUM:
15802 case ISD::FMINNUM_IEEE:
15803 case ISD::FMINIMUMNUM:
15804 return AMDGPUISD::FMIN3;
15805 case ISD::FMINIMUM:
15806 return AMDGPUISD::FMINIMUM3;
15807 case ISD::SMIN:
15808 return AMDGPUISD::SMIN3;
15809 case ISD::UMIN:
15810 return AMDGPUISD::UMIN3;
15811 default:
15812 llvm_unreachable("Not a min/max opcode");
15813 }
15814}
15815
15816SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15817 const SDLoc &SL, SDValue Src,
15818 SDValue MinVal,
15819 SDValue MaxVal,
15820 bool Signed) const {
15821
15822 // med3 comes from
15823 // min(max(x, K0), K1), K0 < K1
15824 // max(min(x, K0), K1), K1 < K0
15825 //
15826 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15827 // min/max op.
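// E.g. smin(smax(x, -5), 10) arrives here as Src = x, MaxVal = -5,
// MinVal = 10 and becomes med3(x, -5, 10), clamping x to [-5, 10].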
15828 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15829 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15830
15831 if (!MinK || !MaxK)
15832 return SDValue();
15833
15834 if (Signed) {
15835 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15836 return SDValue();
15837 } else {
15838 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15839 return SDValue();
15840 }
15841
15842 EVT VT = MinK->getValueType(0);
15843 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15844 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15845 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15846
15847 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15848 // not available, but this is unlikely to be profitable as constants
15849 // will often need to be materialized & extended, especially on
15850 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15851 return SDValue();
15852}
15853
15854 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15855 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
15856 return C;
15857
15858 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
15859 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15860 return C;
15861 }
15862
15863 return nullptr;
15864}
15865
15866SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15867 const SDLoc &SL, SDValue Op0,
15868 SDValue Op1,
15869 bool IsKnownNoNaNs) const {
15870 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
15871 if (!K1)
15872 return SDValue();
15873
15874 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15875 if (!K0)
15876 return SDValue();
15877
15878 // Ordered >= (although NaN inputs should have folded away by now).
15879 if (K0->getValueAPF() > K1->getValueAPF())
15880 return SDValue();
15881
15882 // med3 with a nan input acts like
15883 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15884 //
15885 // So the result depends on whether the IEEE mode bit is enabled or not with a
15886 // signaling nan input.
15887 // ieee=1
15888 // s0 snan: yields s2
15889 // s1 snan: yields s2
15890 // s2 snan: qnan
15891
15892 // s0 qnan: min(s1, s2)
15893 // s1 qnan: min(s0, s2)
15894 // s2 qnan: min(s0, s1)
15895
15896 // ieee=0
15897 // s0 snan: min(s1, s2)
15898 // s1 snan: min(s0, s2)
15899 // s2 snan: qnan
15900
15901 // s0 qnan: min(s1, s2)
15902 // s1 qnan: min(s0, s2)
15903 // s2 qnan: min(s0, s1)
15904 const MachineFunction &MF = DAG.getMachineFunction();
15905 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15906
15907 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15908 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15909 // can only form it when op0 is fmaxnum_ieee and IEEE=1.
15910 EVT VT = Op0.getValueType();
15911 if (Info->getMode().DX10Clamp) {
15912 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15913 // hardware fmed3 behavior converting to a min.
15914 // FIXME: Should this be allowing -0.0?
15915 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15916 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15917 }
15918
15919 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15920 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15921 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15922 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15923 // then give the other result, which is different from med3 with a NaN
15924 // input.
15925 SDValue Var = Op0.getOperand(0);
15926 if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Var))
15927 return SDValue();
15928
15929 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15930
15931 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15932 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15933 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15934 SDValue(K0, 0), SDValue(K1, 0));
15935 }
15936 }
15937
15938 return SDValue();
15939}
15940
15941/// \return true if the subtarget supports minimum3 and maximum3 with the given
15942/// base min/max opcode \p Opc for type \p VT.
15943static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15944 EVT VT) {
15945 switch (Opc) {
15946 case ISD::FMINNUM:
15947 case ISD::FMAXNUM:
15948 case ISD::FMINNUM_IEEE:
15949 case ISD::FMAXNUM_IEEE:
15950 case ISD::FMINIMUMNUM:
15951 case ISD::FMAXIMUMNUM:
15952 case AMDGPUISD::FMIN_LEGACY:
15953 case AMDGPUISD::FMAX_LEGACY:
15954 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15955 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15956 case ISD::FMINIMUM:
15957 case ISD::FMAXIMUM:
15958 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15959 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15960 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15961 case ISD::SMAX:
15962 case ISD::SMIN:
15963 case ISD::UMAX:
15964 case ISD::UMIN:
15965 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15966 default:
15967 return false;
15968 }
15969
15970 llvm_unreachable("not a min/max opcode");
15971}
15972
15973SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15974 DAGCombinerInfo &DCI) const {
15975 SelectionDAG &DAG = DCI.DAG;
15976
15977 EVT VT = N->getValueType(0);
15978 unsigned Opc = N->getOpcode();
15979 SDValue Op0 = N->getOperand(0);
15980 SDValue Op1 = N->getOperand(1);
15981
15982 // Only do this if the inner op has one use since this will just increase
15983 // register pressure for no benefit.
15984
15985 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15986 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
15987 return Op.getOperand(0).getOpcode() == Opc &&
15988 Op.getOperand(1).getOpcode() == Opc &&
15989 (Op.getOperand(0).hasOneUse() || Op.getOperand(1).hasOneUse());
15990 };
15991
15992 // Tree reduction: when both operands are the same min/max op, restructure
15993 // to keep a 2-op node on top so higher tree levels can still combine.
15994 //
15995 // max(max(a, b), max(c, d)) -> max(max3(a, b, c), d)
15996 // min(min(a, b), min(c, d)) -> min(min3(a, b, c), d)
15997 //
15998 // Defer when either inner op is a tree node with combinable children.
15999 if (Op0.getOpcode() == Opc && Op0.hasOneUse() && Op1.getOpcode() == Opc &&
16000 Op1.hasOneUse() && !IsTreeWithCombinableChildren(Op0) &&
16001 !IsTreeWithCombinableChildren(Op1)) {
16002 SDLoc DL(N);
16003       SDValue Inner =
16004           DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, VT, Op0.getOperand(0),
16005                       Op0.getOperand(1), Op1.getOperand(0));
16006 return DAG.getNode(Opc, DL, VT, Inner, Op1.getOperand(1));
16007 }
16008
16009 // max(max(a, b), c) -> max3(a, b, c)
16010 // min(min(a, b), c) -> min3(a, b, c)
16011 // Deferred when Op0 is a tree node with combinable children.
16012 if (Op0.getOpcode() == Opc && Op0.hasOneUse() &&
16013 !IsTreeWithCombinableChildren(Op0)) {
16014 SDLoc DL(N);
16015 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16016 Op0.getOperand(0), Op0.getOperand(1), Op1);
16017 }
16018
16019 // Try commuted.
16020 // max(a, max(b, c)) -> max3(a, b, c)
16021 // min(a, min(b, c)) -> min3(a, b, c)
16022 // Deferred when Op1 is a tree node with combinable children.
16023 if (Op1.getOpcode() == Opc && Op1.hasOneUse() &&
16024 !IsTreeWithCombinableChildren(Op1)) {
16025 SDLoc DL(N);
16026 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16027 Op0, Op1.getOperand(0), Op1.getOperand(1));
16028 }
16029 }
16030
16031 // umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be not 0 or -1.
16032 SDValue FfbhSrc;
16033 uint64_t Clamp = 0;
16034 if (Opc == ISD::UMIN &&
16035       sd_match(Op0,
16036                m_OneUse(m_Node(AMDGPUISD::FFBH_I32, m_Value(FfbhSrc)))) &&
16037       sd_match(Op1, m_ConstInt(Clamp))) {
16038 unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits();
16039 if (Clamp >= BitWidth) {
16040 KnownBits Known = DAG.computeKnownBits(FfbhSrc);
16041 if (Known.isNonZero() && !Known.isAllOnes())
16042 return Op0;
16043 }
16044 }
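  // Illustrative example of the fold above (hypothetical values, not from the
  // original source): FFBH_I32 returns -1 for the inputs 0 and -1, which as an
  // unsigned value is never below a clamp >= the bit width, so
  //   umin (ffbh_i32 %x), 32
  // folds to (ffbh_i32 %x) once %x is known to be neither zero nor all-ones.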
16045
16046 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
16047 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
16048 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
16049 if (SDValue Med3 = performIntMed3ImmCombine(
16050 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
16051 return Med3;
16052 }
16053 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
16054 if (SDValue Med3 = performIntMed3ImmCombine(
16055 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
16056 return Med3;
16057 }
16058
16059 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
16060 if (SDValue Med3 = performIntMed3ImmCombine(
16061 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
16062 return Med3;
16063 }
16064 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
16065 if (SDValue Med3 = performIntMed3ImmCombine(
16066 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
16067 return Med3;
16068 }
16069
16070 // if !is_snan(x):
16071 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16072 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16073 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16074 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16075   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
16076        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
16077        (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
16078        (Opc == AMDGPUISD::FMIN_LEGACY &&
16079 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16080 (VT == MVT::f32 || VT == MVT::f64 ||
16081 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16082 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16083 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16084 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16085 Op0.hasOneUse()) {
16086 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
16087 N->getFlags().hasNoNaNs()))
16088 return Res;
16089 }
16090
16091   // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
16092   // for some types, but at a higher cost since they're implemented with a
16093   // 3-operand form.
16094 const SDNodeFlags Flags = N->getFlags();
16095 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
16096       !Subtarget->hasIEEEMinimumMaximumInsts() &&
16097       isOperationLegal(ISD::FMINNUM_IEEE, VT)) {
16098     unsigned NewOpc =
16099         Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
16100     return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
16101 }
16102
16103 return SDValue();
16104}
16105
16106 static bool isClampZeroToOne(SDValue A, SDValue B) {
16107   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
16108     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
16109       // FIXME: Should this be allowing -0.0?
16110 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
16111 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
16112 }
16113 }
16114
16115 return false;
16116}
16117
16118// FIXME: Should only worry about snans for version with chain.
16119SDValue SITargetLowering::performFMed3Combine(SDNode *N,
16120 DAGCombinerInfo &DCI) const {
16121 EVT VT = N->getValueType(0);
16122 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
16123 // NaNs. With a NaN input, the order of the operands may change the result.
16124
16125 SelectionDAG &DAG = DCI.DAG;
16126 SDLoc SL(N);
16127
16128 SDValue Src0 = N->getOperand(0);
16129 SDValue Src1 = N->getOperand(1);
16130 SDValue Src2 = N->getOperand(2);
16131
16132 if (isClampZeroToOne(Src0, Src1)) {
16133     // const_a, const_b, x -> clamp is safe in all cases including signaling
16134     // NaNs.
16135 // FIXME: Should this be allowing -0.0?
16136 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16137 }
16138
16139 const MachineFunction &MF = DAG.getMachineFunction();
16140 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16141
16142 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
16143 // handling no dx10-clamp?
16144 if (Info->getMode().DX10Clamp) {
16145     // If NaNs are clamped to 0, we are free to reorder the inputs.
16146
16147 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16148 std::swap(Src0, Src1);
16149
16150 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
16151 std::swap(Src1, Src2);
16152
16153 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16154 std::swap(Src0, Src1);
16155
16156 if (isClampZeroToOne(Src1, Src2))
16157 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16158 }
16159
16160 return SDValue();
16161}
16162
16163SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
16164 DAGCombinerInfo &DCI) const {
16165 SDValue Src0 = N->getOperand(0);
16166 SDValue Src1 = N->getOperand(1);
16167 if (Src0.isUndef() && Src1.isUndef())
16168 return DCI.DAG.getUNDEF(N->getValueType(0));
16169 return SDValue();
16170}
16171
16172// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
16173// expanded into a set of cmp/select instructions.
16174 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
16175                                                 unsigned NumElem,
16176 bool IsDivergentIdx,
16177 const GCNSubtarget *Subtarget) {
16178   if (UseDivergentRegisterIndexing)
16179     return false;
16180
16181 unsigned VecSize = EltSize * NumElem;
16182
16183   // Sub-dword vectors totaling 2 dwords or less have a better implementation.
16184 if (VecSize <= 64 && EltSize < 32)
16185 return false;
16186
16187   // Always expand the remaining sub-dword vectors; otherwise they will be
16188   // lowered via memory.
16189 if (EltSize < 32)
16190 return true;
16191
16192 // Always do this if var-idx is divergent, otherwise it will become a loop.
16193 if (IsDivergentIdx)
16194 return true;
16195
16196 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
16197 unsigned NumInsts = NumElem /* Number of compares */ +
16198 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
16199
16200 // On some architectures (GFX9) movrel is not available and it's better
16201 // to expand.
16202 if (Subtarget->useVGPRIndexMode())
16203 return NumInsts <= 16;
16204
16205 // If movrel is available, use it instead of expanding for vector of 8
16206 // elements.
16207 if (Subtarget->hasMovrel())
16208 return NumInsts <= 15;
16209
16210 return true;
16211}
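// Worked estimate for the instruction count above (hypothetical sizes, not
// from the original source): for a uniform index into v8f32, EltSize = 32 and
// NumElem = 8 give
//   NumInsts = 8 /*compares*/ + ((32 + 31) / 32) * 8 /*cndmasks*/ = 16,
// which is accepted in VGPR-index mode (16 <= 16) but rejected when only
// movrel is available (16 > 15).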
16212
16213 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
16214   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
16215 if (isa<ConstantSDNode>(Idx))
16216 return false;
16217
16218 SDValue Vec = N->getOperand(0);
16219 EVT VecVT = Vec.getValueType();
16220 EVT EltVT = VecVT.getVectorElementType();
16221 unsigned EltSize = EltVT.getSizeInBits();
16222 unsigned NumElem = VecVT.getVectorNumElements();
16223
16224   return SITargetLowering::shouldExpandVectorDynExt(
16225       EltSize, NumElem, Idx->isDivergent(), getSubtarget());
16226}
16227
16228SDValue
16229SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16230 DAGCombinerInfo &DCI) const {
16231 SDValue Vec = N->getOperand(0);
16232 SelectionDAG &DAG = DCI.DAG;
16233
16234 EVT VecVT = Vec.getValueType();
16235 EVT VecEltVT = VecVT.getVectorElementType();
16236 EVT ResVT = N->getValueType(0);
16237
16238 unsigned VecSize = VecVT.getSizeInBits();
16239 unsigned VecEltSize = VecEltVT.getSizeInBits();
16240
16241 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
16242       allUsesHaveSourceMods(N)) {
16243     SDLoc SL(N);
16244 SDValue Idx = N->getOperand(1);
16245 SDValue Elt =
16246 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
16247 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
16248 }
16249
16250 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
16251 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
16252 // There are optimisations to transform 64-bit shifts into 32-bit shifts
16253 // depending on the shift operand. See e.g. performSraCombine().
16254 // This combine ensures that the optimisation is compatible with v2i32
16255 // legalised AND.
16256 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
16257 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
16258
16259     auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
16260     if (!C || C->getZExtValue() != 0x1f)
16261 return SDValue();
16262
16263 SDLoc SL(N);
16264 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
16265 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
16266 Vec->getOperand(0), N->getOperand(1));
16267 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
16268 DAG.ReplaceAllUsesWith(N, A.getNode());
16269 }
16270
16271 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
16272 // =>
16273 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
16274 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
16275 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
16276 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16277 SDLoc SL(N);
16278 SDValue Idx = N->getOperand(1);
16279 unsigned Opc = Vec.getOpcode();
16280
16281 switch (Opc) {
16282 default:
16283 break;
16284 // TODO: Support other binary operations.
16285 case ISD::FADD:
16286 case ISD::FSUB:
16287 case ISD::FMUL:
16288 case ISD::ADD:
16289 case ISD::UMIN:
16290 case ISD::UMAX:
16291 case ISD::SMIN:
16292 case ISD::SMAX:
16293 case ISD::FMAXNUM:
16294 case ISD::FMINNUM:
16295 case ISD::FMAXNUM_IEEE:
16296 case ISD::FMINNUM_IEEE:
16297 case ISD::FMAXIMUM:
16298 case ISD::FMINIMUM: {
16299 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16300 Vec.getOperand(0), Idx);
16301 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16302 Vec.getOperand(1), Idx);
16303
16304 DCI.AddToWorklist(Elt0.getNode());
16305 DCI.AddToWorklist(Elt1.getNode());
16306 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
16307 }
16308 }
16309 }
16310
16311 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
16312   if (shouldExpandVectorDynExt(N)) {
16313     SDLoc SL(N);
16314 SDValue Idx = N->getOperand(1);
16315 SDValue V;
16316 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16317 SDValue IC = DAG.getVectorIdxConstant(I, SL);
16318 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
16319 if (I == 0)
16320 V = Elt;
16321 else
16322 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
16323 }
16324 return V;
16325 }
16326
16327 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
16328 // =>
16329 // i32:Lo(k) if Idx == 0, or
16330 // i32:Hi(k) if Idx == 1
16331 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
16332 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
16333 SDLoc SL(N);
16334 SDValue PeekThrough = Vec.getOperand(0);
16335 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
16336 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16337 uint64_t KImmValue = KImm->getZExtValue();
16338 return DAG.getConstant(
16339 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16340 }
16341 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
16342 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16343 uint64_t KFPImmValue =
16344 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16345 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16346 0xffffffff,
16347 SL, MVT::i32);
16348 }
16349 }
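  // Illustrative example of the bitcast-constant fold above (hypothetical
  // constant, not from the original source):
  //   extract_vector_elt (v2i32 bitcast (i64 0x1111111122222222)), 0
  //     --> i32 0x22222222  // low half
  //   extract_vector_elt (v2i32 bitcast (i64 0x1111111122222222)), 1
  //     --> i32 0x11111111  // high half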
16350
16351 if (!DCI.isBeforeLegalize())
16352 return SDValue();
16353
16354 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
16355 // elements. This exposes more load reduction opportunities by replacing
16356 // multiple small extract_vector_elements with a single 32-bit extract.
16357 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
16358 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16359 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
16360
16361 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16362 unsigned EltIdx = BitIndex / 32;
16363 unsigned LeftoverBitIdx = BitIndex % 32;
16364 SDLoc SL(N);
16365
16366 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
16367 DCI.AddToWorklist(Cast.getNode());
16368
16369 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
16370 DAG.getConstant(EltIdx, SL, MVT::i32));
16371 DCI.AddToWorklist(Elt.getNode());
16372 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
16373 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
16374 DCI.AddToWorklist(Srl.getNode());
16375
16376 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
16377 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
16378 DCI.AddToWorklist(Trunc.getNode());
16379
16380 if (VecEltVT == ResVT) {
16381 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
16382 }
16383
16384 assert(ResVT.isScalarInteger());
16385 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
16386 }
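  // Illustrative example of the sub-dword combine above (hypothetical types,
  // not from the original source): extracting byte 5 of a loaded v8i8 becomes
  // a 32-bit extract plus a shift:
  //   BitIndex = 5 * 8 = 40, EltIdx = 1, LeftoverBitIdx = 8
  //   extract_vector_elt v8i8:x, 5
  //     --> trunc i8 (srl (extract_vector_elt (v2i32 bitcast x), 1), 8)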
16387
16388 return SDValue();
16389}
16390
16391SDValue
16392SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16393 DAGCombinerInfo &DCI) const {
16394 SDValue Vec = N->getOperand(0);
16395 SDValue Idx = N->getOperand(2);
16396 EVT VecVT = Vec.getValueType();
16397 EVT EltVT = VecVT.getVectorElementType();
16398
16399 // INSERT_VECTOR_ELT (<n x e>, var-idx)
16400 // => BUILD_VECTOR n x select (e, const-idx)
16401   if (!shouldExpandVectorDynExt(N))
16402     return SDValue();
16403
16404 SelectionDAG &DAG = DCI.DAG;
16405 SDLoc SL(N);
16406 SDValue Ins = N->getOperand(1);
16407 EVT IdxVT = Idx.getValueType();
16408 
16409   SmallVector<SDValue, 16> Ops;
16410 
16410 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16411 SDValue IC = DAG.getConstant(I, SL, IdxVT);
16412 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
16413 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
16414 Ops.push_back(V);
16415 }
16416
16417 return DAG.getBuildVector(VecVT, SL, Ops);
16418}
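// Illustrative expansion of the combine above (hypothetical types, not from
// the original source): for a divergent index into v4f32,
//   insert_vector_elt v, val, idx
//     --> build_vector (select (idx == 0), val, v[0]), ...,
//                      (select (idx == 3), val, v[3])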
16419
16420/// Return the source of an fp_extend from f16 to f32, or a converted FP
16421/// constant.
16422 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
16423   if (Src.getOpcode() == ISD::FP_EXTEND &&
16424 Src.getOperand(0).getValueType() == MVT::f16) {
16425 return Src.getOperand(0);
16426 }
16427
16428 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
16429 APFloat Val = CFP->getValueAPF();
16430 bool LosesInfo = true;
16431     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
16432     if (!LosesInfo)
16433 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
16434 }
16435
16436 return SDValue();
16437}
16438
16439SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
16440 DAGCombinerInfo &DCI) const {
16441 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16442 "combine only useful on gfx8");
16443
16444 SDValue TruncSrc = N->getOperand(0);
16445 EVT VT = N->getValueType(0);
16446 if (VT != MVT::f16)
16447 return SDValue();
16448
16449 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16450 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
16451 return SDValue();
16452
16453 SelectionDAG &DAG = DCI.DAG;
16454 SDLoc SL(N);
16455
16456 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
16457 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
16458 // casting back.
16459
16460 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
16461 // fmin(fmax(a, b), fmax(fmin(a, b), c))
16462 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
16463 if (!A)
16464 return SDValue();
16465
16466 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
16467 if (!B)
16468 return SDValue();
16469
16470 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
16471 if (!C)
16472 return SDValue();
16473
16474 // This changes signaling nan behavior. If an input is a signaling nan, it
16475 // would have been quieted by the fpext originally. We don't care because
16476 // these are unconstrained ops. If we needed to insert quieting canonicalizes
16477 // we would be worse off than just doing the promotion.
16478 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
16479 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
16480 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
16481 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
16482}
16483
16484unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16485 const SDNode *N0,
16486 const SDNode *N1) const {
16487 EVT VT = N0->getValueType(0);
16488
16489 // Only do this if we are not trying to support denormals. v_mad_f32 does not
16490 // support denormals ever.
16491 if (((VT == MVT::f32 &&
16492         denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
16493        (VT == MVT::f16 && Subtarget->hasMadF16() &&
16494         denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
16495       isOperationLegal(ISD::FMAD, VT))
16496     return ISD::FMAD;
16497
16498 const TargetOptions &Options = DAG.getTarget().Options;
16499 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
16500 (N0->getFlags().hasAllowContract() &&
16501 N1->getFlags().hasAllowContract())) &&
16502       isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
16503     return ISD::FMA;
16504 }
16505
16506 return 0;
16507}
16508
16509// For a reassociatable opcode perform:
16510// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16511SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16512 SelectionDAG &DAG) const {
16513 EVT VT = N->getValueType(0);
16514 if (VT != MVT::i32 && VT != MVT::i64)
16515 return SDValue();
16516
16517 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
16518 return SDValue();
16519
16520 unsigned Opc = N->getOpcode();
16521 SDValue Op0 = N->getOperand(0);
16522 SDValue Op1 = N->getOperand(1);
16523
16524 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16525 return SDValue();
16526
16527 if (Op0->isDivergent())
16528 std::swap(Op0, Op1);
16529
16530 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16531 return SDValue();
16532
16533 SDValue Op2 = Op1.getOperand(1);
16534 Op1 = Op1.getOperand(0);
16535 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16536 return SDValue();
16537
16538 if (Op1->isDivergent())
16539 std::swap(Op1, Op2);
16540
16541 SDLoc SL(N);
16542 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16543 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16544}
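// Illustrative example of the reassociation above (hypothetical values, not
// from the original source): with uniform %a and %c but divergent %b,
//   add %a, (add %b, %c)  -->  add (add %a, %c), %b
// so the add of the two uniform values can be selected as a scalar
// s_add_u32, leaving a single VALU add for the divergent operand.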
16545
16546static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16547 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16548   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16549   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
16550 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
16551 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
16552}
16553
16554// Fold
16555// y = lshr i64 x, 32
16556// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16557// with Const.hi == -1
16558// To
16559 // res = mad_u64_u32 y.lo, Const.lo, x.lo
16560 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16561                                  SDValue MulLHS, SDValue MulRHS,
16562 SDValue AddRHS) {
16563 if (MulRHS.getOpcode() == ISD::SRL)
16564 std::swap(MulLHS, MulRHS);
16565
16566 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16567 return SDValue();
16568
16569 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16570 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16571 MulLHS.getOperand(0) != AddRHS)
16572 return SDValue();
16573
16574   ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
16575   if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16576 return SDValue();
16577
16578 SDValue ConstMul =
16579 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
16580 return getMad64_32(DAG, SL, MVT::i64,
16581 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16582 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16583}
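// Sketch of why the fold above is sound (editorial derivation, not from the
// original source): let y = x >> 32. Const.hi == -1 means
// Const == Const.lo - 2^32 (mod 2^64), so
//   y * Const + x == y * Const.lo - 2^32 * y + x
//                 == y * Const.lo + x.lo          (mod 2^64)
// because 2^32 * y == x - x.lo. That is exactly
// mad_u64_u32(y.lo, Const.lo, zext(x.lo)), with y.lo == y since the shift
// clears the high half.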
16584
16585// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16586// multiplies, if any.
16587//
16588// Full 64-bit multiplies that feed into an addition are lowered here instead
16589// of using the generic expansion. The generic expansion ends up with
16590// a tree of ADD nodes that prevents us from using the "add" part of the
16591// MAD instruction. The expansion produced here results in a chain of ADDs
16592// instead of a tree.
16593SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16594 DAGCombinerInfo &DCI) const {
16595 assert(N->isAnyAdd());
16596
16597 SelectionDAG &DAG = DCI.DAG;
16598 EVT VT = N->getValueType(0);
16599 SDLoc SL(N);
16600 SDValue LHS = N->getOperand(0);
16601 SDValue RHS = N->getOperand(1);
16602
16603 if (VT.isVector())
16604 return SDValue();
16605
16606 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16607 // result in scalar registers for uniform values.
16608 if (!N->isDivergent() && Subtarget->hasSMulHi())
16609 return SDValue();
16610
16611 unsigned NumBits = VT.getScalarSizeInBits();
16612 if (NumBits <= 32 || NumBits > 64)
16613 return SDValue();
16614
16615 if (LHS.getOpcode() != ISD::MUL) {
16616 assert(RHS.getOpcode() == ISD::MUL);
16617 std::swap(LHS, RHS);
16618 }
16619
16620 // Avoid the fold if it would unduly increase the number of multiplies due to
16621 // multiple uses, except on hardware with full-rate multiply-add (which is
16622 // part of full-rate 64-bit ops).
16623 if (!Subtarget->hasFullRate64Ops()) {
16624 unsigned NumUsers = 0;
16625 for (SDNode *User : LHS->users()) {
16626 // There is a use that does not feed into addition, so the multiply can't
16627 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16628 if (!User->isAnyAdd())
16629 return SDValue();
16630
16631 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16632 // MUL + 3xADD + 3xADDC over 3xMAD.
16633 ++NumUsers;
16634 if (NumUsers >= 3)
16635 return SDValue();
16636 }
16637 }
16638
16639 SDValue MulLHS = LHS.getOperand(0);
16640 SDValue MulRHS = LHS.getOperand(1);
16641 SDValue AddRHS = RHS;
16642
16643 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16644 return FoldedMAD;
16645
16646 // Always check whether operands are small unsigned values, since that
16647 // knowledge is useful in more cases. Check for small signed values only if
16648 // doing so can unlock a shorter code sequence.
16649 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
16650 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
16651
16652 bool MulSignedLo = false;
16653 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16654 MulSignedLo =
16655 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
16656 }
16657
16658 // The operands and final result all have the same number of bits. If
16659 // operands need to be extended, they can be extended with garbage. The
16660 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16661 // truncated away in the end.
16662 if (VT != MVT::i64) {
16663 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
16664 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
16665 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
16666 }
16667
16668 // The basic code generated is conceptually straightforward. Pseudo code:
16669 //
16670 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16671 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16672 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16673 //
16674 // The second and third lines are optional, depending on whether the factors
16675 // are {sign,zero}-extended or not.
16676 //
16677 // The actual DAG is noisier than the pseudo code, but only due to
16678 // instructions that disassemble values into low and high parts, and
16679 // assemble the final result.
16680 SDValue One = DAG.getConstant(1, SL, MVT::i32);
16681
16682 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
16683 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
16684 SDValue Accum =
16685 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16686
16687 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16688 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16689
16690 if (!MulLHSUnsigned32) {
16691 auto MulLHSHi =
16692 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
16693 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
16694 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16695 }
16696
16697 if (!MulRHSUnsigned32) {
16698 auto MulRHSHi =
16699 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
16700 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
16701 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16702 }
16703
16704 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
16705 Accum = DAG.getBitcast(MVT::i64, Accum);
16706 }
16707
16708 if (VT != MVT::i64)
16709 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
16710 return Accum;
16711}
16712
16713SDValue
16714SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16715 DAGCombinerInfo &DCI) const {
16716 SDValue RHS = N->getOperand(1);
16717 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16718 if (!CRHS)
16719 return SDValue();
16720
16721 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16722 // common.
16723 uint64_t Val = CRHS->getZExtValue();
16724 if (countr_zero(Val) >= 32) {
16725 SelectionDAG &DAG = DCI.DAG;
16726 SDLoc SL(N);
16727 SDValue LHS = N->getOperand(0);
16728
16729 // Avoid carry machinery if we know the low half of the add does not
16730 // contribute to the final result.
16731 //
16732 // add i64:x, K if computeTrailingZeros(K) >= 32
16733 // => build_pair (add x.hi, K.hi), x.lo
16734
16735 // Breaking the 64-bit add here with this strange constant is unlikely
16736 // to interfere with addressing mode patterns.
16737
16738 SDValue Hi = getHiHalf64(LHS, DAG);
16739 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16740 unsigned Opcode = N->getOpcode();
16741 if (Opcode == ISD::PTRADD)
16742 Opcode = ISD::ADD;
16743 SDValue AddHi =
16744 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16745
16746 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16747 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16748 }
16749
16750 return SDValue();
16751}
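// Illustrative example of the fold above (hypothetical constant, not from the
// original source):
//   add i64 %x, 0x500000000        // countr_zero(K) = 32
//     --> build_pair (trunc %x to i32), (add (hi %x), 5)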
16752
16753 // Collect the ultimate src of each of the mul node's operands, and confirm
16754 // each operand is 8 bits wide (a single byte).
16755static std::optional<ByteProvider<SDValue>>
16756handleMulOperand(const SDValue &MulOperand) {
16757 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16758 if (!Byte0 || Byte0->isConstantZero()) {
16759 return std::nullopt;
16760 }
16761 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16762 if (Byte1 && !Byte1->isConstantZero()) {
16763 return std::nullopt;
16764 }
16765 return Byte0;
16766}
16767
16768static unsigned addPermMasks(unsigned First, unsigned Second) {
16769 unsigned FirstCs = First & 0x0c0c0c0c;
16770 unsigned SecondCs = Second & 0x0c0c0c0c;
16771 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16772 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16773
16774 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16775 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16776 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16777 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16778
16779 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16780}
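// Illustrative example of the mask merge above (hypothetical masks, not from
// the original source): 0x0c selects a constant zero byte, and the asserts
// require each byte position to be 0x0c in at least one input. For instance,
//   addPermMasks(0x0c0c0c02, 0x0c0c030c) == 0x0c0c0302
// the non-0x0c selectors of both inputs are merged, and a byte stays 0x0c
// only where both inputs have 0x0c.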
16781
16782 struct DotSrc {
16783   SDValue SrcOp;
16784   int64_t PermMask;
16785   int64_t DWordOffset;
16786 };
16787
16788 static void placeSources(ByteProvider<SDValue> &Src0,
16789                          ByteProvider<SDValue> &Src1,
16790                          SmallVectorImpl<DotSrc> &Src0s,
16791                          SmallVectorImpl<DotSrc> &Src1s, int Step) {
16792
16793 assert(Src0.Src.has_value() && Src1.Src.has_value());
16794 // Src0s and Src1s are empty, just place arbitrarily.
16795 if (Step == 0) {
16796 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16797 Src0.SrcOffset / 4});
16798 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16799 Src1.SrcOffset / 4});
16800 return;
16801 }
16802
16803 for (int BPI = 0; BPI < 2; BPI++) {
16804 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16805 if (BPI == 1) {
16806 BPP = {Src1, Src0};
16807 }
16808 unsigned ZeroMask = 0x0c0c0c0c;
16809 unsigned FMask = 0xFF << (8 * (3 - Step));
16810
16811 unsigned FirstMask =
16812 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16813 unsigned SecondMask =
16814 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16815     // Attempt to find a Src vector which contains our SDValue; if found, add
16816     // our perm mask to the existing one. If we are unable to find a match for
16817     // the first SDValue, attempt to find a match for the second.
16818 int FirstGroup = -1;
16819 for (int I = 0; I < 2; I++) {
16820 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16821 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16822 return IterElt.SrcOp == *BPP.first.Src &&
16823 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16824 };
16825
16826 auto *Match = llvm::find_if(Srcs, MatchesFirst);
16827 if (Match != Srcs.end()) {
16828 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16829 FirstGroup = I;
16830 break;
16831 }
16832 }
16833 if (FirstGroup != -1) {
16834 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16835 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16836 return IterElt.SrcOp == *BPP.second.Src &&
16837 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16838 };
16839 auto *Match = llvm::find_if(Srcs, MatchesSecond);
16840 if (Match != Srcs.end()) {
16841 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16842 } else
16843 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16844 return;
16845 }
16846 }
16847
16848 // If we have made it here, then we could not find a match in Src0s or Src1s
16849 // for either Src0 or Src1, so just place them arbitrarily.
16850
16851 unsigned ZeroMask = 0x0c0c0c0c;
16852 unsigned FMask = 0xFF << (8 * (3 - Step));
16853
16854 Src0s.push_back(
16855 {*Src0.Src,
16856 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16857 Src0.SrcOffset / 4});
16858 Src1s.push_back(
16859 {*Src1.Src,
16860 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16861 Src1.SrcOffset / 4});
16862}
16863
16864 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
16865                               SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16866 bool IsAny) {
16867
16868   // If we have just one source, permute it accordingly.
16869 if (Srcs.size() == 1) {
16870 auto *Elt = Srcs.begin();
16871 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
16872
16873 // v_perm will produce the original value
16874 if (Elt->PermMask == 0x3020100)
16875 return EltOp;
16876
16877 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16878 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
16879 }
16880
16881 auto *FirstElt = Srcs.begin();
16882 auto *SecondElt = std::next(FirstElt);
16883
16884   SmallVector<SDValue, 2> Perms;
16885 
16886 // If we have multiple sources in the chain, combine them via perms (using
16887 // calculated perm mask) and Ors.
16888 while (true) {
16889 auto FirstMask = FirstElt->PermMask;
16890 auto SecondMask = SecondElt->PermMask;
16891
16892 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16893 unsigned FirstPlusFour = FirstMask | 0x04040404;
16894     // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
16895     // original 0x0C.
16896 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16897
16898 auto PermMask = addPermMasks(FirstMask, SecondMask);
16899 auto FirstVal =
16900 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16901 auto SecondVal =
16902 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
16903
16904 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
16905 SecondVal,
16906 DAG.getConstant(PermMask, SL, MVT::i32)));
16907
16908 FirstElt = std::next(SecondElt);
16909 if (FirstElt == Srcs.end())
16910 break;
16911
16912 SecondElt = std::next(FirstElt);
16913 // If we only have a FirstElt, then just combine that into the cumulative
16914 // source node.
16915 if (SecondElt == Srcs.end()) {
16916 auto EltOp =
16917 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16918
16919 Perms.push_back(
16920 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16921 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16922 break;
16923 }
16924 }
16925
16926 assert(Perms.size() == 1 || Perms.size() == 2);
16927 return Perms.size() == 2
16928 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
16929 : Perms[0];
16930}
16931
16932static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16933 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16934 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16935 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16936 EntryMask += ZeroMask;
16937 }
16938}
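// Illustrative example of the mask fixup above (hypothetical mask, not from
// the original source): a chain of length 2 only populated the two most
// significant byte selectors, e.g. 0x01020c0c. Shifting right by
// (4 - 2) * 8 = 16 gives 0x00000102, and adding 0x0c0c0000 yields
// 0x0c0c0102: the live selectors move to the low bytes and the unused high
// bytes are forced to zero (0x0c).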
16939
16940static bool isMul(const SDValue Op) {
16941 auto Opcode = Op.getOpcode();
16942
16943 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16944 Opcode == AMDGPUISD::MUL_I24);
16945}
16946
16947static std::optional<bool>
16948 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16949                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16950 const SDValue &S1Op, const SelectionDAG &DAG) {
16951   // If both ops are i8s (pre legalize-dag), then the signedness semantics
16952   // of the dot4 are irrelevant.
16953 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16954 return false;
16955
16956 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16957 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16958 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16959 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16960 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16961 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16962
16963 assert(!(S0IsUnsigned && S0IsSigned));
16964 assert(!(S1IsUnsigned && S1IsSigned));
16965
16966 // There are 9 possible permutations of
16967 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16968
16969 // In two permutations, the sign bits are known to be the same for both Ops,
16970 // so simply return Signed / Unsigned corresponding to the MSB
16971
16972 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16973 return S0IsSigned;
16974
16975 // In another two permutations, the sign bits are known to be opposite. In
16976 // this case return std::nullopt to indicate a bad match.
16977
16978 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16979 return std::nullopt;
16980
16981   // In the remaining five permutations, we don't know the value of the sign
16982   // bit for at least one Op. Since we have a valid ByteProvider, we know that
16983   // the upper bits must be extension bits. Thus, the only way for the sign
16984   // bit to be unknown is if it was sign extended from an unknown value, or if
16985   // it was any extended. In either case, it is correct to use the signed
16986   // version of the signedness semantics of dot4.
16987 
16988   // In two such permutations, we know the sign bit is set for
16989   // one op, and the other is unknown. It is okay to use the signed version of
16990   // dot4.
16991 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16992 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16993 return true;
16994
16995   // In one such permutation, we don't know either of the sign bits. It is
16996   // okay to use the signed version of dot4.
16997 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16998 return true;
16999
17000   // In two such permutations, we know the sign bit is unset for
17001   // one op, and the other is unknown. Return std::nullopt to indicate a
17002   // bad match.
17003 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17004 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17005 return std::nullopt;
17006
17007 llvm_unreachable("Fully covered condition");
17008}
17009
17010SDValue SITargetLowering::performAddCombine(SDNode *N,
17011 DAGCombinerInfo &DCI) const {
17012 SelectionDAG &DAG = DCI.DAG;
17013 EVT VT = N->getValueType(0);
17014 SDLoc SL(N);
17015 SDValue LHS = N->getOperand(0);
17016 SDValue RHS = N->getOperand(1);
17017
17018 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
17019 if (Subtarget->hasMad64_32()) {
17020 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17021 return Folded;
17022 }
17023 }
17024
17025 if (SDValue V = reassociateScalarOps(N, DAG)) {
17026 return V;
17027 }
17028
17029 if (VT == MVT::i64) {
17030 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17031 return Folded;
17032 }
17033
17034 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
17035 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17036 SDValue TempNode(N, 0);
17037 std::optional<bool> IsSigned;
17038     SmallVector<DotSrc, 4> Src0s;
17039     SmallVector<DotSrc, 4> Src1s;
17040     SmallVector<SDValue, 4> Src2s;
17041 
17042 // Match the v_dot4 tree, while collecting src nodes.
17043 int ChainLength = 0;
17044 for (int I = 0; I < 4; I++) {
17045 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
17046 if (MulIdx == -1)
17047 break;
17048 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17049 if (!Src0)
17050 break;
17051 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17052 if (!Src1)
17053 break;
17054
17055 auto IterIsSigned = checkDot4MulSignedness(
17056 TempNode->getOperand(MulIdx), *Src0, *Src1,
17057 TempNode->getOperand(MulIdx)->getOperand(0),
17058 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17059 if (!IterIsSigned)
17060 break;
17061 if (!IsSigned)
17062 IsSigned = *IterIsSigned;
17063 if (*IterIsSigned != *IsSigned)
17064 break;
17065 placeSources(*Src0, *Src1, Src0s, Src1s, I);
17066 auto AddIdx = 1 - MulIdx;
17067       // Allow the special case where add (add (mul24, 0), mul24) has already
17068       // been folded into add (mul24, mul24).
17069 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
17070 Src2s.push_back(TempNode->getOperand(AddIdx));
17071 auto Src0 =
17072 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
17073 if (!Src0)
17074 break;
17075 auto Src1 =
17076 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
17077 if (!Src1)
17078 break;
17079 auto IterIsSigned = checkDot4MulSignedness(
17080 TempNode->getOperand(AddIdx), *Src0, *Src1,
17081 TempNode->getOperand(AddIdx)->getOperand(0),
17082 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17083 if (!IterIsSigned)
17084 break;
17085 assert(IsSigned);
17086 if (*IterIsSigned != *IsSigned)
17087 break;
17088 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
17089 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
17090 ChainLength = I + 2;
17091 break;
17092 }
17093
17094 TempNode = TempNode->getOperand(AddIdx);
17095 Src2s.push_back(TempNode);
17096 ChainLength = I + 1;
17097 if (TempNode->getNumOperands() < 2)
17098 break;
17099 LHS = TempNode->getOperand(0);
17100 RHS = TempNode->getOperand(1);
17101 }
17102
17103 if (ChainLength < 2)
17104 return SDValue();
17105
17106     // Masks were constructed with the assumption that we would find a chain
17107     // of length 4. If not, then we need to zero out the MSB bytes (via perm
17108     // mask of 0x0c) so they do not affect the dot calculation.
17109 if (ChainLength < 4) {
17110 fixMasks(Src0s, ChainLength);
17111 fixMasks(Src1s, ChainLength);
17112 }
17113
17114 SDValue Src0, Src1;
17115
17116     // If we are using a single source for both operands and have permuted the
17117     // bytes consistently, we can use the sources directly without permuting
17118     // (commutation).
17119 bool UseOriginalSrc = false;
17120 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17121 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17122 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17123 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17124 SmallVector<unsigned, 4> SrcBytes;
17125 auto Src0Mask = Src0s.begin()->PermMask;
17126 SrcBytes.push_back(Src0Mask & 0xFF000000);
17127 bool UniqueEntries = true;
17128 for (auto I = 1; I < 4; I++) {
17129 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17130
17131 if (is_contained(SrcBytes, NextByte)) {
17132 UniqueEntries = false;
17133 break;
17134 }
17135 SrcBytes.push_back(NextByte);
17136 }
17137
17138 if (UniqueEntries) {
17139 UseOriginalSrc = true;
17140
17141 auto *FirstElt = Src0s.begin();
17142 auto FirstEltOp =
17143 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17144
17145 auto *SecondElt = Src1s.begin();
17146 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
17147 SecondElt->DWordOffset);
17148
17149 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
17150 MVT::getIntegerVT(32));
17151 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
17152 MVT::getIntegerVT(32));
17153 }
17154 }
17155
17156 if (!UseOriginalSrc) {
17157 Src0 = resolveSources(DAG, SL, Src0s, false, true);
17158 Src1 = resolveSources(DAG, SL, Src1s, false, true);
17159 }
17160
17161 assert(IsSigned);
17162 SDValue Src2 =
17163 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17164
17165 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
17166 : Intrinsic::amdgcn_udot4,
17167 SL, MVT::i64);
17168
17169 assert(!VT.isVector());
17170 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
17171 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
17172
17173 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
17174 }
17175
17176 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17177 return SDValue();
17178
17179 // add x, zext (setcc) => uaddo_carry x, 0, setcc
17180 // add x, sext (setcc) => usubo_carry x, 0, setcc
17181   unsigned Opc = LHS.getOpcode();
17182   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
17183       Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
17184     std::swap(RHS, LHS);
17185
17186 Opc = RHS.getOpcode();
17187 switch (Opc) {
17188 default:
17189 break;
17190 case ISD::ZERO_EXTEND:
17191 case ISD::SIGN_EXTEND:
17192 case ISD::ANY_EXTEND: {
17193 auto Cond = RHS.getOperand(0);
17194 // If this won't be a real VOPC output, we would still need to insert an
17195 // extra instruction anyway.
17196 if (!isBoolSGPR(Cond))
17197 break;
17198 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17199 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17200       Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
17201       return DAG.getNode(Opc, SL, VTList, Args);
17202 }
17203 case ISD::UADDO_CARRY: {
17204 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
17205 if (!isNullConstant(RHS.getOperand(1)))
17206 break;
17207 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
17208 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
17209 }
17210 }
17211 return SDValue();
17212}
17213
17214SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
17215 DAGCombinerInfo &DCI) const {
17216 SelectionDAG &DAG = DCI.DAG;
17217 SDLoc DL(N);
17218 EVT VT = N->getValueType(0);
17219 SDValue N0 = N->getOperand(0);
17220 SDValue N1 = N->getOperand(1);
17221
17222 // The following folds transform PTRADDs into regular arithmetic in cases
17223 // where the PTRADD wouldn't be folded as an immediate offset into memory
17224 // instructions anyway. They are target-specific in that other targets might
17225 // prefer to not lose information about the pointer arithmetic.
17226
17227 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
17228 // Adapted from DAGCombiner::visitADDLikeCommutative.
17229 SDValue V, K;
17230 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
17231 SDNodeFlags ShlFlags = N1->getFlags();
17232 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
17233 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
17234 // preserved.
17235 SDNodeFlags NewShlFlags =
17236 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
17237             ? SDNodeFlags::NoSignedWrap
17238             : SDNodeFlags();
17239 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
17240 DCI.AddToWorklist(Inner.getNode());
17241 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
17242 }
17243
17244 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
17245 // performAddCombine.
17246 if (N1.getOpcode() == ISD::MUL) {
17247 if (Subtarget->hasMad64_32()) {
17248 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17249 return Folded;
17250 }
17251 }
17252
17253 // If the 32 low bits of the constant are all zero, there is nothing to fold
17254 // into an immediate offset, so it's better to eliminate the unnecessary
17255 // addition for the lower 32 bits than to preserve the PTRADD.
17256 // Analogous to a fold in performAddCombine.
17257 if (VT == MVT::i64) {
17258 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17259 return Folded;
17260 }
17261
17262 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
17263 return SDValue();
17264
17265 SDValue X = N0;
17266 SDValue Y = N1.getOperand(0);
17267 SDValue Z = N1.getOperand(1);
17268 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
17269 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
17270
17271 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17272 Y->isDivergent() != Z->isDivergent()) {
17273 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
17274 // y are uniform and z isn't.
17275 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
17276 // z are uniform and y isn't.
17277 // The goal is to push uniform operands up in the computation, so that they
17278 // can be handled with scalar operations. We can't use reassociateScalarOps
17279 // for this since it requires two identical commutative operations to
17280 // reassociate.
17281 if (Y->isDivergent())
17282 std::swap(Y, Z);
17283 // If both additions in the original were NUW, reassociation preserves that.
17284 SDNodeFlags ReassocFlags =
17285 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
17286 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
17287 DCI.AddToWorklist(UniformInner.getNode());
17288 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
17289 }
17290
17291 return SDValue();
17292}
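// Illustrative example of the reassociation above (hypothetical values, not
// from the original source): with uniform %base and %off but divergent %tid,
//   ptradd %base, (add %tid, %off)
//     --> ptradd (ptradd %base, %off), %tid
// so the uniform part of the address computation can stay in SGPRs.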
17293
17294static bool isCtlzOpc(unsigned Opc) {
17295 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
17296}
17297
17298SDValue SITargetLowering::performSubCombine(SDNode *N,
17299 DAGCombinerInfo &DCI) const {
17300 SelectionDAG &DAG = DCI.DAG;
17301 EVT VT = N->getValueType(0);
17302
17303 if (VT == MVT::i64) {
17304 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17305 return Folded;
17306 }
17307
17308 if (VT != MVT::i32)
17309 return SDValue();
17310
17311 SDLoc SL(N);
17312 SDValue LHS = N->getOperand(0);
17313 SDValue RHS = N->getOperand(1);
17314
17315 // sub x, zext (setcc) => usubo_carry x, 0, setcc
17316 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
17317 unsigned Opc = RHS.getOpcode();
17318 switch (Opc) {
17319 default:
17320 break;
17321 case ISD::ZERO_EXTEND:
17322 case ISD::SIGN_EXTEND:
17323 case ISD::ANY_EXTEND: {
17324 auto Cond = RHS.getOperand(0);
17325 // If this won't be a real VOPC output, we would still need to insert an
17326 // extra instruction anyway.
17327 if (!isBoolSGPR(Cond))
17328 break;
17329 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17330 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17331       Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
17332       return DAG.getNode(Opc, SL, VTList, Args);
17333 }
17334 }
17335
17336 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
17337 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
17338 if (!isNullConstant(LHS.getOperand(1)))
17339 return SDValue();
17340 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
17341 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
17342 }
17343
17344 // sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x.
17345 if (isOneConstant(RHS) && isCtlzOpc(LHS.getOpcode())) {
17346 SDValue CtlzSrc = LHS.getOperand(0);
17347 // Check for xor x, (sra x, 31) pattern.
17348 if (CtlzSrc.getOpcode() == ISD::XOR) {
17349 SDValue X = CtlzSrc.getOperand(0);
17350 SDValue SignExt = CtlzSrc.getOperand(1);
17351 // Try both ordering of XOR operands.
17352 if (SignExt.getOpcode() != ISD::SRA)
17353 std::swap(X, SignExt);
17354 if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(0) == X) {
17355 ConstantSDNode *ShiftAmt =
17356               dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
17357           unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17358 if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1)
17359 return DAG.getNode(ISD::CTLS, SL, VT, X);
17360 }
17361 }
17362 }
17363
17364 return SDValue();
17365}
17366
17367SDValue
17368SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
17369 DAGCombinerInfo &DCI) const {
17370
17371 if (N->getValueType(0) != MVT::i32)
17372 return SDValue();
17373
17374 if (!isNullConstant(N->getOperand(1)))
17375 return SDValue();
17376
17377 SelectionDAG &DAG = DCI.DAG;
17378 SDValue LHS = N->getOperand(0);
17379
17380 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
17381 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
17382 unsigned LHSOpc = LHS.getOpcode();
17383 unsigned Opc = N->getOpcode();
17384 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
17385 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
17386 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
17387 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
17388 }
17389 return SDValue();
17390}
17391
17392SDValue SITargetLowering::performFAddCombine(SDNode *N,
17393 DAGCombinerInfo &DCI) const {
17394 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17395 return SDValue();
17396
17397 SelectionDAG &DAG = DCI.DAG;
17398 EVT VT = N->getValueType(0);
17399
17400 SDLoc SL(N);
17401 SDValue LHS = N->getOperand(0);
17402 SDValue RHS = N->getOperand(1);
17403
17404 // These should really be instruction patterns, but writing patterns with
17405 // source modifiers is a pain.
17406
17407 // fadd (fadd (a, a), b) -> mad 2.0, a, b
17408 if (LHS.getOpcode() == ISD::FADD) {
17409 SDValue A = LHS.getOperand(0);
17410 if (A == LHS.getOperand(1)) {
17411 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17412 if (FusedOp != 0) {
17413 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17414 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17415 }
17416 }
17417 }
17418
17419 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
17420 if (RHS.getOpcode() == ISD::FADD) {
17421 SDValue A = RHS.getOperand(0);
17422 if (A == RHS.getOperand(1)) {
17423 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17424 if (FusedOp != 0) {
17425 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17426 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17427 }
17428 }
17429 }
17430
17431 return SDValue();
17432}
17433
17434SDValue SITargetLowering::performFSubCombine(SDNode *N,
17435 DAGCombinerInfo &DCI) const {
17436 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17437 return SDValue();
17438
17439 SelectionDAG &DAG = DCI.DAG;
17440 SDLoc SL(N);
17441 EVT VT = N->getValueType(0);
17442 assert(!VT.isVector());
17443
17444 // Try to get the fneg to fold into the source modifier. This undoes generic
17445 // DAG combines and folds them into the mad.
17446 //
17447 // Only do this if we are not trying to support denormals. v_mad_f32 does
17448 // not support denormals ever.
17449 SDValue LHS = N->getOperand(0);
17450 SDValue RHS = N->getOperand(1);
17451 if (LHS.getOpcode() == ISD::FADD) {
17452 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
17453 SDValue A = LHS.getOperand(0);
17454 if (A == LHS.getOperand(1)) {
17455 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17456 if (FusedOp != 0) {
17457 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17458 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
17459
17460 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17461 }
17462 }
17463 }
17464
17465 if (RHS.getOpcode() == ISD::FADD) {
17466 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
17467
17468 SDValue A = RHS.getOperand(0);
17469 if (A == RHS.getOperand(1)) {
17470 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17471 if (FusedOp != 0) {
17472 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
17473 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17474 }
17475 }
17476 }
17477
17478 return SDValue();
17479}
17480
17481SDValue SITargetLowering::performFDivCombine(SDNode *N,
17482 DAGCombinerInfo &DCI) const {
17483 SelectionDAG &DAG = DCI.DAG;
17484 SDLoc SL(N);
17485 EVT VT = N->getValueType(0);
17486
17487 // fsqrt legality correlates to rsq availability.
17488 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
17489 return SDValue();
17490
17491 SDValue LHS = N->getOperand(0);
17492 SDValue RHS = N->getOperand(1);
17493
17494 SDNodeFlags Flags = N->getFlags();
17495 SDNodeFlags RHSFlags = RHS->getFlags();
17496 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
17497 !RHS->hasOneUse())
17498 return SDValue();
17499
17500 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
17501 bool IsNegative = false;
17502 if (CLHS->isExactlyValue(1.0) ||
17503 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17504 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
17505 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
17506 if (RHS.getOpcode() == ISD::FSQRT) {
17507 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
17508 SDValue Rsq =
17509 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
17510 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
17511 }
17512 }
17513 }
17514
17515 return SDValue();
17516}
17517
17518SDValue SITargetLowering::performFMulCombine(SDNode *N,
17519 DAGCombinerInfo &DCI) const {
17520 SelectionDAG &DAG = DCI.DAG;
17521 EVT VT = N->getValueType(0);
17522 EVT ScalarVT = VT.getScalarType();
17523 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
17524
17525 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17526 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17527 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
17528 return SDValue();
17529 }
17530
17531 SDValue LHS = N->getOperand(0);
17532 SDValue RHS = N->getOperand(1);
17533
17534   // It is cheaper to realize i32 inline constants than to materialize f16 or
17535   // f64 (or even non-inline f32) values; this is possible via ldexp, as shown
17536   // below:
17537 //
17538 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17539 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17540 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
17541 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17542 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17543 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
17544 if (!TrueNode)
17545 return SDValue();
17546 const ConstantFPSDNode *FalseNode =
17547 isConstOrConstSplatFP(RHS.getOperand(2));
17548 if (!FalseNode)
17549 return SDValue();
17550
17551 if (TrueNode->isNegative() != FalseNode->isNegative())
17552 return SDValue();
17553
17554 // For f32, only non-inline constants should be transformed.
17555 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17556 if (ScalarVT == MVT::f32 &&
17557 TII->isInlineConstant(TrueNode->getValueAPF()) &&
17558 TII->isInlineConstant(FalseNode->getValueAPF()))
17559 return SDValue();
17560
17561 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17562 if (TrueNodeExpVal == INT_MIN)
17563 return SDValue();
17564 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17565 if (FalseNodeExpVal == INT_MIN)
17566 return SDValue();
17567
17568 SDLoc SL(N);
17569 SDValue SelectNode =
17570 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17571 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17572 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17573
17574 LHS = TrueNode->isNegative()
17575 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17576 : LHS;
17577
17578 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17579 }
17580
17581 return SDValue();
17582}
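// Illustrative example of the ldexp fold above (hypothetical constants, not
// from the original source): with A = 0.5 = 2^-1 and B = 4.0 = 2^2,
//   fmul f16 %x, (select %y, 0.5, 4.0)
//     --> fldexp %x, (select %y, i32 -1, i32 2)
// trading a pair of float constants for two i32 inline constants.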
17583
17584SDValue SITargetLowering::performFMACombine(SDNode *N,
17585 DAGCombinerInfo &DCI) const {
17586 SelectionDAG &DAG = DCI.DAG;
17587 EVT VT = N->getValueType(0);
17588 SDLoc SL(N);
17589
17590 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17591 return SDValue();
17592
17593 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17594 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17595 SDValue Op1 = N->getOperand(0);
17596 SDValue Op2 = N->getOperand(1);
17597 SDValue FMA = N->getOperand(2);
17598
17599 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
17600 Op2.getOpcode() != ISD::FP_EXTEND)
17601 return SDValue();
17602
17603 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17604 // regardless of the denorm mode setting. Therefore,
17605 // fp-contract is sufficient to allow generating fdot2.
17606 const TargetOptions &Options = DAG.getTarget().Options;
17607 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17608 (N->getFlags().hasAllowContract() &&
17609 FMA->getFlags().hasAllowContract())) {
17610 Op1 = Op1.getOperand(0);
17611 Op2 = Op2.getOperand(0);
17612     if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17613         Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17614       return SDValue();
17615
17616 SDValue Vec1 = Op1.getOperand(0);
17617 SDValue Idx1 = Op1.getOperand(1);
17618 SDValue Vec2 = Op2.getOperand(0);
17619
17620 SDValue FMAOp1 = FMA.getOperand(0);
17621 SDValue FMAOp2 = FMA.getOperand(1);
17622 SDValue FMAAcc = FMA.getOperand(2);
17623
17624 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
17625 FMAOp2.getOpcode() != ISD::FP_EXTEND)
17626 return SDValue();
17627
17628 FMAOp1 = FMAOp1.getOperand(0);
17629 FMAOp2 = FMAOp2.getOperand(0);
17630     if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17631         FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17632       return SDValue();
17633
17634 SDValue Vec3 = FMAOp1.getOperand(0);
17635 SDValue Vec4 = FMAOp2.getOperand(0);
17636 SDValue Idx2 = FMAOp1.getOperand(1);
17637
17638 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
17639 // Idx1 and Idx2 cannot be the same.
17640 Idx1 == Idx2)
17641 return SDValue();
17642
17643 if (Vec1 == Vec2 || Vec3 == Vec4)
17644 return SDValue();
17645
17646 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
17647 return SDValue();
17648
17649 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17650 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17651 DAG.getTargetConstant(0, SL, MVT::i1));
17652 }
17653 }
17654 return SDValue();
17655}
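
The scalar identity behind the FDOT2 match, as an editor-added sketch (not part of this file); the values are chosen so every intermediate is exact:

#include <cassert>

int main() {
  float S0[2] = {1.5f, 2.0f}, S1[2] = {4.0f, 0.5f};
  float z = 3.0f;
  float nested = S0[0] * S1[0] + (S0[1] * S1[1] + z); // the FMA chain
  float fdot2 = S0[0] * S1[0] + S0[1] * S1[1] + z;    // one dot2 plus accumulator
  assert(nested == fdot2); // exact here; in general fp-contract must permit it
  return 0;
}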
17656
17657// Given a double-precision ordered or unordered comparison, return the
17658// condition code for an equivalent integral comparison of the operands' upper
17659// 32 bits, or `SETCC_INVALID` if not possible.
17660// For simplicity, no simplification occurs if the operands are not both known
17661// to have sign bit zero.
17662//
17663// EQ/NE:
17664// If LHS.lo32 == RHS.lo32:
17665// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17666// If LHS.lo32 != RHS.lo32:
17667// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17668// The reduction is not possible if operands may be +0 and -0.
17669// For ordered eq / unordered ne, at most one operand may be NaN.
17670// For unordered eq / ordered ne, neither operand can be NaN.
17671//
17672// LT/GE:
17673// If LHS.lo32 >= RHS.lo32 (unsigned):
17674// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17675// If LHS.lo32 < RHS.lo32 (unsigned):
17676// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17677// The reduction is only supported if both operands are nonnegative.
17678// For ordered lt / unordered ge, the RHS cannot be NaN.
17679// For unordered lt / ordered ge, neither operand can be NaN.
17680//
17681// LE/GT:
17682// If LHS.lo32 > RHS.lo32 (unsigned):
17683// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17684// If LHS.lo32 <= RHS.lo32 (unsigned):
17685// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17686// The reduction is only supported if both operands are nonnegative.
17687// For unordered le / ordered gt, the LHS cannot be NaN.
17688// For ordered le / unordered gt, neither operand can be NaN.
17689 static ISD::CondCode tryReduceF64CompareToHiHalf(ISD::CondCode CC,
17690                                                  const SDValue LHS,
17691 const SDValue RHS,
17692 const SelectionDAG &DAG) {
17693 EVT VT = LHS.getValueType();
17694 assert(VT == MVT::f64 && "Incorrect operand type!");
17695
17696 const KnownBits RHSBits = DAG.computeKnownBits(RHS);
17697 // Bail if RHS sign bit is not known to be zero.
17698 if (!RHSBits.Zero.isSignBitSet())
17699 return ISD::SETCC_INVALID;
17700
17701 const KnownBits RHSKnownLo32 = RHSBits.trunc(32);
17702   const KnownFPClass RHSFPClass =
17703       DAG.computeKnownFPClass(RHS, fcNan);
17704 const bool RHSMaybeNaN = !RHSFPClass.isKnownNeverNaN();
17705
17706 const KnownBits LHSBits = DAG.computeKnownBits(LHS);
17707 const KnownBits LHSKnownLo32 = LHSBits.trunc(32);
17708   const KnownFPClass LHSFPClass =
17709       DAG.computeKnownFPClass(LHS, fcNan);
17710 const bool LHSMaybeNaN = !LHSFPClass.isKnownNeverNaN();
17711
17712 // Bail if LHS sign bit is not known to be zero.
17713 if (!LHSBits.Zero.isSignBitSet())
17714 return ISD::SETCC_INVALID;
17715
17716 switch (CC) {
17717 default:
17718 break;
17719 case ISD::SETEQ:
17720 case ISD::SETOEQ:
17721 case ISD::SETUEQ:
17722 case ISD::SETONE:
17723 case ISD::SETUNE: {
17724 // OEQ should be false if either operand is NaN, so it suffices that at
17725 // least one operand is not NaN.
17726 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
17727 break;
17728 // UEQ should be true if either operand is NaN, but this cannot be checked
17729 // on underlying bits.
17730 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
17731 break;
17732 // ONE should be false if either operand is NaN, but this cannot be
17733 // checked on underlying bits.
17734 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
17735 break;
17736 // UNE should be true if either operand is NaN, so it suffices that they
17737 // are not both NaN.
17738 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
17739 break;
17740
17741 const std::optional<bool> KnownEq =
17742 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17743
17744 if (!KnownEq)
17745 break;
17746
17747 if (*KnownEq)
17748 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
17749 ? ISD::SETEQ
17750 : ISD::SETNE;
17751
17752     return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
17753                ? ISD::SETFALSE
17754 : ISD::SETTRUE;
17755 }
17756 case ISD::SETLT:
17757 case ISD::SETOLT:
17758 case ISD::SETULT:
17759 case ISD::SETGE:
17760 case ISD::SETOGE:
17761 case ISD::SETUGE: {
17762 // OLT should be false if either operand is NaN.
17763 // Since NaNs have maximum exponent and nonzero mantissa, false positives
17764 // are only possible if the RHS is NaN. (No issue with RHS == +inf since
17765 // the inequality is strict)
17766 if (CC == ISD::SETOLT && RHSMaybeNaN)
17767 break;
17768 // ULT should be true if either operand is NaN, but this cannot be ensured
17769 // with a truncated comparison.
17770 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
17771 break;
17772 // OGE should be false if either operand is NaN, but this cannot be
17773 // ensured with a truncated comparison.
17774 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
17775 break;
17776 // UGE should be true if either operand is NaN.
17777 // False negatives are only possible if the RHS is NaN.
17778 // (No issue with RHS == +inf since the inequality is inclusive)
17779 if (CC == ISD::SETUGE && RHSMaybeNaN)
17780 break;
17781
17782 const std::optional<bool> KnownUge =
17783 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17784
17785 if (!KnownUge)
17786 break;
17787
17788 if (*KnownUge) {
17789 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17790 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
17791 ? ISD::SETLT
17792 : ISD::SETGE;
17793 }
17794 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17795 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
17796 ? ISD::SETLE
17797 : ISD::SETGT;
17798 }
17799 case ISD::SETLE:
17800 case ISD::SETOLE:
17801 case ISD::SETULE:
17802 case ISD::SETGT:
17803 case ISD::SETOGT:
17804 case ISD::SETUGT: {
17805 // OLE should be false if either operand is NaN, but this cannot be
17806 // ensured with a truncated comparison.
17807 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
17808 break;
17809 // ULE should be true if either operand is NaN.
17810 // False negatives are only possible if the LHS is NaN.
17811 // (No issue with LHS == +inf since the inequality is inclusive)
17812 if (CC == ISD::SETULE && LHSMaybeNaN)
17813 break;
17814 // OGT should be false if either operand is NaN.
17815 // False positives are only possible if the LHS is NaN.
17816 // (No issue with LHS == +inf since the inequality is strict)
17817 if (CC == ISD::SETOGT && LHSMaybeNaN)
17818 break;
17819 // UGT should be true if either operand is NaN, but this cannot be ensured
17820 // with a truncated comparison.
17821 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
17822 break;
17823
17824 const std::optional<bool> KnownUle =
17825 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17826
17827 if (!KnownUle)
17828 break;
17829
17830 if (*KnownUle) {
17831 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17832 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
17833 ? ISD::SETLE
17834 : ISD::SETGT;
17835 }
17836 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17837 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
17838 ? ISD::SETLT
17839 : ISD::SETGE;
17840 }
17841 }
17842
17843 return ISD::SETCC_INVALID;
17844}
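
An editor-added sketch (not part of this file) of the fact the reduction rests on: for doubles with sign bit zero, IEEE-754 ordering agrees with unsigned ordering of the raw bits, so once the low words are ordered only the high words remain to compare. std::bit_cast requires C++20:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  double L = 1.5, R = 1.25; // both nonnegative, neither NaN
  uint64_t LB = std::bit_cast<uint64_t>(L);
  uint64_t RB = std::bit_cast<uint64_t>(R);
  uint32_t LLo = uint32_t(LB), RLo = uint32_t(RB);
  uint32_t LHi = uint32_t(LB >> 32), RHi = uint32_t(RB >> 32);
  if (LLo >= RLo) // the KnownBits::uge case above
    assert((L >= R) == (LHi >= RHi));
  else            // LHS.lo32 ult RHS.lo32
    assert((L >= R) == (LHi > RHi));
  return 0;
}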
17845
17846SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17847 DAGCombinerInfo &DCI) const {
17848 SelectionDAG &DAG = DCI.DAG;
17849 SDLoc SL(N);
17850
17851 SDValue LHS = N->getOperand(0);
17852 SDValue RHS = N->getOperand(1);
17853 EVT VT = LHS.getValueType();
17854 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17855
17856 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
17857 if (!CRHS) {
17858     CRHS = dyn_cast<ConstantSDNode>(LHS);
17859     if (CRHS) {
17860 std::swap(LHS, RHS);
17861 CC = getSetCCSwappedOperands(CC);
17862 }
17863 }
17864
17865 if (CRHS) {
17866 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17867 isBoolSGPR(LHS.getOperand(0))) {
17868 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
17869 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
17870 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
17871 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
17872 if ((CRHS->isAllOnes() &&
17873 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
17874 (CRHS->isZero() &&
17875 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
17876 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
17877 DAG.getAllOnesConstant(SL, MVT::i1));
17878 if ((CRHS->isAllOnes() &&
17879 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
17880 (CRHS->isZero() &&
17881 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
17882 return LHS.getOperand(0);
17883 }
17884
17885 const APInt &CRHSVal = CRHS->getAPIntValue();
17886 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
17887 LHS.getOpcode() == ISD::SELECT &&
17888 isa<ConstantSDNode>(LHS.getOperand(1)) &&
17889 isa<ConstantSDNode>(LHS.getOperand(2)) &&
17890 isBoolSGPR(LHS.getOperand(0))) {
17891 // Given CT != FT:
17892 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
17893 // setcc (select cc, CT, CF), CF, ne => cc
17894 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
17895 // setcc (select cc, CT, CF), CT, eq => cc
17896 const APInt &CT = LHS.getConstantOperandAPInt(1);
17897 const APInt &CF = LHS.getConstantOperandAPInt(2);
17898
17899 if (CT != CF) {
17900 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
17901 (CT == CRHSVal && CC == ISD::SETNE))
17902 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17903 if ((CF == CRHSVal && CC == ISD::SETNE) ||
17904 (CT == CRHSVal && CC == ISD::SETEQ))
17905 return LHS.getOperand(0);
17906 }
17907 }
17908 }
17909
17910 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
17911 // following cases where information about the lower 32-bits of its operands
17912 // is known:
17913 //
17914 // If LHS.lo32 == RHS.lo32:
17915 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17916 // If LHS.lo32 != RHS.lo32:
17917 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17918 // If LHS.lo32 >= RHS.lo32 (unsigned):
17919 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17920 // If LHS.lo32 > RHS.lo32 (unsigned):
17921 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17922 // If LHS.lo32 <= RHS.lo32 (unsigned):
17923 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17924 // If LHS.lo32 < RHS.lo32 (unsigned):
17925 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17926 if (VT == MVT::i64) {
17927 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
17928 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
17929
17930 // NewCC is valid iff we can truncate the setcc to only test the upper 32
17931 // bits
17932     ISD::CondCode NewCC = ISD::SETCC_INVALID;
17933
17934 switch (CC) {
17935 default:
17936 break;
17937 case ISD::SETEQ: {
17938 const std::optional<bool> KnownEq =
17939 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17940 if (KnownEq)
17941 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
17942
17943 break;
17944 }
17945 case ISD::SETNE: {
17946 const std::optional<bool> KnownEq =
17947 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17948 if (KnownEq)
17949 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
17950
17951 break;
17952 }
17953 case ISD::SETULT:
17954 case ISD::SETUGE:
17955 case ISD::SETLT:
17956 case ISD::SETGE: {
17957 const std::optional<bool> KnownUge =
17958 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17959 if (KnownUge) {
17960 if (*KnownUge) {
17961 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17962 NewCC = CC;
17963 } else {
17964 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17965 NewCC = CC == ISD::SETULT ? ISD::SETULE
17966 : CC == ISD::SETUGE ? ISD::SETUGT
17967 : CC == ISD::SETLT ? ISD::SETLE
17968 : ISD::SETGT;
17969 }
17970 }
17971 break;
17972 }
17973 case ISD::SETULE:
17974 case ISD::SETUGT:
17975 case ISD::SETLE:
17976 case ISD::SETGT: {
17977 const std::optional<bool> KnownUle =
17978 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17979 if (KnownUle) {
17980 if (*KnownUle) {
17981 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17982 NewCC = CC;
17983 } else {
17984 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17985 NewCC = CC == ISD::SETULE ? ISD::SETULT
17986 : CC == ISD::SETUGT ? ISD::SETUGE
17987 : CC == ISD::SETLE ? ISD::SETLT
17988 : ISD::SETGE;
17989 }
17990 }
17991 break;
17992 }
17993 }
17994
17995 if (NewCC != ISD::SETCC_INVALID)
17996 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
17997 getHiHalf64(RHS, DAG), NewCC);
17998 }
17999
18000 // Eliminate setcc by using carryout from add/sub instruction
18001
18002 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
18003 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
18004 // similarly for subtraction
18005
18006 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
18007 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
18008
18009   if (VT == MVT::i64 && ((CC == ISD::SETULT &&
18010                           sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
18011                          (CC == ISD::SETUGT &&
18012                           sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
18013 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
18014 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
18015 bool IsAdd = LHS.getOpcode() == ISD::ADD;
18016
18017 SDValue Op0 = LHS.getOperand(0);
18018 SDValue Op1 = LHS.getOperand(1);
18019
18020 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
18021 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
18022
18023 SDValue Op0Hi = getHiHalf64(Op0, DAG);
18024 SDValue Op1Hi = getHiHalf64(Op1, DAG);
18025
18026 SDValue NodeLo =
18027 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
18028 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18029
18030 SDValue CarryInHi = NodeLo.getValue(1);
18031 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
18032 SL, DAG.getVTList(MVT::i32, MVT::i1),
18033 {Op0Hi, Op1Hi, CarryInHi});
18034
18035 SDValue ResultLo = NodeLo.getValue(0);
18036 SDValue ResultHi = NodeHi.getValue(0);
18037
18038 SDValue JoinedResult =
18039 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
18040
18041 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
18042 SDValue Overflow = NodeHi.getValue(1);
18043 DCI.CombineTo(LHS.getNode(), Result);
18044 return Overflow;
18045 }
18046
18047 if (VT != MVT::f32 && VT != MVT::f64 &&
18048 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18049 return SDValue();
18050
18051 // Match isinf/isfinite pattern
18052 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
18053 // (fcmp one (fabs x), inf) -> (fp_class x,
18054 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
18055 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
18056 LHS.getOpcode() == ISD::FABS) {
18057 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
18058 if (!CRHS)
18059 return SDValue();
18060
18061 const APFloat &APF = CRHS->getValueAPF();
18062 if (APF.isInfinity() && !APF.isNegative()) {
18063       const unsigned IsInfMask =
18064           SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
18065       const unsigned IsFiniteMask =
18066           SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO |
18067           SIInstrFlags::N_NORMAL | SIInstrFlags::P_NORMAL |
18068           SIInstrFlags::N_SUBNORMAL | SIInstrFlags::P_SUBNORMAL;
18069 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
18070 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
18071 DAG.getConstant(Mask, SL, MVT::i32));
18072 }
18073 }
18074
18075 if (VT == MVT::f64) {
18076 ISD::CondCode HiHalfCC = tryReduceF64CompareToHiHalf(CC, LHS, RHS, DAG);
18077 if (HiHalfCC != ISD::SETCC_INVALID)
18078 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18079 getHiHalf64(RHS, DAG), HiHalfCC);
18080 }
18081
18082 return SDValue();
18083}
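
An editor-added sketch (not part of this file) of the carry-out equivalence used above: for LHS = RHS + Z computed in 64 bits, (LHS ult RHS) is exactly the carry out of the split 32-bit UADDO / UADDO_CARRY pair:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t RHS = 0xFFFFFFFF00000001ull, Z = 0x00000000FFFFFFFFull;
  uint64_t LHS = RHS + Z; // wraps, so LHS ult RHS holds

  uint32_t Lo = uint32_t(RHS) + uint32_t(Z); // UADDO
  bool CarryLo = Lo < uint32_t(RHS);
  uint64_t Hi = uint64_t(uint32_t(RHS >> 32)) // UADDO_CARRY
                + uint32_t(Z >> 32) + CarryLo;
  bool CarryHi = Hi > 0xFFFFFFFFull;

  assert((LHS < RHS) == CarryHi);
  return 0;
}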
18084
18085SDValue
18086SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18087 DAGCombinerInfo &DCI) const {
18088 SelectionDAG &DAG = DCI.DAG;
18089 SDLoc SL(N);
18090 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18091
18092 SDValue Src = N->getOperand(0);
18093 SDValue Shift = N->getOperand(0);
18094
18095 // TODO: Extend type shouldn't matter (assuming legal types).
18096 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
18097 Shift = Shift.getOperand(0);
18098
18099 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
18100 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
18101 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
18102 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
18103 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
18104 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
18105 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
18106 SDValue Shifted = DAG.getZExtOrTrunc(
18107 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
18108
18109 unsigned ShiftOffset = 8 * Offset;
18110 if (Shift.getOpcode() == ISD::SHL)
18111 ShiftOffset -= C->getZExtValue();
18112 else
18113 ShiftOffset += C->getZExtValue();
18114
18115 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18116 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18117 MVT::f32, Shifted);
18118 }
18119 }
18120 }
18121
18122 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18123 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
18124 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
18125 // We simplified Src. If this node is not dead, visit it again so it is
18126 // folded properly.
18127 if (N->getOpcode() != ISD::DELETED_NODE)
18128 DCI.AddToWorklist(N);
18129 return SDValue(N, 0);
18130 }
18131
18132 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
18133 if (SDValue DemandedSrc =
18134 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
18135 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
18136
18137 return SDValue();
18138}
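
An editor-added sketch (not part of this file) of the byte-index arithmetic above: taking byte N of (x srl 8k) equals taking byte N+k of x, which is why the shift folds into the CVT_F32_UBYTE lane as long as the adjusted index stays in [0, 3]:

#include <cassert>
#include <cstdint>

static uint8_t byteN(uint32_t X, unsigned N) { return uint8_t(X >> (8 * N)); }

int main() {
  uint32_t X = 0xAABBCCDDu;
  for (unsigned N = 0; N < 4; ++N)
    for (unsigned K = 0; N + K < 4; ++K)
      assert(byteN(X >> (8 * K), N) == byteN(X, N + K));
  return 0;
}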
18139
18140SDValue SITargetLowering::performClampCombine(SDNode *N,
18141 DAGCombinerInfo &DCI) const {
18142 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
18143 if (!CSrc)
18144 return SDValue();
18145
18146 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18147 const APFloat &F = CSrc->getValueAPF();
18148 APFloat Zero = APFloat::getZero(F.getSemantics());
18149 if (F < Zero ||
18150 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18151 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
18152 }
18153
18154 APFloat One(F.getSemantics(), "1.0");
18155 if (F > One)
18156 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
18157
18158 return SDValue(CSrc, 0);
18159}
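
An editor-added sketch (not part of this file) mirroring the constant fold above: clamp pins values below 0.0 to 0.0, values above 1.0 to 1.0, and (in DX10 clamp mode) NaN to 0.0; anything already in [0, 1] folds to the source constant:

#include <cassert>
#include <cmath>

static double foldClamp(double F, bool DX10Clamp) {
  if (F < 0.0 || (std::isnan(F) && DX10Clamp))
    return 0.0;
  if (F > 1.0)
    return 1.0;
  return F; // already in range (or NaN without DX10 clamp): keep the constant
}

int main() {
  assert(foldClamp(-2.5, true) == 0.0);
  assert(foldClamp(3.0, true) == 1.0);
  assert(foldClamp(NAN, true) == 0.0);
  assert(foldClamp(0.5, true) == 0.5);
  return 0;
}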
18160
18161SDValue SITargetLowering::performSelectCombine(SDNode *N,
18162 DAGCombinerInfo &DCI) const {
18163
18164 // Try to fold CMP + SELECT patterns with shared constants (both FP and
18165 // integer).
18166 // Detect when CMP and SELECT use the same constant and fold them to avoid
18167 // loading the constant twice. Specifically handles patterns like:
18168 // %cmp = icmp eq i32 %val, 4242
18169 // %sel = select i1 %cmp, i32 4242, i32 %other
18170 // It can be optimized to reuse %val instead of 4242 in select.
18171 SDValue Cond = N->getOperand(0);
18172 SDValue TrueVal = N->getOperand(1);
18173 SDValue FalseVal = N->getOperand(2);
18174
18175 // Check if condition is a comparison.
18176 if (Cond.getOpcode() != ISD::SETCC)
18177 return SDValue();
18178
18179 SDValue LHS = Cond.getOperand(0);
18180 SDValue RHS = Cond.getOperand(1);
18181 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
18182
18183 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18184 bool isInteger = LHS.getValueType().isInteger();
18185
18186 // Handle simple floating-point and integer types only.
18187 if (!isFloatingPoint && !isInteger)
18188 return SDValue();
18189
18190 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
18191 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
18192 if (!isEquality && !isNonEquality)
18193 return SDValue();
18194
18195 SDValue ArgVal, ConstVal;
18196 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
18197 (isInteger && isa<ConstantSDNode>(RHS))) {
18198 ConstVal = RHS;
18199 ArgVal = LHS;
18200 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
18201 (isInteger && isa<ConstantSDNode>(LHS))) {
18202 ConstVal = LHS;
18203 ArgVal = RHS;
18204 } else {
18205 return SDValue();
18206 }
18207
18208 // Skip optimization for inlinable immediates.
18209 if (isFloatingPoint) {
18210 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
18211 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18212 return SDValue();
18213 } else {
18214     if (AMDGPU::isInlinableIntLiteral(
18215             cast<ConstantSDNode>(ConstVal)->getSExtValue()))
18216 return SDValue();
18217 }
18218
18219 // For equality and non-equality comparisons, patterns:
18220 // select (setcc x, const), const, y -> select (setcc x, const), x, y
18221 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
18222 if (!(isEquality && TrueVal == ConstVal) &&
18223 !(isNonEquality && FalseVal == ConstVal))
18224 return SDValue();
18225
18226 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
18227 SDValue SelectRHS =
18228 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
18229 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
18230 SelectLHS, SelectRHS);
18231}
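
An editor-added sketch (not part of this file) of the shared-constant fold: when the select's taken value is the constant the equality compare tests, the compared value can stand in for it, so the non-inlinable constant is materialized only once:

#include <cassert>

int main() {
  int Val = 4242, Other = 7;
  const int C = 4242;            // shared by the compare and the select
  bool Cmp = (Val == C);
  int Before = Cmp ? C : Other;  // select (setcc x, C, eq), C, y
  int After = Cmp ? Val : Other; // select (setcc x, C, eq), x, y
  assert(Before == After);       // equal whether or not Cmp holds
  return 0;
}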
18232
18233 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
18234                                             DAGCombinerInfo &DCI) const {
18235 switch (N->getOpcode()) {
18236 case ISD::ADD:
18237 case ISD::SUB:
18238 case ISD::SHL:
18239 case ISD::SRL:
18240 case ISD::SRA:
18241 case ISD::AND:
18242 case ISD::OR:
18243 case ISD::XOR:
18244 case ISD::MUL:
18245 case ISD::SETCC:
18246 case ISD::SELECT:
18247 case ISD::SMIN:
18248 case ISD::SMAX:
18249 case ISD::UMIN:
18250 case ISD::UMAX:
18251 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
18252 return Res;
18253 break;
18254 default:
18255 break;
18256 }
18257
18258 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
18259 return SDValue();
18260
18261 switch (N->getOpcode()) {
18262 case ISD::ADD:
18263 return performAddCombine(N, DCI);
18264 case ISD::PTRADD:
18265 return performPtrAddCombine(N, DCI);
18266 case ISD::SUB:
18267 return performSubCombine(N, DCI);
18268 case ISD::UADDO_CARRY:
18269 case ISD::USUBO_CARRY:
18270 return performAddCarrySubCarryCombine(N, DCI);
18271 case ISD::FADD:
18272 return performFAddCombine(N, DCI);
18273 case ISD::FSUB:
18274 return performFSubCombine(N, DCI);
18275 case ISD::FDIV:
18276 return performFDivCombine(N, DCI);
18277 case ISD::FMUL:
18278 return performFMulCombine(N, DCI);
18279 case ISD::SETCC:
18280 return performSetCCCombine(N, DCI);
18281 case ISD::SELECT:
18282 if (auto Res = performSelectCombine(N, DCI))
18283 return Res;
18284 break;
18285 case ISD::FMAXNUM:
18286 case ISD::FMINNUM:
18287 case ISD::FMAXNUM_IEEE:
18288 case ISD::FMINNUM_IEEE:
18289 case ISD::FMAXIMUM:
18290 case ISD::FMINIMUM:
18291 case ISD::FMAXIMUMNUM:
18292 case ISD::FMINIMUMNUM:
18293 case ISD::SMAX:
18294 case ISD::SMIN:
18295 case ISD::UMAX:
18296 case ISD::UMIN:
18297 case AMDGPUISD::FMIN_LEGACY:
18298 case AMDGPUISD::FMAX_LEGACY:
18299 return performMinMaxCombine(N, DCI);
18300 case ISD::FMA:
18301 return performFMACombine(N, DCI);
18302 case ISD::AND:
18303 return performAndCombine(N, DCI);
18304 case ISD::OR:
18305 return performOrCombine(N, DCI);
18306 case ISD::FSHR: {
18307     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18308     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
18309 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18310 return matchPERM(N, DCI);
18311 }
18312 break;
18313 }
18314 case ISD::XOR:
18315 return performXorCombine(N, DCI);
18316 case ISD::ANY_EXTEND:
18317 case ISD::ZERO_EXTEND:
18318 return performZeroOrAnyExtendCombine(N, DCI);
18319   case ISD::SIGN_EXTEND_INREG:
18320     return performSignExtendInRegCombine(N, DCI);
18321 case AMDGPUISD::FP_CLASS:
18322 return performClassCombine(N, DCI);
18323 case ISD::FCANONICALIZE:
18324 return performFCanonicalizeCombine(N, DCI);
18325 case AMDGPUISD::RCP:
18326 return performRcpCombine(N, DCI);
18327 case ISD::FLDEXP:
18328 case AMDGPUISD::FRACT:
18329 case AMDGPUISD::RSQ:
18330 case AMDGPUISD::RCP_LEGACY:
18331 case AMDGPUISD::RCP_IFLAG:
18332 case AMDGPUISD::RSQ_CLAMP: {
18333 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
18334 SDValue Src = N->getOperand(0);
18335 if (Src.isUndef())
18336 return Src;
18337 break;
18338 }
18339 case ISD::SINT_TO_FP:
18340 case ISD::UINT_TO_FP:
18341 return performUCharToFloatCombine(N, DCI);
18342 case ISD::FCOPYSIGN:
18343 return performFCopySignCombine(N, DCI);
18344 case AMDGPUISD::CVT_F32_UBYTE0:
18345 case AMDGPUISD::CVT_F32_UBYTE1:
18346 case AMDGPUISD::CVT_F32_UBYTE2:
18347 case AMDGPUISD::CVT_F32_UBYTE3:
18348 return performCvtF32UByteNCombine(N, DCI);
18349 case AMDGPUISD::FMED3:
18350 return performFMed3Combine(N, DCI);
18351 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18352 return performCvtPkRTZCombine(N, DCI);
18353 case AMDGPUISD::CLAMP:
18354 return performClampCombine(N, DCI);
18355 case ISD::SCALAR_TO_VECTOR: {
18356 SelectionDAG &DAG = DCI.DAG;
18357 EVT VT = N->getValueType(0);
18358
18359 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
18360 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18361 SDLoc SL(N);
18362 SDValue Src = N->getOperand(0);
18363 EVT EltVT = Src.getValueType();
18364 if (EltVT != MVT::i16)
18365 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
18366
18367 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
18368 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
18369 }
18370
18371 break;
18372 }
18373   case ISD::EXTRACT_VECTOR_ELT:
18374     return performExtractVectorEltCombine(N, DCI);
18375   case ISD::INSERT_VECTOR_ELT:
18376     return performInsertVectorEltCombine(N, DCI);
18377 case ISD::FP_ROUND:
18378 return performFPRoundCombine(N, DCI);
18379 case ISD::LOAD: {
18380 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
18381 return Widened;
18382 [[fallthrough]];
18383 }
18384 default: {
18385 if (!DCI.isBeforeLegalize()) {
18386 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
18387 return performMemSDNodeCombine(MemNode, DCI);
18388 }
18389
18390 break;
18391 }
18392 }
18393
18394   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
18395}
18396
18397/// Helper function for adjustWritemask
18398static unsigned SubIdx2Lane(unsigned Idx) {
18399 switch (Idx) {
18400 default:
18401 return ~0u;
18402 case AMDGPU::sub0:
18403 return 0;
18404 case AMDGPU::sub1:
18405 return 1;
18406 case AMDGPU::sub2:
18407 return 2;
18408 case AMDGPU::sub3:
18409 return 3;
18410 case AMDGPU::sub4:
18411 return 4; // Possible with TFE/LWE
18412 }
18413}
18414
18415/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
18416SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
18417 SelectionDAG &DAG) const {
18418 unsigned Opcode = Node->getMachineOpcode();
18419
18420 // Subtract 1 because the vdata output is not a MachineSDNode operand.
18421 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18422 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
18423 return Node; // not implemented for D16
18424
18425 SDNode *Users[5] = {nullptr};
18426 unsigned Lane = 0;
18427 unsigned DmaskIdx =
18428 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18429 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
18430 unsigned NewDmask = 0;
18431 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18432 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18433 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
18434 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
18435 unsigned TFCLane = 0;
18436 bool HasChain = Node->getNumValues() > 1;
18437
18438 if (OldDmask == 0) {
18439     // These are folded out, but in case it happens, don't assert.
18440 return Node;
18441 }
18442
18443 unsigned OldBitsSet = llvm::popcount(OldDmask);
18444 // Work out which is the TFE/LWE lane if that is enabled.
18445 if (UsesTFC) {
18446 TFCLane = OldBitsSet;
18447 }
18448
18449 // Try to figure out the used register components
18450 for (SDUse &Use : Node->uses()) {
18451
18452 // Don't look at users of the chain.
18453 if (Use.getResNo() != 0)
18454 continue;
18455
18456 SDNode *User = Use.getUser();
18457
18458 // Abort if we can't understand the usage
18459 if (!User->isMachineOpcode() ||
18460 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18461 return Node;
18462
18463 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
18464 // Note that subregs are packed, i.e. Lane==0 is the first bit set
18465 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
18466 // set, etc.
18467 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
18468 if (Lane == ~0u)
18469 return Node;
18470
18471 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
18472 if (UsesTFC && Lane == TFCLane) {
18473 Users[Lane] = User;
18474 } else {
18475 // Set which texture component corresponds to the lane.
18476 unsigned Comp;
18477 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18478 Comp = llvm::countr_zero(Dmask);
18479 Dmask &= ~(1 << Comp);
18480 }
18481
18482 // Abort if we have more than one user per component.
18483 if (Users[Lane])
18484 return Node;
18485
18486 Users[Lane] = User;
18487 NewDmask |= 1 << Comp;
18488 }
18489 }
18490
18491 // Don't allow 0 dmask, as hardware assumes one channel enabled.
18492 bool NoChannels = !NewDmask;
18493 if (NoChannels) {
18494 if (!UsesTFC) {
18495 // No uses of the result and not using TFC. Then do nothing.
18496 return Node;
18497 }
18498     // If the original dmask has one channel, there is nothing to do.
18499 if (OldBitsSet == 1)
18500 return Node;
18501 // Use an arbitrary dmask - required for the instruction to work
18502 NewDmask = 1;
18503 }
18504 // Abort if there's no change
18505 if (NewDmask == OldDmask)
18506 return Node;
18507
18508 unsigned BitsSet = llvm::popcount(NewDmask);
18509
18510 // Check for TFE or LWE - increase the number of channels by one to account
18511 // for the extra return value
18512 // This will need adjustment for D16 if this is also included in
18513 // adjustWriteMask (this function) but at present D16 are excluded.
18514 unsigned NewChannels = BitsSet + UsesTFC;
18515
18516 int NewOpcode =
18517 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
18518 assert(NewOpcode != -1 &&
18519 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18520 "failed to find equivalent MIMG op");
18521
18522 // Adjust the writemask in the node
18523   SmallVector<SDValue, 12> Ops;
18524   llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
18525 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
18526 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
18527
18528 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
18529
18530 MVT ResultVT = NewChannels == 1
18531 ? SVT
18532 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
18533 : NewChannels == 5 ? 8
18534 : NewChannels);
18535 SDVTList NewVTList =
18536 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
18537
18538 MachineSDNode *NewNode =
18539 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
18540
18541 if (HasChain) {
18542 // Update chain.
18543 DAG.setNodeMemRefs(NewNode, Node->memoperands());
18544 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
18545 }
18546
18547 if (NewChannels == 1) {
18548 assert(Node->hasNUsesOfValue(1, 0));
18549 SDNode *Copy =
18550 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
18551 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
18552 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
18553 return nullptr;
18554 }
18555
18556 // Update the users of the node with the new indices
18557 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18558 SDNode *User = Users[i];
18559 if (!User) {
18560 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
18561 // Users[0] is still nullptr because channel 0 doesn't really have a use.
18562 if (i || !NoChannels)
18563 continue;
18564 } else {
18565 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
18566 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
18567 if (NewUser != User) {
18568 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
18569 DAG.RemoveDeadNode(User);
18570 }
18571 }
18572
18573 switch (Idx) {
18574 default:
18575 break;
18576 case AMDGPU::sub0:
18577 Idx = AMDGPU::sub1;
18578 break;
18579 case AMDGPU::sub1:
18580 Idx = AMDGPU::sub2;
18581 break;
18582 case AMDGPU::sub2:
18583 Idx = AMDGPU::sub3;
18584 break;
18585 case AMDGPU::sub3:
18586 Idx = AMDGPU::sub4;
18587 break;
18588 }
18589 }
18590
18591 DAG.RemoveDeadNode(Node);
18592 return nullptr;
18593}
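
An editor-added sketch (not part of this file) of the packed lane/component mapping used in adjustWritemask: lane i of the result corresponds to the i-th set bit of the dmask. Requires C++20 for <bit>:

#include <bit>
#include <cassert>

static unsigned laneToComp(unsigned Dmask, unsigned Lane) {
  unsigned Comp = 0;
  for (unsigned i = 0; i <= Lane && Dmask != 0; ++i) {
    Comp = unsigned(std::countr_zero(Dmask));
    Dmask &= ~(1u << Comp);
  }
  return Comp;
}

int main() {
  // dmask 0b1010 enables components 1 (Y) and 3 (W):
  assert(laneToComp(0b1010u, 0) == 1); // lane 0 reads Y
  assert(laneToComp(0b1010u, 1) == 3); // lane 1 reads W
  assert(std::popcount(0b1010u) == 2); // two result registers in use
  return 0;
}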
18594
18595 static bool isFrameIndexOp(SDValue Op) {
18596   if (Op.getOpcode() == ISD::AssertZext)
18597 Op = Op.getOperand(0);
18598
18599 return isa<FrameIndexSDNode>(Op);
18600}
18601
18602/// Legalize target independent instructions (e.g. INSERT_SUBREG)
18603/// with frame index operands.
18604 /// LLVM assumes that inputs to these instructions are registers.
18605 SDNode *
18606 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
18607                                                 SelectionDAG &DAG) const {
18608 if (Node->getOpcode() == ISD::CopyToReg) {
18609 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
18610 SDValue SrcVal = Node->getOperand(2);
18611
18612 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
18613 // to try understanding copies to physical registers.
18614 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
18615 SDLoc SL(Node);
18616       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
18617       SDValue VReg = DAG.getRegister(
18618 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
18619
18620 SDNode *Glued = Node->getGluedNode();
18621 SDValue ToVReg = DAG.getCopyToReg(
18622 Node->getOperand(0), SL, VReg, SrcVal,
18623 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
18624 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
18625 VReg, ToVReg.getValue(1));
18626 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
18627 DAG.RemoveDeadNode(Node);
18628 return ToResultReg.getNode();
18629 }
18630 }
18631
18632   SmallVector<SDValue, 8> Ops;
18633   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18634 if (!isFrameIndexOp(Node->getOperand(i))) {
18635 Ops.push_back(Node->getOperand(i));
18636 continue;
18637 }
18638
18639 SDLoc DL(Node);
18640 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
18641 Node->getOperand(i).getValueType(),
18642 Node->getOperand(i)),
18643 0));
18644 }
18645
18646 return DAG.UpdateNodeOperands(Node, Ops);
18647}
18648
18649/// Fold the instructions after selecting them.
18650/// Returns null if users were already updated.
18651 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
18652                                           SelectionDAG &DAG) const {
18653   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18654   unsigned Opcode = Node->getMachineOpcode();
18655
18656 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18657 !TII->isGather4(Opcode) &&
18658 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
18659 return adjustWritemask(Node, DAG);
18660 }
18661
18662 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18663     legalizeTargetIndependentNode(Node, DAG);
18664     return Node;
18665 }
18666
18667 switch (Opcode) {
18668 case AMDGPU::V_DIV_SCALE_F32_e64:
18669 case AMDGPU::V_DIV_SCALE_F64_e64: {
18670 // Satisfy the operand register constraint when one of the inputs is
18671 // undefined. Ordinarily each undef value will have its own implicit_def of
18672 // a vreg, so force these to use a single register.
18673 SDValue Src0 = Node->getOperand(1);
18674 SDValue Src1 = Node->getOperand(3);
18675 SDValue Src2 = Node->getOperand(5);
18676
18677 if ((Src0.isMachineOpcode() &&
18678 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
18679 (Src0 == Src1 || Src0 == Src2))
18680 break;
18681
18682 MVT VT = Src0.getValueType().getSimpleVT();
18683 const TargetRegisterClass *RC =
18684 getRegClassFor(VT, Src0.getNode()->isDivergent());
18685
18687 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
18688
18689 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
18690 Src0, SDValue());
18691
18692 // src0 must be the same register as src1 or src2, even if the value is
18693 // undefined, so make sure we don't violate this constraint.
18694 if (Src0.isMachineOpcode() &&
18695 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
18696 if (Src1.isMachineOpcode() &&
18697 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18698 Src0 = Src1;
18699 else if (Src2.isMachineOpcode() &&
18700 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18701 Src0 = Src2;
18702 else {
18703 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
18704 Src0 = UndefReg;
18705 Src1 = UndefReg;
18706 }
18707 } else
18708 break;
18709
18711 Ops[1] = Src0;
18712 Ops[3] = Src1;
18713 Ops[5] = Src2;
18714 Ops.push_back(ImpDef.getValue(1));
18715 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
18716 }
18717 default:
18718 break;
18719 }
18720
18721 return Node;
18722}
18723
18724// Any MIMG instructions that use tfe or lwe require an initialization of the
18725// result register that will be written in the case of a memory access failure.
18726// The required code is also added to tie this init code to the result of the
18727// img instruction.
18728 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
18729   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18730   const SIRegisterInfo &TRI = TII->getRegisterInfo();
18731 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
18732 MachineBasicBlock &MBB = *MI.getParent();
18733
18734 int DstIdx =
18735 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18736 unsigned InitIdx = 0;
18737
18738 if (TII->isImage(MI)) {
18739 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
18740 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
18741 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
18742
18743 if (!TFE && !LWE) // intersect_ray
18744 return;
18745
18746 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18747 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18748 unsigned D16Val = D16 ? D16->getImm() : 0;
18749
18750 if (!TFEVal && !LWEVal)
18751 return;
18752
18753 // At least one of TFE or LWE are non-zero
18754 // We have to insert a suitable initialization of the result value and
18755 // tie this to the dest of the image instruction.
18756
18757 // Calculate which dword we have to initialize to 0.
18758 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
18759
18760     // Check that the dmask operand was found.
18761 assert(MO_Dmask && "Expected dmask operand in instruction");
18762
18763 unsigned dmask = MO_Dmask->getImm();
18764 // Determine the number of active lanes taking into account the
18765 // Gather4 special case
18766 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
18767
18768 bool Packed = !Subtarget->hasUnpackedD16VMem();
18769
18770 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18771
18772 // Abandon attempt if the dst size isn't large enough
18773 // - this is in fact an error but this is picked up elsewhere and
18774 // reported correctly.
18775 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18776
18777 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18778 if (DstSize < InitIdx)
18779 return;
18780 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
18781 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18782 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18783 } else {
18784 return;
18785 }
18786
18787 const DebugLoc &DL = MI.getDebugLoc();
18788
18789 // Create a register for the initialization value.
18790 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
18791 unsigned NewDst = 0; // Final initialized value will be in here
18792
18793 // If PRTStrictNull feature is enabled (the default) then initialize
18794 // all the result registers to 0, otherwise just the error indication
18795 // register (VGPRn+1)
18796 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18797 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18798
18799 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
18800 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18801 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
18802 // Initialize dword
18803 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
18804 // clang-format off
18805 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
18806 .addImm(0);
18807 // clang-format on
18808 // Insert into the super-reg
18809 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
18810 .addReg(PrevDst)
18811 .addReg(SubReg)
18812         .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
18813
18814 PrevDst = NewDst;
18815 }
18816
18817 // Add as an implicit operand
18818 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
18819
18820 // Tie the just added implicit operand to the dst
18821 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18822}
18823
18824/// Assign the register class depending on the number of
18825/// bits set in the writemask
18826 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
18827                                                      SDNode *Node) const {
18828   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18829
18830 MachineFunction *MF = MI.getMF();
18831 MachineRegisterInfo &MRI = MF->getRegInfo();
18832
18833 if (TII->isVOP3(MI.getOpcode())) {
18834 // Make sure constant bus requirements are respected.
18835 TII->legalizeOperandsVOP3(MRI, MI);
18836
18837 if (TII->isMAI(MI)) {
18838 // The ordinary src0, src1, src2 were legalized above.
18839 //
18840 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18841 // as a separate instruction.
18842 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18843 AMDGPU::OpName::scale_src0);
18844 if (Src0Idx != -1) {
18845 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18846 AMDGPU::OpName::scale_src1);
18847 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18848 TII->usesConstantBus(MRI, MI, Src1Idx))
18849 TII->legalizeOpWithMove(MI, Src1Idx);
18850 }
18851 }
18852
18853 return;
18854 }
18855
18856 if (TII->isImage(MI))
18857 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
18858}
18859
18860 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
18861                               uint64_t Val) {
18862 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
18863 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
18864}
18865
18866 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
18867                                                 const SDLoc &DL,
18868                                                 SDValue Ptr) const {
18869   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18870
18871 // Build the half of the subregister with the constants before building the
18872 // full 128-bit register. If we are building multiple resource descriptors,
18873 // this will allow CSEing of the 2-component register.
18874 const SDValue Ops0[] = {
18875 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
18876 buildSMovImm32(DAG, DL, 0),
18877 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
18878 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
18879 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
18880
18881 SDValue SubRegHi = SDValue(
18882 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
18883
18884 // Combine the constants and the pointer.
18885 const SDValue Ops1[] = {
18886 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
18887 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
18888 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
18889
18890 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
18891}
18892
18893/// Return a resource descriptor with the 'Add TID' bit enabled
18894/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
18895/// of the resource descriptor) to create an offset, which is added to
18896/// the resource pointer.
18897 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
18898                                            SDValue Ptr, uint32_t RsrcDword1,
18899 uint64_t RsrcDword2And3) const {
18900 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
18901 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
18902 if (RsrcDword1) {
18903 PtrHi =
18904 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
18905 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
18906 0);
18907 }
18908
18909 SDValue DataLo =
18910 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
18911 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
18912
18913 const SDValue Ops[] = {
18914 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
18915 PtrLo,
18916 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
18917 PtrHi,
18918 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
18919 DataLo,
18920 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
18921 DataHi,
18922 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
18923
18924 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
18925}
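
An editor-added sketch (not part of this file) of the stride placement described above: bits [61:48] of the 128-bit descriptor are bits [29:16] of dword 1, which is what the RsrcDword1 value OR'd into PtrHi supplies. The field widths here are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Stride = 16;                           // bytes scaled by the thread ID
  uint32_t RsrcDword1 = (Stride & 0x3FFFu) << 16; // stride into bits [29:16]
  uint32_t PtrHi = 0x0000FFFFu;                   // top of a 48-bit base address
  uint32_t Dword1 = PtrHi | RsrcDword1;           // the S_OR_B32 above
  assert(((Dword1 >> 16) & 0x3FFFu) == Stride);
  return 0;
}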
18926
18927//===----------------------------------------------------------------------===//
18928// SI Inline Assembly Support
18929//===----------------------------------------------------------------------===//
18930
18931std::pair<unsigned, const TargetRegisterClass *>
18932 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
18933                                                StringRef Constraint,
18934 MVT VT) const {
18935 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
18936
18937 const TargetRegisterClass *RC = nullptr;
18938 if (Constraint.size() == 1) {
18939 // Check if we cannot determine the bit size of the given value type. This
18940 // can happen, for example, in this situation where we have an empty struct
18941     // (size 0): `call void asm "", "v"({} poison)`.
18942 if (VT == MVT::Other)
18943 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18944 const unsigned BitWidth = VT.getSizeInBits();
18945 switch (Constraint[0]) {
18946 default:
18947 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18948 case 's':
18949 case 'r':
18950 switch (BitWidth) {
18951 case 16:
18952 RC = &AMDGPU::SReg_32RegClass;
18953 break;
18954 case 64:
18955 RC = &AMDGPU::SGPR_64RegClass;
18956 break;
18957 default:
18958       RC = TRI->getSGPRClassForBitWidth(BitWidth);
18959       if (!RC)
18960 return std::pair(0U, nullptr);
18961 break;
18962 }
18963 break;
18964 case 'v':
18965 switch (BitWidth) {
18966 case 1:
18967 return std::pair(0U, nullptr);
18968 case 16:
18969 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18970 : &AMDGPU::VGPR_32_Lo256RegClass;
18971 break;
18972 default:
18973 RC = Subtarget->has1024AddressableVGPRs()
18974 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18975 : TRI->getVGPRClassForBitWidth(BitWidth);
18976 if (!RC)
18977 return std::pair(0U, nullptr);
18978 break;
18979 }
18980 break;
18981 case 'a':
18982 if (!Subtarget->hasMAIInsts())
18983 break;
18984 switch (BitWidth) {
18985 case 1:
18986 return std::pair(0U, nullptr);
18987 case 16:
18988 RC = &AMDGPU::AGPR_32RegClass;
18989 break;
18990 default:
18991 RC = TRI->getAGPRClassForBitWidth(BitWidth);
18992 if (!RC)
18993 return std::pair(0U, nullptr);
18994 break;
18995 }
18996 break;
18997 }
18998 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18999 const unsigned BitWidth = VT.getSizeInBits();
19000 switch (BitWidth) {
19001 case 16:
19002 RC = &AMDGPU::AV_32RegClass;
19003 break;
19004 default:
19005 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19006 if (!RC)
19007 return std::pair(0U, nullptr);
19008 break;
19009 }
19010 }
19011
19012 // We actually support i128, i16 and f16 as inline parameters
19013 // even if they are not reported as legal
19014 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
19015 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
19016 return std::pair(0U, RC);
19017
19018 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
19019 if (Kind != '\0') {
19020 if (Kind == 'v') {
19021 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19022 } else if (Kind == 's') {
19023 RC = &AMDGPU::SGPR_32RegClass;
19024 } else if (Kind == 'a') {
19025 RC = &AMDGPU::AGPR_32RegClass;
19026 }
19027
19028 if (RC) {
19029 if (NumRegs > 1) {
19030 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
19031 return std::pair(0U, nullptr);
19032
19033 uint32_t Width = NumRegs * 32;
19034 // Prohibit constraints for register ranges with a width that does not
19035 // match the required type.
19036 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
19037 return std::pair(0U, nullptr);
19038
19039 MCRegister Reg = RC->getRegister(Idx);
19040       if (SIRegisterInfo::isVGPRClass(RC))
19041         RC = TRI->getVGPRClassForBitWidth(Width);
19042 else if (SIRegisterInfo::isSGPRClass(RC))
19043 RC = TRI->getSGPRClassForBitWidth(Width);
19044 else if (SIRegisterInfo::isAGPRClass(RC))
19045 RC = TRI->getAGPRClassForBitWidth(Width);
19046 if (RC) {
19047 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19048 if (!Reg) {
19049 // The register class does not contain the requested register,
19050 // e.g., because it is an SGPR pair that would violate alignment
19051 // requirements.
19052 return std::pair(0U, nullptr);
19053 }
19054 return std::pair(Reg, RC);
19055 }
19056 }
19057
19058 // Check for lossy scalar/vector conversions.
19059 if (VT.isVector() && VT.getSizeInBits() != 32)
19060 return std::pair(0U, nullptr);
19061 if (RC && Idx < RC->getNumRegs())
19062 return std::pair(RC->getRegister(Idx), RC);
19063 return std::pair(0U, nullptr);
19064 }
19065 }
19066
19067 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19068 if (Ret.first)
19069 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
19070
19071 return Ret;
19072}
19073
19074static bool isImmConstraint(StringRef Constraint) {
19075 if (Constraint.size() == 1) {
19076 switch (Constraint[0]) {
19077 default:
19078 break;
19079 case 'I':
19080 case 'J':
19081 case 'A':
19082 case 'B':
19083 case 'C':
19084 return true;
19085 }
19086 } else if (Constraint == "DA" || Constraint == "DB") {
19087 return true;
19088 }
19089 return false;
19090}
19091
19092 SITargetLowering::ConstraintType
19093 SITargetLowering::getConstraintType(StringRef Constraint) const {
19094   if (Constraint.size() == 1) {
19095 switch (Constraint[0]) {
19096 default:
19097 break;
19098 case 's':
19099 case 'v':
19100 case 'a':
19101 return C_RegisterClass;
19102 }
19103 } else if (Constraint.size() == 2) {
19104 if (Constraint == "VA")
19105 return C_RegisterClass;
19106 }
19107 if (isImmConstraint(Constraint)) {
19108 return C_Other;
19109 }
19110 return TargetLowering::getConstraintType(Constraint);
19111}
19112
19113static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
19114   if (Size < 64) {
19115     Val = Val & maskTrailingOnes<uint64_t>(Size);
19116 }
19117 return Val;
19118}
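
An editor-added sketch (not part of this file), equivalent to clearUnusedBits: maskTrailingOnes<uint64_t>(Size) is a run of Size trailing one bits:

#include <cassert>
#include <cstdint>

static uint64_t clearUnused(uint64_t Val, unsigned Size) {
  if (Size < 64)
    Val &= (uint64_t(1) << Size) - 1; // keep only the low Size bits
  return Val;
}

int main() {
  assert(clearUnused(0xFFFFFFFFFFFFFFFFull, 16) == 0xFFFFull);
  assert(clearUnused(0x12345678u, 32) == 0x12345678u);
  return 0;
}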
19119
19120 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19121                                                     StringRef Constraint,
19122 std::vector<SDValue> &Ops,
19123 SelectionDAG &DAG) const {
19124 if (isImmConstraint(Constraint)) {
19125 uint64_t Val;
19126 if (getAsmOperandConstVal(Op, Val) &&
19127 checkAsmConstraintVal(Op, Constraint, Val)) {
19128 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
19129 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
19130 }
19131 } else {
19132     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19133   }
19134}
19135
19136 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
19137   unsigned Size = Op.getScalarValueSizeInBits();
19138 if (Size > 64)
19139 return false;
19140
19141 if (Size == 16 && !Subtarget->has16BitInsts())
19142 return false;
19143
19144   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19145     Val = C->getSExtValue();
19146 return true;
19147 }
19148   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
19149     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19150 return true;
19151 }
19152   if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
19153     if (Size != 16 || Op.getNumOperands() != 2)
19154 return false;
19155 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
19156 return false;
19157 if (ConstantSDNode *C = V->getConstantSplatNode()) {
19158 Val = C->getSExtValue();
19159 return true;
19160 }
19161 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
19162 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19163 return true;
19164 }
19165 }
19166
19167 return false;
19168}
19169
19170 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
19171                                              uint64_t Val) const {
19172 if (Constraint.size() == 1) {
19173 switch (Constraint[0]) {
19174 case 'I':
19175       return AMDGPU::isInlinableIntLiteral(Val);
19176     case 'J':
19177 return isInt<16>(Val);
19178 case 'A':
19179 return checkAsmConstraintValA(Op, Val);
19180 case 'B':
19181 return isInt<32>(Val);
19182 case 'C':
19183 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
19184              AMDGPU::isInlinableIntLiteral(Val);
19185     default:
19186 break;
19187 }
19188 } else if (Constraint.size() == 2) {
19189 if (Constraint == "DA") {
19190 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19191 int64_t LoBits = static_cast<int32_t>(Val);
19192 return checkAsmConstraintValA(Op, HiBits, 32) &&
19193 checkAsmConstraintValA(Op, LoBits, 32);
19194 }
19195 if (Constraint == "DB") {
19196 return true;
19197 }
19198 }
19199 llvm_unreachable("Invalid asm constraint");
19200}
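
An editor-added sketch (not part of this file) of the "DA" split above: the 64-bit value is checked as two independently sign-extended 32-bit halves:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Val = 0xFFFFFFFE00000007ull;
  int64_t HiBits = static_cast<int32_t>(Val >> 32); // sign-extends the high word
  int64_t LoBits = static_cast<int32_t>(Val);       // sign-extends the low word
  assert(HiBits == -2 && LoBits == 7);
  return 0;
}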
19201
19202 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
19203                                               unsigned MaxSize) const {
19204 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
19205 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19206 if (Size == 16) {
19207 MVT VT = Op.getSimpleValueType();
19208 switch (VT.SimpleTy) {
19209 default:
19210 return false;
19211 case MVT::i16:
19212 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
19213 case MVT::f16:
19214 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
19215 case MVT::bf16:
19216 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
19217 case MVT::v2i16:
19218 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
19219 case MVT::v2f16:
19220 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
19221 case MVT::v2bf16:
19222 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
19223 }
19224 }
19225 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
19226 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
19227 return true;
19228 return false;
19229}
19230
19231static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
19232 switch (UnalignedClassID) {
19233 case AMDGPU::VReg_64RegClassID:
19234 return AMDGPU::VReg_64_Align2RegClassID;
19235 case AMDGPU::VReg_96RegClassID:
19236 return AMDGPU::VReg_96_Align2RegClassID;
19237 case AMDGPU::VReg_128RegClassID:
19238 return AMDGPU::VReg_128_Align2RegClassID;
19239 case AMDGPU::VReg_160RegClassID:
19240 return AMDGPU::VReg_160_Align2RegClassID;
19241 case AMDGPU::VReg_192RegClassID:
19242 return AMDGPU::VReg_192_Align2RegClassID;
19243 case AMDGPU::VReg_224RegClassID:
19244 return AMDGPU::VReg_224_Align2RegClassID;
19245 case AMDGPU::VReg_256RegClassID:
19246 return AMDGPU::VReg_256_Align2RegClassID;
19247 case AMDGPU::VReg_288RegClassID:
19248 return AMDGPU::VReg_288_Align2RegClassID;
19249 case AMDGPU::VReg_320RegClassID:
19250 return AMDGPU::VReg_320_Align2RegClassID;
19251 case AMDGPU::VReg_352RegClassID:
19252 return AMDGPU::VReg_352_Align2RegClassID;
19253 case AMDGPU::VReg_384RegClassID:
19254 return AMDGPU::VReg_384_Align2RegClassID;
19255 case AMDGPU::VReg_512RegClassID:
19256 return AMDGPU::VReg_512_Align2RegClassID;
19257 case AMDGPU::VReg_1024RegClassID:
19258 return AMDGPU::VReg_1024_Align2RegClassID;
19259 case AMDGPU::AReg_64RegClassID:
19260 return AMDGPU::AReg_64_Align2RegClassID;
19261 case AMDGPU::AReg_96RegClassID:
19262 return AMDGPU::AReg_96_Align2RegClassID;
19263 case AMDGPU::AReg_128RegClassID:
19264 return AMDGPU::AReg_128_Align2RegClassID;
19265 case AMDGPU::AReg_160RegClassID:
19266 return AMDGPU::AReg_160_Align2RegClassID;
19267 case AMDGPU::AReg_192RegClassID:
19268 return AMDGPU::AReg_192_Align2RegClassID;
19269 case AMDGPU::AReg_256RegClassID:
19270 return AMDGPU::AReg_256_Align2RegClassID;
19271 case AMDGPU::AReg_512RegClassID:
19272 return AMDGPU::AReg_512_Align2RegClassID;
19273 case AMDGPU::AReg_1024RegClassID:
19274 return AMDGPU::AReg_1024_Align2RegClassID;
19275 default:
19276 return -1;
19277 }
19278}
19279
19280// Figure out which registers should be reserved for stack access. Only after
19281// the function is legalized do we know all of the non-spill stack objects or if
19282// calls are present.
19283void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
19284 MachineRegisterInfo &MRI = MF.getRegInfo();
19285 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19286 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
19287 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19288 const SIInstrInfo *TII = ST.getInstrInfo();
19289
19290 if (Info->isEntryFunction()) {
19291 // Callable functions have fixed registers used for stack access.
19292 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
19293 }
19294
19295 // TODO: Move this logic to getReservedRegs()
19296 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
19297 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19298 Register SReg = ST.isWave32()
19299 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19300 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
19301 &AMDGPU::SGPR_64RegClass);
19302 Info->setSGPRForEXECCopy(SReg);
19303
19304 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19305 Info->getStackPtrOffsetReg()));
19306 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19307 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19308
19309 // We need to worry about replacing the default register with itself in case
19310 // of MIR testcases missing the MFI.
19311 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19312 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19313
19314 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19315 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
19316
19317 Info->limitOccupancy(MF);
19318
19319 if (ST.isWave32() && !MF.empty()) {
19320 for (auto &MBB : MF) {
19321 for (auto &MI : MBB) {
19322 TII->fixImplicitOperands(MI);
19323 }
19324 }
19325 }
19326
19327 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
19328 // classes if required. Ideally the register class constraints would differ
19329 // per-subtarget, but there's no easy way to achieve that right now. This is
19330 // not a problem for VGPRs because the correctly aligned VGPR class is implied
19331 // from using them as the register class for legal types.
19332 if (ST.needsAlignedVGPRs()) {
19333 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
19334 const Register Reg = Register::index2VirtReg(I);
19335 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
19336 if (!RC)
19337 continue;
19338 int NewClassID = getAlignedAGPRClassID(RC->getID());
19339 if (NewClassID != -1)
19340 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
19341 }
19342 }
19343
19344 TargetLoweringBase::finalizeLowering(MF);
19345}
19346
19347void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19348 KnownBits &Known,
19349 const APInt &DemandedElts,
19350 const SelectionDAG &DAG,
19351 unsigned Depth) const {
19352 Known.resetAll();
19353 unsigned Opc = Op.getOpcode();
19354 switch (Opc) {
19355 case ISD::INTRINSIC_WO_CHAIN: {
19356 unsigned IID = Op.getConstantOperandVal(0);
19357 switch (IID) {
19358 case Intrinsic::amdgcn_mbcnt_lo:
19359 case Intrinsic::amdgcn_mbcnt_hi: {
19360 const GCNSubtarget &ST =
19361 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
19362 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19363 // most 31 + src1.
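 // For example, in a wave64 kernel the mbcnt_lo count fits in 6 bits
 // (the wavefront size log2), so bits [31:6] are known zero here before
 // KnownBits::add folds in what is known about src1.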
19364 Known.Zero.setBitsFrom(
19365 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19366 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
19367 Known = KnownBits::add(Known, Known2);
19368 return;
19369 }
19370 }
19371 break;
19372 }
19373 }
19374 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
19375 Op, Known, DemandedElts, DAG, Depth);
19376}
19377
19378void SITargetLowering::computeKnownBitsForFrameIndex(
19379 const int FI, KnownBits &Known, const MachineFunction &MF) const {
19380 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
19381
19382 // Set the high bits to zero based on the maximum allowed scratch size per
19383 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
19384 // calculation won't overflow, so assume the sign bit is never set.
19385 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
19386}
19387
19388static void knownBitsForWorkitemID(const GCNSubtarget &ST,
19389 GISelValueTracking &VT, KnownBits &Known,
19390 unsigned Dim) {
19391 unsigned MaxValue =
19392 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
19393 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
19394}
19395
19397 KnownBits &Known, const APInt &DemandedElts,
19398 unsigned BFEWidth, bool SExt, unsigned Depth) {
19399 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
19400 const MachineOperand &Src1 = MI.getOperand(2);
19401
19402 unsigned Src1Cst = 0;
19403 if (Src1.isImm()) {
19404 Src1Cst = Src1.getImm();
19405 } else if (Src1.isReg()) {
19406 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
19407 if (!Cst)
19408 return;
19409 Src1Cst = Cst->Value.getZExtValue();
19410 } else {
19411 return;
19412 }
19413
19414 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
19415 // Width is always [22:16].
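 // Worked example: Src1Cst = 0x00080004 for S_BFE_U32 gives Offset = 4 and
 // Width = 8, i.e. the result is (src0 >> 4) & 0xff, so after the zext
 // below the top 24 bits are known zero.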
19416 const unsigned Offset =
19417 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
19418 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
19419
19420 if (Width >= BFEWidth) // Ill-formed.
19421 return;
19422
19423 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
19424 Depth + 1);
19425
19426 Known = Known.extractBits(Width, Offset);
19427
19428 if (SExt)
19429 Known = Known.sext(BFEWidth);
19430 else
19431 Known = Known.zext(BFEWidth);
19432}
19433
19434void SITargetLowering::computeKnownBitsForTargetInstr(
19435 GISelValueTracking &VT, Register R, KnownBits &Known,
19436 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
19437 unsigned Depth) const {
19438 Known.resetAll();
19439 const MachineInstr *MI = MRI.getVRegDef(R);
19440 switch (MI->getOpcode()) {
19441 case AMDGPU::S_BFE_I32:
19442 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19443 /*SExt=*/true, Depth);
19444 case AMDGPU::S_BFE_U32:
19445 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19446 /*SExt=*/false, Depth);
19447 case AMDGPU::S_BFE_I64:
19448 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19449 /*SExt=*/true, Depth);
19450 case AMDGPU::S_BFE_U64:
19451 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19452 /*SExt=*/false, Depth);
19453 case AMDGPU::G_INTRINSIC:
19454 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19455 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
19456 switch (IID) {
19457 case Intrinsic::amdgcn_workitem_id_x:
19458 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
19459 break;
19460 case Intrinsic::amdgcn_workitem_id_y:
19461 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
19462 break;
19463 case Intrinsic::amdgcn_workitem_id_z:
19464 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
19465 break;
19466 case Intrinsic::amdgcn_mbcnt_lo:
19467 case Intrinsic::amdgcn_mbcnt_hi: {
19468 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19469 // most 31 + src1.
19470 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
19471 ? getSubtarget()->getWavefrontSizeLog2()
19472 : 5);
19473 KnownBits Known2;
19474 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
19475 Depth + 1);
19476 Known = KnownBits::add(Known, Known2);
19477 break;
19478 }
19479 case Intrinsic::amdgcn_groupstaticsize: {
19480 // We can report everything over the maximum size as 0. We can't report
19481 // based on the actual size because we don't know if it's accurate or not
19482 // at any given point.
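 // For example, on a subtarget with 64 KiB of addressable LDS the result
 // is at most 65536, which needs 17 bits, so the top 15 bits of the i32
 // result are known zero.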
19483 Known.Zero.setHighBits(
19484 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
19485 break;
19486 }
19487 }
19488 break;
19489 }
19490 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19491 Known.Zero.setHighBits(24);
19492 break;
19493 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19494 Known.Zero.setHighBits(16);
19495 break;
19496 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19497 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
19498 // producing exactly 0 or 1.
19499 Known.Zero.setHighBits(Known.getBitWidth() - 1);
19500 break;
19501 case AMDGPU::G_AMDGPU_SMED3:
19502 case AMDGPU::G_AMDGPU_UMED3: {
19503 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19504
19505 KnownBits Known2;
19506 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
19507 if (Known2.isUnknown())
19508 break;
19509
19510 KnownBits Known1;
19511 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
19512 if (Known1.isUnknown())
19513 break;
19514
19515 KnownBits Known0;
19516 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
19517 if (Known0.isUnknown())
19518 break;
19519
19520 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
19521 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
19522 Known.One = Known0.One & Known1.One & Known2.One;
19523 break;
19524 }
19525 }
19526}
19527
19528Align SITargetLowering::computeKnownAlignForTargetInstr(
19529 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
19530 unsigned Depth) const {
19531 const MachineInstr *MI = MRI.getVRegDef(R);
19532 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
19533 // FIXME: Can this move to generic code? What about the case where the call
19534 // site specifies a lower alignment?
19535 Intrinsic::ID IID = GI->getIntrinsicID();
 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
19537 AttributeList Attrs =
19538 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
19539 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19540 return *RetAlign;
19541 }
19542 return Align(1);
19543}
19544
19545Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
19546 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
19547 const Align CacheLineAlign = Align(64);
19548
19549 // GFX950: Prevent an 8-byte instruction at loop header from being split by
19550 // the 32-byte instruction fetch window boundary. This avoids a significant
19551 // fetch delay after backward branch. We use 32-byte alignment with max
19552 // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
19553 if (ML && !DisableLoopAlignment &&
19554 getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
19555 const MachineBasicBlock *Header = ML->getHeader();
19556 // Respect user-specified or previously set alignment.
19557 if (Header->getAlignment() != PrefAlign)
19558 return Header->getAlignment();
19559 if (needsFetchWindowAlignment(*Header))
19560 return Align(32);
19561 }
19562
19563 // Pre-GFX10 targets did not benefit from loop alignment.
19564 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
19565 getSubtarget()->hasInstFwdPrefetchBug())
19566 return PrefAlign;
19567
19568 // On GFX10 I$ is 4 x 64 bytes cache lines.
19569 // By default prefetcher keeps one cache line behind and reads two ahead.
19570 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
19571 // behind and one ahead.
19572 // Therefore we can benefit from aligning loop headers if the loop fits in
19573 // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
19574 // cache lines and does not need an alignment.
19575 // Otherwise, if the loop is at most 128 bytes, the default prefetch mode
19576 // already suffices; if it is at most 192 bytes, we need two lines behind.
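 // Illustrative sizes: a 100-byte loop returns CacheLineAlign with the
 // default prefetch mode; a 160-byte loop additionally gets the
 // S_INST_PREFETCH adjustments emitted below; loops over 192 bytes keep
 // PrefAlign unchanged.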
19577
19578 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19579 const MachineBasicBlock *Header = ML->getHeader();
19580 if (Header->getAlignment() != PrefAlign)
19581 return Header->getAlignment(); // Already processed.
19582
19583 unsigned LoopSize = 0;
19584 for (const MachineBasicBlock *MBB : ML->blocks()) {
19585 // If an inner loop block is aligned, assume on average half of the
19586 // alignment size to be added as nops.
19587 if (MBB != Header)
19588 LoopSize += MBB->getAlignment().value() / 2;
19589
19590 for (const MachineInstr &MI : *MBB) {
19591 LoopSize += TII->getInstSizeInBytes(MI);
19592 if (LoopSize > 192)
19593 return PrefAlign;
19594 }
19595 }
19596
19597 if (LoopSize <= 64)
19598 return PrefAlign;
19599
19600 if (LoopSize <= 128)
19601 return CacheLineAlign;
19602
19603 // If any of parent loops is surrounded by prefetch instructions do not
19604 // insert new for inner loop, which would reset parent's settings.
19605 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
19606 if (MachineBasicBlock *Exit = P->getExitBlock()) {
19607 auto I = Exit->getFirstNonDebugInstr();
19608 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19609 return CacheLineAlign;
19610 }
19611 }
19612
19613 MachineBasicBlock *Pre = ML->getLoopPreheader();
19614 MachineBasicBlock *Exit = ML->getExitBlock();
19615
19616 if (Pre && Exit) {
19617 auto PreTerm = Pre->getFirstTerminator();
19618 if (PreTerm == Pre->begin() ||
19619 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19620 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19621 .addImm(1); // prefetch 2 lines behind PC
19622
19623 auto ExitHead = Exit->getFirstNonDebugInstr();
19624 if (ExitHead == Exit->end() ||
19625 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19626 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19627 .addImm(2); // prefetch 1 line behind PC
19628 }
19629
19630 return CacheLineAlign;
19631}
19632
19633unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
19634 MachineBasicBlock *MBB) const {
19635 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
19636 // instruction could be split by the 32-byte fetch window boundary.
19637 // See getPrefLoopAlignment() for context.
19638 if (needsFetchWindowAlignment(*MBB))
19639 return 4;
19640 return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
19641}
19642
19643bool SITargetLowering::needsFetchWindowAlignment(
19644 const MachineBasicBlock &MBB) const {
19645 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19646 return false;
19647 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19648 for (const MachineInstr &MI : MBB) {
19649 if (MI.isMetaInstruction())
19650 continue;
19651 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
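 // For example, an 8-byte instruction starting at byte 28 of a 32-byte
 // fetch window straddles the boundary; aligning the block to 32 bytes,
 // at the cost of at most one s_nop of padding, avoids the split.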
19652 return TII->getInstSizeInBytes(MI) > 4;
19653 }
19654 return false;
19655}
19656
19657[[maybe_unused]]
19658static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
19659 assert(N->getOpcode() == ISD::CopyFromReg);
19660 do {
19661 // Follow the chain until we find an INLINEASM node.
19662 N = N->getOperand(0).getNode();
19663 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
19664 return true;
19665 } while (N->getOpcode() == ISD::CopyFromReg);
19666 return false;
19667}
19668
19669bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
19670 FunctionLoweringInfo *FLI,
19671 UniformityInfo *UA) const {
19672 switch (N->getOpcode()) {
19673 case ISD::CopyFromReg: {
19674 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
19675 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
19676 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19677 Register Reg = R->getReg();
19678
19679 // FIXME: Why does this need to consider isLiveIn?
19680 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
19681 return !TRI->isSGPRReg(MRI, Reg);
19682
19683 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
19684 return UA->isDivergent(V);
19685
19686 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
19687 return !TRI->isSGPRReg(MRI, Reg);
19688 }
19689 case ISD::LOAD: {
19690 const LoadSDNode *L = cast<LoadSDNode>(N);
19691 unsigned AS = L->getAddressSpace();
19692 // A flat load may access private memory.
 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
19694 }
19695 case ISD::CALLSEQ_END:
19696 return true;
19697 case ISD::INTRINSIC_WO_CHAIN:
19698 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
19699 case ISD::INTRINSIC_W_CHAIN:
19700 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
19701 case AMDGPUISD::ATOMIC_CMP_SWAP:
19702 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
19703 case AMDGPUISD::BUFFER_ATOMIC_ADD:
19704 case AMDGPUISD::BUFFER_ATOMIC_SUB:
19705 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
19706 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
19707 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
19708 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
19709 case AMDGPUISD::BUFFER_ATOMIC_AND:
19710 case AMDGPUISD::BUFFER_ATOMIC_OR:
19711 case AMDGPUISD::BUFFER_ATOMIC_XOR:
19712 case AMDGPUISD::BUFFER_ATOMIC_INC:
19713 case AMDGPUISD::BUFFER_ATOMIC_DEC:
19714 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
19715 case AMDGPUISD::BUFFER_ATOMIC_FADD:
19716 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
19717 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
19718 // Target-specific read-modify-write atomics are sources of divergence.
19719 return true;
19720 default:
19721 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
19722 // Generic read-modify-write atomics are sources of divergence.
19723 return A->readMem() && A->writeMem();
19724 }
19725 return false;
19726 }
19727}
19728
19729bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
19730 EVT VT) const {
19731 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
19732 case MVT::f32:
19733 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
19734 case MVT::f64:
19735 case MVT::f16:
19736 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
19737 default:
19738 return false;
19739 }
19740}
19741
19742bool SITargetLowering::denormalsEnabledForType(
19743 LLT Ty, const MachineFunction &MF) const {
19744 switch (Ty.getScalarSizeInBits()) {
19745 case 32:
19746 return !denormalModeIsFlushAllF32(MF);
19747 case 64:
19748 case 16:
19749 return !denormalModeIsFlushAllF64F16(MF);
19750 default:
19751 return false;
19752 }
19753}
19754
19755bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
19756 const APInt &DemandedElts,
19757 const SelectionDAG &DAG,
19758 bool SNaN,
19759 unsigned Depth) const {
19760 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19761 const MachineFunction &MF = DAG.getMachineFunction();
19762 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19763
19764 if (Info->getMode().DX10Clamp)
19765 return true; // Clamped to 0.
19766 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
19767 }
19768
19769 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
19770 DAG, SNaN, Depth);
19771}
19772
19773// On older subtargets, global FP atomic instructions have a hardcoded FP mode
19774// and do not support FP32 denormals, and only support v2f16/f64 denormals.
19775static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
19776 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
19777 return true;
19778
19779 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19780 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
19781 if (DenormMode == DenormalMode::getPreserveSign())
19782 return true;
19783
19784 // TODO: Remove this.
19785 return RMW->getFunction()
19786 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
19787 .getValueAsBool();
19788}
19789
19790static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
19791 LLVMContext &Ctx = RMW->getContext();
19792 StringRef MemScope =
19793 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
19794
19795 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
19796 << "Hardware instruction generated for atomic "
19797 << RMW->getOperationName(RMW->getOperation())
19798 << " operation at memory scope " << MemScope;
19799}
19800
19801static bool isV2F16OrV2BF16(Type *Ty) {
19802 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19803 Type *EltTy = VT->getElementType();
19804 return VT->getNumElements() == 2 &&
19805 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19806 }
19807
19808 return false;
19809}
19810
19811static bool isV2F16(Type *Ty) {
19812 auto *VT = dyn_cast<FixedVectorType>(Ty);
19813 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
19814}
19815
19816static bool isV2BF16(Type *Ty) {
19817 auto *VT = dyn_cast<FixedVectorType>(Ty);
19818 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
19819}
19820
19821/// \return true if atomicrmw integer ops work for the type.
19822static bool isAtomicRMWLegalIntTy(Type *Ty) {
19823 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
19824 unsigned BW = IT->getBitWidth();
19825 return BW == 32 || BW == 64;
19826 }
19827
19828 return false;
19829}
19830
19831/// \return true if this atomicrmw xchg type can be selected.
19832static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19833 Type *Ty = RMW->getType();
19834 if (isAtomicRMWLegalIntTy(Ty))
19835 return true;
19836
19837 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
19838 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19839 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
19840 return BW == 32 || BW == 64;
19841 }
19842
19843 if (Ty->isFloatTy() || Ty->isDoubleTy())
19844 return true;
19845
19846 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19847 return VT->getNumElements() == 2 &&
19848 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19849 }
19850
19851 return false;
19852}
19853
19854/// \returns true if it's valid to emit a native instruction for \p RMW, based
19855/// on the properties of the target memory.
19856static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
19857 const AtomicRMWInst *RMW,
19858 bool HasSystemScope) {
19859 // The remote/fine-grained access logic is different from the integer
19860 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
19861 // fine-grained access does not work, even for a device local allocation.
19862 //
19863 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
19864 // allocations work.
19865 if (HasSystemScope) {
19866 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19867 RMW->hasMetadata("amdgpu.no.remote.memory"))
19868 return true;
19869 if (Subtarget.hasEmulatedSystemScopeAtomics())
19870 return true;
19871 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19872 return true;
19873
19874 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
19875}
19876
19877/// \return Action to perform on AtomicRMWInsts for integer operations.
19878static TargetLowering::AtomicExpansionKind
19879atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
19880 return isAtomicRMWLegalIntTy(RMW->getType())
19881 ? TargetLowering::AtomicExpansionKind::None
19882 : TargetLowering::AtomicExpansionKind::CmpXChg;
19883}
19884
19885/// Return if a flat address space atomicrmw can access private memory.
19886static bool flatInstrMayAccessPrivate(const Instruction *I) {
19887 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
19888 return !MD ||
19889 !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
19890}
19891
19892static TargetLowering::AtomicExpansionKind
19893getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
19894 // For GAS, lower to flat atomic.
19895 return STI.hasGloballyAddressableScratch()
19896 ? TargetLowering::AtomicExpansionKind::CustomExpand
19897 : TargetLowering::AtomicExpansionKind::NotAtomic;
19898}
19899
19900TargetLowering::AtomicExpansionKind
19901SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
19902 unsigned AS = RMW->getPointerAddressSpace();
19903 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
19904 return getPrivateAtomicExpansionKind(*getSubtarget());
19905
19906 // 64-bit flat atomics that dynamically reside in private memory will silently
19907 // be dropped.
19908 //
19909 // Note that we will emit a new copy of the original atomic in the expansion,
19910 // which will be incrementally relegalized.
19911 const DataLayout &DL = RMW->getFunction()->getDataLayout();
19912 if (AS == AMDGPUAS::FLAT_ADDRESS &&
19913 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
19914 flatInstrMayAccessPrivate(RMW))
19915 return AtomicExpansionKind::CustomExpand;
19916
19917 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
19918 OptimizationRemarkEmitter ORE(RMW->getFunction());
19919 ORE.emit([=]() {
19920 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
19921 });
19922 return Kind;
19923 };
19924
19925 auto SSID = RMW->getSyncScopeID();
19926 bool HasSystemScope =
19927 SSID == SyncScope::System ||
19928 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
19929
19930 auto Op = RMW->getOperation();
19931 switch (Op) {
19932 case AtomicRMWInst::Xchg:
19933 // PCIe supports add and xchg for system atomics.
19934 return isAtomicRMWLegalXChgTy(RMW)
19935 ? AtomicExpansionKind::None
19936 : AtomicExpansionKind::CmpXChg;
19937 case AtomicRMWInst::Add:
19938 // PCIe supports add and xchg for system atomics.
19939 return atomicSupportedIfLegalIntType(RMW);
19940 case AtomicRMWInst::Sub:
19941 case AtomicRMWInst::And:
19942 case AtomicRMWInst::Or:
19943 case AtomicRMWInst::Xor:
19944 case AtomicRMWInst::Max:
19945 case AtomicRMWInst::Min:
19946 case AtomicRMWInst::UMax:
19947 case AtomicRMWInst::UMin:
19948 case AtomicRMWInst::UIncWrap:
19949 case AtomicRMWInst::UDecWrap:
19950 case AtomicRMWInst::USubCond:
19951 case AtomicRMWInst::USubSat: {
19952 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
19953 return AtomicExpansionKind::CmpXChg;
19954 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
19955 return AtomicExpansionKind::CmpXChg;
19956 if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
19957 auto *IT = dyn_cast<IntegerType>(RMW->getType());
19958 if (!IT || IT->getBitWidth() != 32)
19959 return AtomicExpansionKind::CmpXChg;
19960 }
19961
19964 if (Subtarget->hasEmulatedSystemScopeAtomics())
19964 return atomicSupportedIfLegalIntType(RMW);
19966
19965 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
19966 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19967 // On most subtargets, for atomicrmw operations other than add/xchg,
19968 // whether or not the instructions will behave correctly depends on where
19969 // the address physically resides and what interconnect is used in the
19970 // system configuration. On some targets the instruction will nop,
19971 // and in others synchronization will only occur at degraded device scope.
19972 //
19973 // If the allocation is known local to the device, the instructions should
19974 // work correctly.
19975 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
19976 return atomicSupportedIfLegalIntType(RMW);
19977
19978 // If fine-grained remote memory works at device scope, we don't need to
19979 // do anything.
19980 if (!HasSystemScope &&
19981 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19982 return atomicSupportedIfLegalIntType(RMW);
19983
19984 // If we are targeting a remote allocated address, it depends what kind of
19985 // allocation the address belongs to.
19986 //
19987 // If the allocation is fine-grained (in host memory, or in PCIe peer
19988 // device memory), the operation will fail depending on the target.
19989 //
19990 // Note fine-grained host memory access does work on APUs or if XGMI is
19991 // used, but we do not know if we are targeting an APU or the system
19992 // configuration from the ISA version/target-cpu.
19993 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
19994 return atomicSupportedIfLegalIntType(RMW);
19995
19996 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19997 Op == AtomicRMWInst::Xor) {
19998 // Atomic sub/or/xor do not work over PCI express, but atomic add
19999 // does. InstCombine transforms these with 0 to or, so undo that.
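 // For example, "atomicrmw or ptr %p, i32 0" is turned back into
 // "atomicrmw add ptr %p, i32 0" by emitExpandAtomicRMW below, which PCIe
 // does support.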
20000 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
20001 ConstVal && ConstVal->isNullValue())
20002 return AtomicExpansionKind::Expand;
20003 }
20004
20005 // If the allocation could be in remote, fine-grained memory, the rmw
20006 // instructions may fail. cmpxchg should work, so emit that. On some
20007 // system configurations, PCIe atomics aren't supported so cmpxchg won't
20008 // even work, so you're out of luck anyway.
20009
20010 // In summary:
20011 //
20012 // Cases that may fail:
20013 // - fine-grained pinned host memory
20014 // - fine-grained migratable host memory
20015 // - fine-grained PCIe peer device
20016 //
20017 // Cases that should work, but may be treated overly conservatively.
20018 // - fine-grained host memory on an APU
20019 // - fine-grained XGMI peer device
20020 return AtomicExpansionKind::CmpXChg;
20021 }
20022
20023 return atomicSupportedIfLegalIntType(RMW);
20024 }
20025 case AtomicRMWInst::FAdd: {
20026 Type *Ty = RMW->getType();
20027
20028 // TODO: Handle REGION_ADDRESS
20029 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20030 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
20031 // is fixed to round-to-nearest-even.
20032 //
20033 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
20034 // round-to-nearest-even.
20035 //
20036 // We ignore the rounding mode problem, even in strictfp. The C++ standard
20037 // suggests it is OK if the floating-point mode may not match the calling
20038 // thread.
20039 if (Ty->isFloatTy()) {
20040 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
20041 : AtomicExpansionKind::CmpXChg;
20042 }
20043
20044 if (Ty->isDoubleTy()) {
20045 // Ignores denormal mode, but we don't consider flushing mandatory.
20046 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
20047 : AtomicExpansionKind::CmpXChg;
20048 }
20049
20050 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20051 return AtomicExpansionKind::None;
20052
20053 return AtomicExpansionKind::CmpXChg;
20054 }
20055
20056 // LDS atomics respect the denormal mode from the mode register.
20057 //
20058 // Traditionally f32 global/buffer memory atomics would unconditionally
20059 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
20060 // flush.
20061 //
20062 // On targets with flat atomic fadd, denormals would flush depending on
20063 // whether the target address resides in LDS or global memory. We consider
20064 // this flat-maybe-flush as will-flush.
20065 if (Ty->isFloatTy() &&
20066 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20067 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
20068 return AtomicExpansionKind::CmpXChg;
20069
20070 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
20071 // safe. The message phrasing also should be better.
20072 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20073 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20074 // gfx942, gfx12
20075 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20076 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20077 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
20078 // gfx90a, gfx942, gfx12
20079 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20080 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20081
20082 // gfx942, gfx12
20083 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20084 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20085 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20086 // gfx90a, gfx942, gfx12
20087 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20088 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20089
20090 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
20091 // buffer. gfx12 does have the buffer version.
20092 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20093 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20094 }
20095
20096 // global and flat atomic fadd f64: gfx90a, gfx942.
20097 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20098 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20099
20100 if (AS != AMDGPUAS::FLAT_ADDRESS) {
20101 if (Ty->isFloatTy()) {
20102 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
20103 // gfx11+.
20104 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20105 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20106 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
20107 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20108 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20109 } else {
20110 // gfx908
20111 if (RMW->use_empty() &&
20112 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20113 isV2F16(Ty))
20114 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20115 }
20116 }
20117
20118 // flat atomic fadd f32: gfx942, gfx11+.
20119 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
20120 if (Subtarget->hasFlatAtomicFaddF32Inst())
20121 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20122
20123 // If it is in flat address space, and the type is float, we will try to
20124 // expand it, if the target supports global and lds atomic fadd. The
20125 // reason we need that is, in the expansion, we emit the check of
20126 // address space. If it is in global address space, we emit the global
20127 // atomic fadd; if it is in shared address space, we emit the LDS atomic
20128 // fadd.
20129 if (Subtarget->hasLDSFPAtomicAddF32()) {
20130 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20131 return AtomicExpansionKind::Expand;
20132 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20133 return AtomicExpansionKind::Expand;
20134 }
20135 }
20136 }
20137
20138 return AtomicExpansionKind::CmpXChg;
20139 }
20140 case AtomicRMWInst::FMin:
20141 case AtomicRMWInst::FMax: {
20142 Type *Ty = RMW->getType();
20143
20144 // LDS float and double fmin/fmax were always supported.
20145 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20146 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
20147 : AtomicExpansionKind::CmpXChg;
20148 }
20149
20150 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20151 // For flat and global cases:
20152 // float, double in gfx7. Manual claims denormal support.
20153 // Removed in gfx8.
20154 // float, double restored in gfx10.
20155 // double removed again in gfx11, so only f32 for gfx11/gfx12.
20156 //
20157 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
20158 // no f32.
20159 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20160 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20161 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20162 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20163 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20164 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
20165 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20166 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20167 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20168 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20169 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20170 }
20171 }
20172
20173 return AtomicExpansionKind::CmpXChg;
20174 }
20175 case AtomicRMWInst::Nand:
20176 case AtomicRMWInst::FSub:
20177 default:
20178 return AtomicExpansionKind::CmpXChg;
20179 }
20180
20181 llvm_unreachable("covered atomicrmw op switch");
20182}
20183
20184TargetLowering::AtomicExpansionKind
20185SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20186 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
20187 ? getPrivateAtomicExpansionKind(*getSubtarget())
20188 : AtomicExpansionKind::None;
20189}
20190
20191TargetLowering::AtomicExpansionKind
20192SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20193 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
20194 ? getPrivateAtomicExpansionKind(*getSubtarget())
20195 : AtomicExpansionKind::None;
20196}
20197
20198TargetLowering::AtomicExpansionKind
20199SITargetLowering::shouldExpandAtomicCmpXchgInIR(
20200 const AtomicCmpXchgInst *CmpX) const {
20201 unsigned AddrSpace = CmpX->getPointerAddressSpace();
20202 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
20203 return getPrivateAtomicExpansionKind(*getSubtarget());
20204
20205 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
20206 return AtomicExpansionKind::None;
20207
20208 const DataLayout &DL = CmpX->getDataLayout();
20209
20210 Type *ValTy = CmpX->getNewValOperand()->getType();
20211
20212 // If a 64-bit flat atomic may alias private, we need to avoid using the
20213 // atomic in the private case.
20214 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
20215 : AtomicExpansionKind::None;
20216}
20217
20218const TargetRegisterClass *
20219SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
20220 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
20221 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20222 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20223 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20224 : &AMDGPU::SReg_32RegClass;
20225 if (!TRI->isSGPRClass(RC) && !isDivergent)
20226 return TRI->getEquivalentSGPRClass(RC);
20227 if (TRI->isSGPRClass(RC) && isDivergent) {
20228 if (Subtarget->hasGFX90AInsts())
20229 return TRI->getEquivalentAVClass(RC);
20230 return TRI->getEquivalentVGPRClass(RC);
20231 }
20232
20233 return RC;
20234}
20235
20236// FIXME: This is a workaround for DivergenceAnalysis not understanding always
20237// uniform values (as produced by the mask results of control flow intrinsics)
20238// used outside of divergent blocks. The phi users need to also be treated as
20239// always uniform.
20240//
20241// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
20242static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
20243 unsigned WaveSize) {
20244 // FIXME: We assume we never cast the mask results of a control flow
20245 // intrinsic.
20246 // Early exit if the type won't be consistent as a compile time hack.
20247 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
20248 if (!IT || IT->getBitWidth() != WaveSize)
20249 return false;
20250
20251 if (!isa<Instruction>(V))
20252 return false;
20253 if (!Visited.insert(V).second)
20254 return false;
20255 bool Result = false;
20256 for (const auto *U : V->users()) {
20257 if (const auto *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
20258 if (V == U->getOperand(1)) {
20259 switch (Intrinsic->getIntrinsicID()) {
20260 default:
20261 Result = false;
20262 break;
20263 case Intrinsic::amdgcn_if_break:
20264 case Intrinsic::amdgcn_if:
20265 case Intrinsic::amdgcn_else:
20266 Result = true;
20267 break;
20268 }
20269 }
20270 if (V == U->getOperand(0)) {
20271 switch (Intrinsic->getIntrinsicID()) {
20272 default:
20273 Result = false;
20274 break;
20275 case Intrinsic::amdgcn_end_cf:
20276 case Intrinsic::amdgcn_loop:
20277 Result = true;
20278 break;
20279 }
20280 }
20281 } else {
20282 Result = hasCFUser(U, Visited, WaveSize);
20283 }
20284 if (Result)
20285 break;
20286 }
20287 return Result;
20288}
20289
20290bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
20291 const Value *V) const {
20292 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
20293 if (CI->isInlineAsm()) {
20294 // FIXME: This cannot give a correct answer. This should only trigger in
20295 // the case where inline asm returns mixed SGPR and VGPR results, used
20296 // outside the defining block. We don't have a specific result to
20297 // consider, so this assumes if any value is SGPR, the overall register
20298 // also needs to be SGPR.
20299 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
20300 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
20301 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
20302 for (auto &TC : TargetConstraints) {
20303 if (TC.Type == InlineAsm::isOutput) {
20304 ComputeConstraintToUse(TC, SDValue());
20305 const TargetRegisterClass *RC =
20306 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
20307 TC.ConstraintVT)
20308 .second;
20309 if (RC && SIRI->isSGPRClass(RC))
20310 return true;
20311 }
20312 }
20313 }
20314 }
20315 SmallPtrSet<const Value *, 16> Visited;
20316 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20317}
20318
20319bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
20320 for (SDUse &Use : N->uses()) {
20321 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
20322 if (getBasePtrIndex(M) == Use.getOperandNo())
20323 return true;
20324 }
20325 }
20326 return false;
20327}
20328
20329bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
20330 SDValue N1) const {
20331 if (!N0.hasOneUse())
20332 return false;
20333 // Take care of the opportunity to keep N0 uniform
20334 if (N0->isDivergent() || !N1->isDivergent())
20335 return true;
20336 // Check if we have a good chance to form the memory access pattern with the
20337 // base and offset
20338 return (DAG.isBaseWithConstantOffset(N0) &&
20339 hasMemSDNodeUser(*N0->user_begin()));
20340}
20341
20342bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
20343 Register N0, Register N1) const {
20344 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
20345}
20346
20347MachineMemOperand::Flags
20348SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
20349 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
20350 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
20351 if (I.getMetadata("amdgpu.noclobber"))
20352 Flags |= MONoClobber;
20353 if (I.getMetadata("amdgpu.last.use"))
20354 Flags |= MOLastUse;
20355 return Flags;
20356}
20357
20358void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
20359 Instruction *AI) const {
20360 // Given: atomicrmw fadd ptr %addr, float %val ordering
20361 //
20362 // With this expansion we produce the following code:
20363 // [...]
20364 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
20365 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
20366 //
20367 // atomicrmw.shared:
20368 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
20369 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
20370 // float %val ordering
20371 // br label %atomicrmw.phi
20372 //
20373 // atomicrmw.check.private:
20374 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
20375 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
20376 //
20377 // atomicrmw.private:
20378 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
20379 // %loaded.private = load float, ptr addrspace(5) %cast.private
20380 // %val.new = fadd float %loaded.private, %val
20381 // store float %val.new, ptr addrspace(5) %cast.private
20382 // br label %atomicrmw.phi
20383 //
20384 // atomicrmw.global:
20385 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
20386 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
20387 // float %val ordering
20388 // br label %atomicrmw.phi
20389 //
20390 // atomicrmw.phi:
20391 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
20392 // [ %loaded.private, %atomicrmw.private ],
20393 // [ %loaded.global, %atomicrmw.global ]
20394 // br label %atomicrmw.end
20395 //
20396 // atomicrmw.end:
20397 // [...]
20398 //
20399 //
20400 // For 64-bit atomics which may reside in private memory, we perform a simpler
20401 // version that only inserts the private check, and uses the flat operation.
20402
20403 IRBuilder<> Builder(AI);
20404 LLVMContext &Ctx = Builder.getContext();
20405
20406 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
20407 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
20408 : AtomicCmpXchgInst::getPointerOperandIndex();
20409 Value *Addr = AI->getOperand(PtrOpIdx);
20410
20411 /// TODO: Only need to check private, then emit flat-known-not private (no
20412 /// need for shared block, or cast to global).
20413 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
20414
20415 Align Alignment;
20416 if (RMW)
20417 Alignment = RMW->getAlign();
20418 else if (CX)
20419 Alignment = CX->getAlign();
20420 else
20421 llvm_unreachable("unhandled atomic operation");
20422
20423 // FullFlatEmulation is true if we need to issue the private, shared, and
20424 // global cases.
20425 //
20426 // If this is false, we are only dealing with the flat-targeting-private case,
20427 // where we only insert a check for private and still use the flat instruction
20428 // for global and shared.
20429
20430 bool FullFlatEmulation =
20431 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
20432 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20433 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20434 RMW->getType()->isDoubleTy()));
20435
20436 // If the return value isn't used, do not introduce a false use in the phi.
20437 bool ReturnValueIsUsed = !AI->use_empty();
20438
20439 BasicBlock *BB = Builder.GetInsertBlock();
20440 Function *F = BB->getParent();
20441 BasicBlock *ExitBB =
20442 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
20443 BasicBlock *SharedBB = nullptr;
20444
20445 BasicBlock *CheckPrivateBB = BB;
20446 if (FullFlatEmulation) {
20447 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
20448 CheckPrivateBB =
20449 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
20450 }
20451
20452 BasicBlock *PrivateBB =
20453 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
20454 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
20455 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
20456
20457 std::prev(BB->end())->eraseFromParent();
20458 Builder.SetInsertPoint(BB);
20459
20460 Value *LoadedShared = nullptr;
20461 if (FullFlatEmulation) {
20462 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20463 {Addr}, nullptr, "is.shared");
20464 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20465 Builder.SetInsertPoint(SharedBB);
20466 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20467 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
20468
20469 Instruction *Clone = AI->clone();
20470 Clone->insertInto(SharedBB, SharedBB->end());
20471 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
20472 LoadedShared = Clone;
20473
20474 Builder.CreateBr(PhiBB);
20475 Builder.SetInsertPoint(CheckPrivateBB);
20476 }
20477
20478 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20479 {Addr}, nullptr, "is.private");
20480 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20481
20482 Builder.SetInsertPoint(PrivateBB);
20483
20484 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20485 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
20486
20487 Value *LoadedPrivate;
20488 if (RMW) {
20489 LoadedPrivate = Builder.CreateAlignedLoad(
20490 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
20491
20492 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
20493 LoadedPrivate, RMW->getValOperand());
20494
20495 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20496 } else {
20497 auto [ResultLoad, Equal] =
20498 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
20499 CX->getNewValOperand(), CX->getAlign());
20500
20501 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
20502 ResultLoad, 0);
20503 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20504 }
20505
20506 Builder.CreateBr(PhiBB);
20507
20508 Builder.SetInsertPoint(GlobalBB);
20509
20510 // Continue using a flat instruction if we only emitted the check for private.
20511 Instruction *LoadedGlobal = AI;
20512 if (FullFlatEmulation) {
20513 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20514 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
20515 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
20516 }
20517
20518 AI->removeFromParent();
20519 AI->insertInto(GlobalBB, GlobalBB->end());
20520
20521 // The new atomicrmw may go through another round of legalization later.
20522 if (!FullFlatEmulation) {
20523 // We inserted the runtime check already, make sure we do not try to
20524 // re-expand this.
20525 // TODO: Should union with any existing metadata.
20526 MDBuilder MDB(F->getContext());
20527 MDNode *RangeNotPrivate =
20528 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
20529 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
20530 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
20531 RangeNotPrivate);
20532 }
20533
20534 Builder.CreateBr(PhiBB);
20535
20536 Builder.SetInsertPoint(PhiBB);
20537
20538 if (ReturnValueIsUsed) {
20539 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
20540 AI->replaceAllUsesWith(Loaded);
20541 if (FullFlatEmulation)
20542 Loaded->addIncoming(LoadedShared, SharedBB);
20543 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20544 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20545 Loaded->takeName(AI);
20546 }
20547
20548 Builder.CreateBr(ExitBB);
20549}
20550
20551static void convertScratchAtomicToFlatAtomic(Instruction *I,
20552 unsigned PtrOpIdx) {
20553 Value *PtrOp = I->getOperand(PtrOpIdx);
20554 assert(PtrOp->getType()->getPointerAddressSpace() ==
20555 AMDGPUAS::PRIVATE_ADDRESS);
20556
20557 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
20558 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
20559 I->getIterator());
20560 I->setOperand(PtrOpIdx, ASCast);
20561}
20562
20563void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
20564 AtomicRMWInst::BinOp Op = AI->getOperation();
20565
20566 if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20567 return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
20568
20569 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
20570 Op == AtomicRMWInst::Xor) {
20571 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
20572 ConstVal && ConstVal->isNullValue()) {
20573 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
20574 AI->setOperation(AtomicRMWInst::Add);
20575
20576 // We may still need the private-alias-flat handling below.
20577
20578 // TODO: Skip this for cases where we cannot access remote memory.
20579 }
20580 }
20581
20582 // The non-flat expansions should only perform the de-canonicalization of
20583 // identity values.
20584 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
20585 return;
20586
20587 emitExpandAtomicAddrSpacePredicate(AI);
20588}
20589
20590void SITargetLowering::emitExpandAtomicCmpXchg(
20591 AtomicCmpXchgInst *CI) const {
20592 if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20593 return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
20594 emitExpandAtomicAddrSpacePredicate(CI);
20595}
20596
20597void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
20598 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20599 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
20600
20601 llvm_unreachable(
20602 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20603}
20604
20605void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
20606 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20607 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
20608
20609 llvm_unreachable(
20610 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20611}
20612
20613LoadInst *
20614SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20615 IRBuilder<> Builder(AI);
20616 auto Order = AI->getOrdering();
20617
20618 // The optimization removes the store aspect of the atomicrmw; the cache
20619 // must therefore be flushed if the atomic ordering had release semantics.
20620 // This does not necessarily require a fence; a release fence just happens
20621 // to perform that flush. Avoid replacing an atomicrmw with release semantics.
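 // For example, "atomicrmw or ptr %p, i32 0 acquire" can become an acquire
 // atomic load, while the seq_cst form is kept because dropping the store
 // half would also drop the release-ordering flush.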
20622 if (isReleaseOrStronger(Order))
20623 return nullptr;
20624
20625 LoadInst *LI = Builder.CreateAlignedLoad(
20626 AI->getType(), AI->getPointerOperand(), AI->getAlign());
20627 LI->setAtomic(Order, AI->getSyncScopeID());
20628 LI->copyMetadata(*AI);
20629 LI->takeName(AI);
20630 AI->replaceAllUsesWith(LI);
20631 AI->eraseFromParent();
20632 return LI;
20633}
static bool isMul(MachineInstr *MI)
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
static bool isCtlzOpc(unsigned Opc)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
@ DEFAULT
Default weight is used in cases when there is no dedicated execution weight set.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1269
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1266
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static MachineBasicBlock * expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static std::tuple< unsigned, unsigned > getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static std::pair< Register, Register > ExtractSubRegs(MachineInstr &MI, MachineOperand &Op, const TargetRegisterClass *SrcRC, const GCNSubtarget &ST, MachineRegisterInfo &MRI)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
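A helper with this signature usually just masks away everything above the low Size bits; a minimal sketch, assuming Size is the number of meaningful bits:

static uint64_t clearUnusedBitsSketch(uint64_t Val, unsigned Size) {
  // maskTrailingOnes<uint64_t>(Size) is all-ones in the low Size bits
  // (llvm/Support/MathExtras.h); Size == 64 leaves Val unchanged.
  return Val & maskTrailingOnes<uint64_t>(Size);
}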
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
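The identity of a wave reduction is the lane value that leaves the combining operator's result unchanged. A hedged sketch with an illustrative enum (the real helper switches over AMDGPU pseudo opcodes), assuming 32-bit lanes:

enum class WaveRedOp { Add, Or, Xor, UMax, And, UMin, SMax, SMin };

static uint32_t reductionIdentitySketch(WaveRedOp Op) {
  switch (Op) {
  case WaveRedOp::Add:
  case WaveRedOp::Or:
  case WaveRedOp::Xor:
  case WaveRedOp::UMax:
    return 0;                      // x+0 == x|0 == x^0 == umax(x,0) == x
  case WaveRedOp::And:
  case WaveRedOp::UMin:
    return ~0u;                    // x & ~0 == umin(x, UINT32_MAX) == x
  case WaveRedOp::SMax:
    return (uint32_t)INT32_MIN;
  case WaveRedOp::SMin:
    return (uint32_t)INT32_MAX;
  }
  llvm_unreachable("covered switch");
}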
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics in the form of counters to the user.
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
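On AMDGPU the conventional way to do this split is a bitcast through v2i32 followed by two element extracts; a sketch consistent with that pattern (the Sketch name and the explicit SDLoc parameter are illustrative):

static std::pair<SDValue, SDValue>
split64BitValueSketch(SDValue Op, SelectionDAG &DAG, const SDLoc &SL) {
  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec,
                           DAG.getConstant(0, SL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec,
                           DAG.getConstant(1, SL, MVT::i32));
  return {Lo, Hi};
}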
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
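Both this helper and numBitsSigned (listed above) can be expressed directly on top of the generic DAG analyses; a plausible sketch, hedged in that the file's versions may differ in detail:

static unsigned numBitsUnsignedSketch(SDValue Op, SelectionDAG &DAG) {
  // Highest possibly-set bit, per known-bits analysis.
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}
static unsigned numBitsSignedSketch(SDValue Op, SelectionDAG &DAG) {
  // Bits needed to represent the value in two's complement.
  return DAG.ComputeMaxSignificantBits(Op);
}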
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1175
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5890
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1564
bool isNegative() const
Definition APFloat.h:1516
bool isNormal() const
Definition APFloat.h:1520
APInt bitcastToAPInt() const
Definition APFloat.h:1408
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1193
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1134
bool isInfinity() const
Definition APFloat.h:1513
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
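The bit-block helpers above compose naturally; a quick self-contained illustration:

APInt K(64, 0);
K.setHighBits(16);                          // bits [48, 64) become 1
K.setBitsFrom(32);                          // bits [32, 64) become 1
APInt Field = APInt::getBitsSet(64, 8, 24); // exactly bits [8, 24) set
APInt Top = APInt::getHighBitsSet(64, 4);   // exactly bits [60, 64) set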
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:342
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v); usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
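For the two wrapping operations above, a plain-C++ restatement of the LangRef semantics may help; Old is the prior memory contents and V the operand:

// uinc_wrap: increment, wrapping to 0 once Old reaches the cap V.
uint32_t uincWrap(uint32_t Old, uint32_t V) { return Old >= V ? 0 : Old + 1; }
// udec_wrap: decrement, resetting to V at zero or when Old is out of range.
uint32_t udecWrap(uint32_t Old, uint32_t V) {
  return (Old == 0 || Old > V) ? V : Old - 1;
}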
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
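Putting the two factory functions together, a hypothetical byte-tracking step during a permute combine might look like this (Op stands for some already-matched SDValue):

// Byte 0 of the result is byte 2 of Op; byte 1 is a known zero.
ByteProvider<SDValue> B0 =
    ByteProvider<SDValue>::getSrc(Op, /*ByteOffset=*/2, /*VectorOffset=*/0);
ByteProvider<SDValue> B1 = ByteProvider<SDValue>::getConstantZero();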
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
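A hedged sketch of how these CCState entry points combine during argument lowering; CCInfo and the particular SGPR are assumptions for illustration:

// Claim a specific register if free, otherwise fall back to stack space.
if (!CCInfo.isAllocated(AMDGPU::SGPR4))
  (void)CCInfo.AllocateReg(AMDGPU::SGPR4); // returns 0 if allocation fails
int64_t Offset = CCInfo.AllocateStack(/*Size=*/16, Align(8));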
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:216
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding to Vreg.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:804
Argument * getArg(unsigned i) const
Definition Function.h:886
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2812
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
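These LLT factories compose; a small self-contained illustration (fixed_vector is the standard companion factory, not listed above):

LLT S32   = LLT::scalar(32);
LLT P1    = LLT::pointer(1, 64);          // 64-bit pointer in addrspace(1)
LLT V2S16 = LLT::fixed_vector(2, 16);
LLT V2S32 = V2S16.changeElementSize(32);  // still 2 elements, now 32-bit
assert(V2S32.getSizeInBits() == 64);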
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
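Typical use: build the metadata once and hang it off a memory instruction. LI and Ctx are assumed to be a LoadInst* and an LLVMContext in scope:

MDBuilder MDB(Ctx);
// The loaded value is known to lie in [0, 1024).
MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
LI->setMetadata(LLVMContext::MD_range, Range);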
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
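A few of the MVT factory/query pairs above, exercised together:

MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);
assert(V4F32.getScalarType() == MVT::f32);
assert(V4F32.getVectorNumElements() == 4);
assert(V4F32.getSizeInBits() == 128);
assert(V4F32.getStoreSize() == 16);       // bytes written by a store
assert(MVT::getIntegerVT(64) == MVT::i64);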
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
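A hedged sketch of the create/define/replace pattern these methods support, as it commonly appears in a custom-inserter expansion; MBB, MI, DL, TII and OldReg are assumed to be in scope:

Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Tmp).addImm(0);
MRI.replaceRegWith(OldReg, Tmp);  // rewrite every use of OldReg
MRI.clearKillFlags(Tmp);          // uses moved, so old kill flags are stale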
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:252
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:246
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:249
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N may be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...