LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p OpName, or if both nodes do not have this operand.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
// File-local helper that reports whether \p MI is a candidate for
// rematerialization.
// NOTE(review): this listing is missing the source lines between the opening
// brace and the first `return true` (original lines 114-116); the condition
// that guards that early return is not visible here and must be checked
// against the real file.
112static bool canRemat(const MachineInstr &MI) {
113
117 return true;
118
 // SMRD instructions qualify only when they carry at least one memory operand
 // and every memory operand is an invariant load.
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
 // Anything else is not considered rematerializable by this helper.
126 return false;
127}
128
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // This differs from the generic method, which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
179}
180
182 MachineBasicBlock *SuccToSinkTo,
183 MachineCycleInfo *CI) const {
184 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
185 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
186 return true;
187
188 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
189 // Check if sinking of MI would create temporal divergent use.
190 for (auto Op : MI.uses()) {
191 if (Op.isReg() && Op.getReg().isVirtual() &&
192 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
193 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
194
195 // SgprDef defined inside cycle
196 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
197 if (FromCycle == nullptr)
198 continue;
199
200 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
201 // Check if there is a FromCycle that contains SgprDef's basic block but
202 // does not contain SuccToSinkTo and also has divergent exit condition.
203 while (FromCycle && !FromCycle->contains(ToCycle)) {
205 FromCycle->getExitingBlocks(ExitingBlocks);
206
207 // FromCycle has divergent exit condition.
208 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
209 if (hasDivergentBranch(ExitingBlock))
210 return false;
211 }
212
213 FromCycle = FromCycle->getParentCycle();
214 }
215 }
216 }
217
218 return true;
219}
220
222 int64_t &Offset0,
223 int64_t &Offset1) const {
224 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
225 return false;
226
227 unsigned Opc0 = Load0->getMachineOpcode();
228 unsigned Opc1 = Load1->getMachineOpcode();
229
230 // Make sure both are actually loads.
231 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
232 return false;
233
234 // A mayLoad instruction without a def is not a load. Likely a prefetch.
235 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
236 return false;
237
238 if (isDS(Opc0) && isDS(Opc1)) {
239
240 // FIXME: Handle this case:
241 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
242 return false;
243
244 // Check base reg.
245 if (Load0->getOperand(0) != Load1->getOperand(0))
246 return false;
247
248 // Skip read2 / write2 variants for simplicity.
249 // TODO: We should report true if the used offsets are adjacent (excluding
250 // st64 versions).
251 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
252 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
253 if (Offset0Idx == -1 || Offset1Idx == -1)
254 return false;
255
256 // XXX - be careful of dataless loads
257 // getNamedOperandIdx returns the index for MachineInstrs. Since they
258 // include the outputs in the operand list, but SDNodes don't, we need to
259 // subtract the number of defs from the index.
260 Offset0Idx -= get(Opc0).NumDefs;
261 Offset1Idx -= get(Opc1).NumDefs;
262 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
263 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
264 return true;
265 }
266
267 if (isSMRD(Opc0) && isSMRD(Opc1)) {
268 // Skip time and cache invalidation instructions.
269 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
270 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
271 return false;
272
273 unsigned NumOps = getNumOperandsNoGlue(Load0);
274 if (NumOps != getNumOperandsNoGlue(Load1))
275 return false;
276
277 // Check base reg.
278 if (Load0->getOperand(0) != Load1->getOperand(0))
279 return false;
280
281 // Match register offsets, if both register and immediate offsets present.
282 assert(NumOps == 4 || NumOps == 5);
283 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
284 return false;
285
286 const ConstantSDNode *Load0Offset =
288 const ConstantSDNode *Load1Offset =
290
291 if (!Load0Offset || !Load1Offset)
292 return false;
293
294 Offset0 = Load0Offset->getZExtValue();
295 Offset1 = Load1Offset->getZExtValue();
296 return true;
297 }
298
299 // MUBUF and MTBUF can access the same addresses.
300 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
301
302 // MUBUF and MTBUF have vaddr at different indices.
303 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
304 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
306 return false;
307
308 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
309 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
310
311 if (OffIdx0 == -1 || OffIdx1 == -1)
312 return false;
313
314 // getNamedOperandIdx returns the index for MachineInstrs. Since they
315 // include the outputs in the operand list, but SDNodes don't, we need to
316 // subtract the number of defs from the index.
317 OffIdx0 -= get(Opc0).NumDefs;
318 OffIdx1 -= get(Opc1).NumDefs;
319
320 SDValue Off0 = Load0->getOperand(OffIdx0);
321 SDValue Off1 = Load1->getOperand(OffIdx1);
322
323 // The offset might be a FrameIndexSDNode.
324 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
325 return false;
326
327 Offset0 = Off0->getAsZExtVal();
328 Offset1 = Off1->getAsZExtVal();
329 return true;
330 }
331
332 return false;
333}
334
335static bool isStride64(unsigned Opc) {
336 switch (Opc) {
337 case AMDGPU::DS_READ2ST64_B32:
338 case AMDGPU::DS_READ2ST64_B64:
339 case AMDGPU::DS_WRITE2ST64_B32:
340 case AMDGPU::DS_WRITE2ST64_B64:
341 return true;
342 default:
343 return false;
344 }
345}
346
349 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
350 const TargetRegisterInfo *TRI) const {
351 if (!LdSt.mayLoadOrStore())
352 return false;
353
354 unsigned Opc = LdSt.getOpcode();
355 OffsetIsScalable = false;
356 const MachineOperand *BaseOp, *OffsetOp;
357 int DataOpIdx;
358
359 if (isDS(LdSt)) {
360 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
361 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
362 if (OffsetOp) {
363 // Normal, single offset LDS instruction.
364 if (!BaseOp) {
365 // DS_CONSUME/DS_APPEND use M0 for the base address.
366 // TODO: find the implicit use operand for M0 and use that as BaseOp?
367 return false;
368 }
369 BaseOps.push_back(BaseOp);
370 Offset = OffsetOp->getImm();
371 // Get appropriate operand, and compute width accordingly.
372 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
373 if (DataOpIdx == -1)
374 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
375 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
376 Width = LocationSize::precise(64);
377 else
378 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
379 } else {
380 // The 2 offset instructions use offset0 and offset1 instead. We can treat
381 // these as a load with a single offset if the 2 offsets are consecutive.
382 // We will use this for some partially aligned loads.
383 const MachineOperand *Offset0Op =
384 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
385 const MachineOperand *Offset1Op =
386 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
387
388 unsigned Offset0 = Offset0Op->getImm() & 0xff;
389 unsigned Offset1 = Offset1Op->getImm() & 0xff;
390 if (Offset0 + 1 != Offset1)
391 return false;
392
393 // Each of these offsets is in element sized units, so we need to convert
394 // to bytes of the individual reads.
395
396 unsigned EltSize;
397 if (LdSt.mayLoad())
398 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
399 else {
400 assert(LdSt.mayStore());
401 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
402 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
403 }
404
405 if (isStride64(Opc))
406 EltSize *= 64;
407
408 BaseOps.push_back(BaseOp);
409 Offset = EltSize * Offset0;
410 // Get appropriate operand(s), and compute width accordingly.
411 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
412 if (DataOpIdx == -1) {
413 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
414 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
415 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
416 Width = LocationSize::precise(
417 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
418 } else {
419 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
420 }
421 }
422 return true;
423 }
424
425 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
426 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
427 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
428 return false;
429 BaseOps.push_back(RSrc);
430 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
431 if (BaseOp && !BaseOp->isFI())
432 BaseOps.push_back(BaseOp);
433 const MachineOperand *OffsetImm =
434 getNamedOperand(LdSt, AMDGPU::OpName::offset);
435 Offset = OffsetImm->getImm();
436 const MachineOperand *SOffset =
437 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
438 if (SOffset) {
439 if (SOffset->isReg())
440 BaseOps.push_back(SOffset);
441 else
442 Offset += SOffset->getImm();
443 }
444 // Get appropriate operand, and compute width accordingly.
445 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
446 if (DataOpIdx == -1)
447 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
448 if (DataOpIdx == -1) // LDS DMA
449 return false;
450 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
451 return true;
452 }
453
454 if (isImage(LdSt)) {
455 auto RsrcOpName =
456 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
457 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
458 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
459 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
460 if (VAddr0Idx >= 0) {
461 // GFX10 possible NSA encoding.
462 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
463 BaseOps.push_back(&LdSt.getOperand(I));
464 } else {
465 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
466 }
467 Offset = 0;
468 // Get appropriate operand, and compute width accordingly.
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1)
471 return false; // no return sampler
472 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
473 return true;
474 }
475
476 if (isSMRD(LdSt)) {
477 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
478 if (!BaseOp) // e.g. S_MEMTIME
479 return false;
480 BaseOps.push_back(BaseOp);
481 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
482 Offset = OffsetOp ? OffsetOp->getImm() : 0;
483 // Get appropriate operand, and compute width accordingly.
484 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
485 if (DataOpIdx == -1)
486 return false;
487 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
488 return true;
489 }
490
491 if (isFLAT(LdSt)) {
492 // Instructions have either vaddr or saddr or both or none.
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
494 if (BaseOp)
495 BaseOps.push_back(BaseOp);
496 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
497 if (BaseOp)
498 BaseOps.push_back(BaseOp);
499 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
502 if (DataOpIdx == -1)
503 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
504 if (DataOpIdx == -1) // LDS DMA
505 return false;
506 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
507 return true;
508 }
509
510 return false;
511}
512
// File-local helper that decides whether two memory instructions, \p MI1 and
// \p MI2, address memory through the same base pointer.
//
// Returns true when the first base operands are identical, or when both
// instructions have exactly one memory operand, those operands share an
// address space, and their IR values resolve to the same non-undef underlying
// object. Returns false otherwise.
//
// NOTE(review): the parameter lines that declare BaseOps1 and BaseOps2
// (original lines 514 and 516) are missing from this listing; from their use
// below they are containers of operand pointers with a front() element --
// confirm the exact types against the real file.
513static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
515 const MachineInstr &MI2,
517 // Only examine the first "base" operand of each instruction, on the
518 // assumption that it represents the real base address of the memory access.
519 // Other operands are typically offsets or indices from this base address.
520 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
521 return true;
522
 // The fallback below compares IR-level values, which is only attempted when
 // each instruction has exactly one memory operand.
523 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
524 return false;
525
526 auto *MO1 = *MI1.memoperands_begin();
527 auto *MO2 = *MI2.memoperands_begin();
 // Accesses in different address spaces are never treated as the same base.
528 if (MO1->getAddrSpace() != MO2->getAddrSpace())
529 return false;
530
531 const auto *Base1 = MO1->getValue();
532 const auto *Base2 = MO2->getValue();
 // A memory operand without an IR value gives nothing to compare.
533 if (!Base1 || !Base2)
534 return false;
535 Base1 = getUnderlyingObject(Base1);
536 Base2 = getUnderlyingObject(Base2);
537
 // Undef underlying objects are rejected even if the two pointers compare
 // equal.
538 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
539 return false;
540
541 return Base1 == Base2;
542}
543
545 int64_t Offset1, bool OffsetIsScalable1,
547 int64_t Offset2, bool OffsetIsScalable2,
548 unsigned ClusterSize,
549 unsigned NumBytes) const {
550 // If the mem ops (to be clustered) do not have the same base ptr, then they
551 // should not be clustered
552 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
553 if (!BaseOps1.empty() && !BaseOps2.empty()) {
554 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
555 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
556 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
557 return false;
558
559 const SIMachineFunctionInfo *MFI =
560 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
561 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on an average, the number of DWORDS
568 // loaded together by all clustered mem ops should not exceed
569 // MaxMemoryClusterDWords. This is an empirical value based on certain
570 // observations and performance related experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize` when
574 // MaxMemoryClusterDWords is 8.
575 //
576 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
577 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
578 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
579 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
580 // (5) LoadSize >= 17: do not cluster
581 const unsigned LoadSize = NumBytes / ClusterSize;
582 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
583 return NumDWords <= MaxMemoryClusterDWords;
584}
585
586// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
587// the first 16 loads will be interleaved with the stores, and the next 16 will
588// be clustered as expected. It should really split into 2 16 store batches.
589//
590// Loads are clustered until this returns false, rather than trying to schedule
591// groups of stores. This also means we have to deal with saying different
592// address space loads should be clustered, and ones which might cause bank
593// conflicts.
594//
595// This might be deprecated so it might not be worth that much effort to fix.
597 int64_t Offset0, int64_t Offset1,
598 unsigned NumLoads) const {
599 assert(Offset1 > Offset0 &&
600 "Second offset should be larger than first offset!");
601 // If we have less than 16 loads in a row, and the offsets are within 64
602 // bytes, then schedule together.
603
604 // A cacheline is 64 bytes (for global memory).
605 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
606}
607
610 const DebugLoc &DL, MCRegister DestReg,
611 MCRegister SrcReg, bool KillSrc,
612 const char *Msg = "illegal VGPR to SGPR copy") {
613 MachineFunction *MF = MBB.getParent();
614
616 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
617
618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
619 .addReg(SrcReg, getKillRegState(KillSrc));
620}
621
622/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
623/// possible to have a direct copy in these cases on GFX908, so an intermediate
624/// VGPR copy is required.
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 RegScavenger &RS, bool RegsOverlap,
631 Register ImpDefSuperReg = Register(),
632 Register ImpUseSuperReg = Register()) {
633 assert((TII.getSubtarget().hasMAIInsts() &&
634 !TII.getSubtarget().hasGFX90AInsts()) &&
635 "Expected GFX908 subtarget.");
636
637 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
638 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
639 "Source register of the copy should be either an SGPR or an AGPR.");
640
641 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
642 "Destination register of the copy should be an AGPR.");
643
644 const SIRegisterInfo &RI = TII.getRegisterInfo();
645
646 // First try to find defining accvgpr_write to avoid temporary registers.
647 // In the case of copies of overlapping AGPRs, we conservatively do not
648 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
649 // an accvgpr_write used for this same copy due to implicit-defs
650 if (!RegsOverlap) {
651 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
652 --Def;
653
654 if (!Def->modifiesRegister(SrcReg, &RI))
655 continue;
656
657 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
658 Def->getOperand(0).getReg() != SrcReg)
659 break;
660
661 MachineOperand &DefOp = Def->getOperand(1);
662 assert(DefOp.isReg() || DefOp.isImm());
663
664 if (DefOp.isReg()) {
665 bool SafeToPropagate = true;
666 // Check that register source operand is not clobbered before MI.
667 // Immediate operands are always safe to propagate.
668 for (auto I = Def; I != MI && SafeToPropagate; ++I)
669 if (I->modifiesRegister(DefOp.getReg(), &RI))
670 SafeToPropagate = false;
671
672 if (!SafeToPropagate)
673 break;
674
675 for (auto I = Def; I != MI; ++I)
676 I->clearRegisterKills(DefOp.getReg(), &RI);
677 }
678
679 MachineInstrBuilder Builder =
680 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
681 .add(DefOp);
682 if (ImpDefSuperReg)
683 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
684
685 if (ImpUseSuperReg) {
686 Builder.addReg(ImpUseSuperReg,
688 }
689
690 return;
691 }
692 }
693
694 RS.enterBasicBlockEnd(MBB);
695 RS.backward(std::next(MI));
696
697 // Ideally we want to have three registers for a long reg_sequence copy
698 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
699 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
700 *MBB.getParent());
701
702 // Registers in the sequence are allocated contiguously so we can just
703 // use register number to pick one of three round-robin temps.
704 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
705 Register Tmp =
706 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
707 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
708 "VGPR used for an intermediate copy should have been reserved.");
709
710 // Only loop through if there are any free registers left. We don't want to
711 // spill.
712 while (RegNo--) {
713 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
714 /* RestoreAfter */ false, 0,
715 /* AllowSpill */ false);
716 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
717 break;
718 Tmp = Tmp2;
719 RS.setRegUsed(Tmp);
720 }
721
722 // Insert copy to temporary VGPR.
723 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
724 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
725 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
726 } else {
727 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
728 }
729
730 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
731 .addReg(SrcReg, getKillRegState(KillSrc));
732 if (ImpUseSuperReg) {
733 UseBuilder.addReg(ImpUseSuperReg,
735 }
736
737 MachineInstrBuilder DefBuilder
738 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
739 .addReg(Tmp, RegState::Kill);
740
741 if (ImpDefSuperReg)
742 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743}
744
747 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
748 const TargetRegisterClass *RC, bool Forward) {
749 const SIRegisterInfo &RI = TII.getRegisterInfo();
750 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
752 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
753
754 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
755 int16_t SubIdx = BaseIndices[Idx];
756 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
757 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
758 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
759 unsigned Opcode = AMDGPU::S_MOV_B32;
760
761 // Is SGPR aligned? If so try to combine with next.
762 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
763 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
764 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
765 // Can use SGPR64 copy
766 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
767 SubIdx = RI.getSubRegFromChannel(Channel, 2);
768 DestSubReg = RI.getSubReg(DestReg, SubIdx);
769 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
770 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
771 Opcode = AMDGPU::S_MOV_B64;
772 Idx++;
773 }
774
775 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
776 .addReg(SrcSubReg)
777 .addReg(SrcReg, RegState::Implicit);
778
779 if (!FirstMI)
780 FirstMI = LastMI;
781
782 if (!Forward)
783 I--;
784 }
785
786 assert(FirstMI && LastMI);
787 if (!Forward)
788 std::swap(FirstMI, LastMI);
789
790 FirstMI->addOperand(
791 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
792
793 if (KillSrc)
794 LastMI->addRegisterKilled(SrcReg, &RI);
795}
796
799 const DebugLoc &DL, Register DestReg,
800 Register SrcReg, bool KillSrc, bool RenamableDest,
801 bool RenamableSrc) const {
802 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
803 unsigned Size = RI.getRegSizeInBits(*RC);
804 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
805 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
806
807 // The rest of copyPhysReg assumes Src and Dst size are the same size.
808 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
809 // we remove Fix16BitCopies and this code block?
810 if (Fix16BitCopies) {
811 if (((Size == 16) != (SrcSize == 16))) {
812 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
813 assert(ST.useRealTrue16Insts());
814 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
815 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
816 RegToFix = SubReg;
817
818 if (DestReg == SrcReg) {
819 // Identity copy. Insert empty bundle since ExpandPostRA expects an
820 // instruction here.
821 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
822 return;
823 }
824 RC = RI.getPhysRegBaseClass(DestReg);
825 Size = RI.getRegSizeInBits(*RC);
826 SrcRC = RI.getPhysRegBaseClass(SrcReg);
827 SrcSize = RI.getRegSizeInBits(*SrcRC);
828 }
829 }
830
831 if (RC == &AMDGPU::VGPR_32RegClass) {
832 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
833 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
834 AMDGPU::AGPR_32RegClass.contains(SrcReg));
835 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
836 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
837 BuildMI(MBB, MI, DL, get(Opc), DestReg)
838 .addReg(SrcReg, getKillRegState(KillSrc));
839 return;
840 }
841
842 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
843 RC == &AMDGPU::SReg_32RegClass) {
844 if (SrcReg == AMDGPU::SCC) {
845 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
846 .addImm(1)
847 .addImm(0);
848 return;
849 }
850
851 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
852 if (DestReg == AMDGPU::VCC_LO) {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 return;
859 }
860
861 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
862 return;
863 }
864
865 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
866 .addReg(SrcReg, getKillRegState(KillSrc));
867 return;
868 }
869
870 if (RC == &AMDGPU::SReg_64RegClass) {
871 if (SrcReg == AMDGPU::SCC) {
872 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
873 .addImm(1)
874 .addImm(0);
875 return;
876 }
877
878 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
879 if (DestReg == AMDGPU::VCC) {
880 // FIXME: Hack until VReg_1 removed.
881 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
882 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
883 .addImm(0)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
889 return;
890 }
891
892 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
893 .addReg(SrcReg, getKillRegState(KillSrc));
894 return;
895 }
896
897 if (DestReg == AMDGPU::SCC) {
898 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
899 // but SelectionDAG emits such copies for i1 sources.
900 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
901 // This copy can only be produced by patterns
902 // with explicit SCC, which are known to be enabled
903 // only for subtargets with S_CMP_LG_U64 present.
904 assert(ST.hasScalarCompareEq64());
905 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
906 .addReg(SrcReg, getKillRegState(KillSrc))
907 .addImm(0);
908 } else {
909 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
911 .addReg(SrcReg, getKillRegState(KillSrc))
912 .addImm(0);
913 }
914
915 return;
916 }
917
918 if (RC == &AMDGPU::AGPR_32RegClass) {
919 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
920 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
921 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
927 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
928 .addReg(SrcReg, getKillRegState(KillSrc));
929 return;
930 }
931
932 // FIXME: Pass should maintain scavenger to avoid scan through the block on
933 // every AGPR spill.
934 RegScavenger RS;
935 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
936 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
937 return;
938 }
939
940 if (Size == 16) {
941 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
942 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
943 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
944
945 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
946 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
947 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
948 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
949 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
950 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
951 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
952 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
953
954 if (IsSGPRDst) {
955 if (!IsSGPRSrc) {
956 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
957 return;
958 }
959
960 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
961 .addReg(NewSrcReg, getKillRegState(KillSrc));
962 return;
963 }
964
965 if (IsAGPRDst || IsAGPRSrc) {
966 if (!DstLow || !SrcLow) {
967 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
968 "Cannot use hi16 subreg with an AGPR!");
969 }
970
971 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
972 return;
973 }
974
975 if (ST.useRealTrue16Insts()) {
976 if (IsSGPRSrc) {
977 assert(SrcLow);
978 SrcReg = NewSrcReg;
979 }
980 // Use the smaller instruction encoding if possible.
981 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
982 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
983 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
984 .addReg(SrcReg);
985 } else {
986 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
987 .addImm(0) // src0_modifiers
988 .addReg(SrcReg)
989 .addImm(0); // op_sel
990 }
991 return;
992 }
993
994 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg on VI!");
998 }
999
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1001 .addReg(NewSrcReg, getKillRegState(KillSrc));
1002 return;
1003 }
1004
1005 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1006 .addImm(0) // src0_modifiers
1007 .addReg(NewSrcReg)
1008 .addImm(0) // clamp
1015 // First implicit operand is $exec.
1016 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1017 return;
1018 }
1019
1020 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1021 if (ST.hasVMovB64Inst()) {
1022 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1023 .addReg(SrcReg, getKillRegState(KillSrc));
1024 return;
1025 }
1026 if (ST.hasPkMovB32()) {
1027 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1029 .addReg(SrcReg)
1031 .addReg(SrcReg)
1032 .addImm(0) // op_sel_lo
1033 .addImm(0) // op_sel_hi
1034 .addImm(0) // neg_lo
1035 .addImm(0) // neg_hi
1036 .addImm(0) // clamp
1037 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1038 return;
1039 }
1040 }
1041
1042 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1043 if (RI.isSGPRClass(RC)) {
1044 if (!RI.isSGPRClass(SrcRC)) {
1045 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1046 return;
1047 }
1048 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1049 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1050 Forward);
1051 return;
1052 }
1053
1054 unsigned EltSize = 4;
1055 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1056 if (RI.isAGPRClass(RC)) {
1057 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1058 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1059 else if (RI.hasVGPRs(SrcRC) ||
1060 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1061 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1062 else
1063 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1064 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1065 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1066 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1067 (RI.isProperlyAlignedRC(*RC) &&
1068 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1069 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1070 if (ST.hasVMovB64Inst()) {
1071 Opcode = AMDGPU::V_MOV_B64_e32;
1072 EltSize = 8;
1073 } else if (ST.hasPkMovB32()) {
1074 Opcode = AMDGPU::V_PK_MOV_B32;
1075 EltSize = 8;
1076 }
1077 }
1078
1079 // For the cases where we need an intermediate instruction/temporary register
1080 // (destination is an AGPR), we need a scavenger.
1081 //
1082 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1083 // whole block for every handled copy.
1084 std::unique_ptr<RegScavenger> RS;
1085 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1086 RS = std::make_unique<RegScavenger>();
1087
1088 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1089
1090 // If there is an overlap, we can't kill the super-register on the last
1091 // instruction, since it will also kill the components made live by this def.
1092 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1093 const bool CanKillSuperReg = KillSrc && !Overlap;
1094
1095 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1096 unsigned SubIdx;
1097 if (Forward)
1098 SubIdx = SubIndices[Idx];
1099 else
1100 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1101 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1102 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1103 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1104
1105 bool IsFirstSubreg = Idx == 0;
1106 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1107
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1109 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1110 Register ImpUseSuper = SrcReg;
1111 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1112 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1113 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1115 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1117 .addReg(SrcSubReg)
1119 .addReg(SrcSubReg)
1120 .addImm(0) // op_sel_lo
1121 .addImm(0) // op_sel_hi
1122 .addImm(0) // neg_lo
1123 .addImm(0) // neg_hi
1124 .addImm(0) // clamp
1125 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1126 if (IsFirstSubreg)
1128 } else {
1129 MachineInstrBuilder Builder =
1130 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1131 if (IsFirstSubreg)
1132 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1133
1134 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 }
1136 }
1137}
1138
1139int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1140 int32_t NewOpc;
1141
1142 // Try to map original to commuted opcode
1143 NewOpc = AMDGPU::getCommuteRev(Opcode);
1144 if (NewOpc != -1)
1145 // Check if the commuted (REV) opcode exists on the target.
1146 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1147
1148 // Try to map commuted to original opcode
1149 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1150 if (NewOpc != -1)
1151 // Check if the original (non-REV) opcode exists on the target.
1152 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1153
1154 return Opcode;
1155}
1156
// Returns the 32-bit VGPR register class unconditionally.
// NOTE(review): the declarator line (function name and parameters) that
// belongs between the return type and the body is absent from this listing —
// confirm against the upstream file.
1157const TargetRegisterClass *
1159 return &AMDGPU::VGPR_32RegClass;
1160}
1161
// Emits, before I in MBB, a V_CNDMASK_B32_e64 that writes DstReg and chooses
// between TrueReg and FalseReg according to Cond.
//
// Cond has either one element (copied as-is into a new mask register) or two
// elements, where Cond[0] is an immediate branch predicate and Cond[1] is the
// register operand used by the VCC predicates. Any other size is unreachable.
// DstReg is asserted to have the VGPR_32 register class.
//
// NOTE(review): the leading signature lines and the declaration of LMC are
// absent from this listing — confirm against the upstream file.
1164 const DebugLoc &DL, Register DstReg,
1166 Register TrueReg,
1167 Register FalseReg) const {
1168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Register class used for every mask register created below.
1169 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1171 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1172 "Not a VGPR32 reg");
1173
1174 if (Cond.size() == 1) {
// Single-operand condition: copy it into a fresh mask register and select.
1175 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1176 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1177 .add(Cond[0]);
// Operand order here and in most cases below: 0, FalseReg, 0, TrueReg, mask.
1178 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1179 .addImm(0)
1180 .addReg(FalseReg)
1181 .addImm(0)
1182 .addReg(TrueReg)
1183 .addReg(SReg);
1184 } else if (Cond.size() == 2) {
1185 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1186 switch (Cond[0].getImm()) {
1187 case SIInstrInfo::SCC_TRUE: {
// Build the mask with LMC.CSelectOpc using immediates (1, 0).
1188 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1189 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1190 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1191 .addImm(0)
1192 .addReg(FalseReg)
1193 .addImm(0)
1194 .addReg(TrueReg)
1195 .addReg(SReg);
1196 break;
1197 }
1198 case SIInstrInfo::SCC_FALSE: {
// Same as SCC_TRUE but with the CSelect immediates swapped to (0, 1).
1199 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1200 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1201 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1202 .addImm(0)
1203 .addReg(FalseReg)
1204 .addImm(0)
1205 .addReg(TrueReg)
1206 .addReg(SReg);
1207 break;
1208 }
1209 case SIInstrInfo::VCCNZ: {
// Copy Cond[1] (with its implicit flag cleared) into a fresh mask register.
1210 MachineOperand RegOp = Cond[1];
1211 RegOp.setImplicit(false);
1212 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1213 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1214 .add(RegOp);
1215 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1216 .addImm(0)
1217 .addReg(FalseReg)
1218 .addImm(0)
1219 .addReg(TrueReg)
1220 .addReg(SReg);
1221 break;
1222 }
1223 case SIInstrInfo::VCCZ: {
// Same mask as VCCNZ; the inversion is expressed by swapping TrueReg and
// FalseReg in the select below.
1224 MachineOperand RegOp = Cond[1];
1225 RegOp.setImplicit(false);
1226 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1227 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1228 .add(RegOp);
1229 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1230 .addImm(0)
1231 .addReg(TrueReg)
1232 .addImm(0)
1233 .addReg(FalseReg)
1234 .addReg(SReg);
1235 break;
1236 }
1237 case SIInstrInfo::EXECNZ: {
// SReg2 receives the LMC.OrSaveExecOpc result but is not read by anything
// built here; presumably the instruction is emitted for its effect on SCC,
// which the following CSelect consumes — TODO confirm.
1238 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1239 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1240 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1241 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 break;
1249 }
1250 case SIInstrInfo::EXECZ: {
// NOTE(review): this case emits the same sequence as EXECNZ with the CSelect
// immediates swapped, and then unconditionally reaches llvm_unreachable, so
// the predicate is treated as unsupported despite the emitted code.
1251 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1252 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1253 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1254 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1255 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1256 .addImm(0)
1257 .addReg(FalseReg)
1258 .addImm(0)
1259 .addReg(TrueReg)
1260 .addReg(SReg);
1261 llvm_unreachable("Unhandled branch predicate EXECZ");
1262 break;
1263 }
1264 default:
1265 llvm_unreachable("invalid branch predicate");
1266 }
1267 } else {
1268 llvm_unreachable("Can only handle Cond size 1 or 2");
1269 }
1270}
1271
// Emits V_CMP_EQ_I32_e64 before I in *MBB, comparing the immediate Value with
// SrcReg, and returns the new virtual register (class RI.getBoolRC()) that the
// compare defines.
// NOTE(review): the leading signature lines are absent from this listing.
1274 const DebugLoc &DL,
1275 Register SrcReg, int Value) const {
1276 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1277 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1278 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1279 .addImm(Value)
1280 .addReg(SrcReg);
1281
1282 return Reg;
1283}
1284
// Emits V_CMP_NE_I32_e64 before I in *MBB, comparing the immediate Value with
// SrcReg, and returns the new virtual register (class RI.getBoolRC()) that the
// compare defines.
// NOTE(review): the leading signature lines are absent from this listing.
1287 const DebugLoc &DL,
1288 Register SrcReg, int Value) const {
1289 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1290 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1292 .addImm(Value)
1293 .addReg(SrcReg);
1294
1295 return Reg;
1296}
1297
// Determines whether MI defines Reg with a compile-time constant.
//
// For the recognised move, bit-reverse and bitwise-not opcodes whose source is
// an immediate, the resulting constant is stored in ImmVal and the function
// returns whether MI's operand 0 is Reg. Note that ImmVal is written before
// that register comparison, so it may be modified even when false is returned.
// Any other opcode, or a non-immediate source, yields false.
//
// NOTE(review): the first signature line is absent from this listing.
1299 const Register Reg,
1300 int64_t &ImmVal) const {
1301 switch (MI.getOpcode()) {
// Plain moves: the immediate in operand 1 is the value itself.
1302 case AMDGPU::V_MOV_B32_e32:
1303 case AMDGPU::S_MOV_B32:
1304 case AMDGPU::S_MOVK_I32:
1305 case AMDGPU::S_MOV_B64:
1306 case AMDGPU::V_MOV_B64_e32:
1307 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1308 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1309 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1310 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1311 case AMDGPU::V_MOV_B64_PSEUDO:
1312 case AMDGPU::V_MOV_B16_t16_e32: {
1313 const MachineOperand &Src0 = MI.getOperand(1);
1314 if (Src0.isImm()) {
1315 ImmVal = Src0.getImm();
1316 return MI.getOperand(0).getReg() == Reg;
1317 }
1318
1319 return false;
1320 }
1321 case AMDGPU::V_MOV_B16_t16_e64: {
// Here the source is operand 2; operand 1 (src0_modifiers) must be zero for
// the immediate to be taken unchanged.
1322 const MachineOperand &Src0 = MI.getOperand(2);
1323 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1324 ImmVal = Src0.getImm();
1325 return MI.getOperand(0).getReg() == Reg;
1326 }
1327
1328 return false;
1329 }
// Bit reversal: reverse the immediate as a 32-bit value, then widen the
// int32_t result to int64_t.
1330 case AMDGPU::S_BREV_B32:
1331 case AMDGPU::V_BFREV_B32_e32:
1332 case AMDGPU::V_BFREV_B32_e64: {
1333 const MachineOperand &Src0 = MI.getOperand(1);
1334 if (Src0.isImm()) {
1335 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1336 return MI.getOperand(0).getReg() == Reg;
1337 }
1338
1339 return false;
1340 }
// Bitwise not: complement the immediate truncated to int32_t, then widen the
// result to int64_t.
1341 case AMDGPU::S_NOT_B32:
1342 case AMDGPU::V_NOT_B32_e32:
1343 case AMDGPU::V_NOT_B32_e64: {
1344 const MachineOperand &Src0 = MI.getOperand(1);
1345 if (Src0.isImm()) {
1346 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1347 return MI.getOperand(0).getReg() == Reg;
1348 }
1349
1350 return false;
1351 }
1352 default:
1353 return false;
1354 }
1355}
1356
// Returns the constant carried by Op, if one can be found:
//  - Op is an immediate: its value.
//  - Op is a virtual register whose definition (per MRI.getVRegDef) reports
//    isMoveImmediate() and has an immediate in operand 1: that immediate,
//    narrowed by extractSubregFromImm using Op's subregister index.
//  - Otherwise std::nullopt (including non-register and physical-register
//    operands).
// NOTE(review): the declarator line is absent from this listing.
1357std::optional<int64_t>
1359 if (Op.isImm())
1360 return Op.getImm();
1361
1362 if (!Op.isReg() || !Op.getReg().isVirtual())
1363 return std::nullopt;
1364 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1365 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1366 if (Def && Def->isMoveImmediate()) {
1367 const MachineOperand &ImmSrc = Def->getOperand(1);
1368 if (ImmSrc.isImm())
1369 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1370 }
1371
1372 return std::nullopt;
1373}
1374
// Selects the opcode used to move a value into a register of class DstRC:
//  - AGPR classes:            COPY
//  - 16-bit:                  COPY for SGPR classes, else V_MOV_B16_t16_e64
//  - 32-bit:                  S_MOV_B32 for SGPR classes, else V_MOV_B32_e32
//  - 64-bit:                  S_MOV_B64 for SGPR classes, else V_MOV_B64_PSEUDO
//  - every other size:        COPY
// NOTE(review): the declarator line is absent from this listing.
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
// Returns the descriptor of the GPR-index indirect register pseudo for a
// vector of VecSize bits: a V_INDIRECT_REG_READ_GPR_IDX_B32_V* opcode when
// IsIndirectSrc is true, otherwise the matching
// V_INDIRECT_REG_WRITE_GPR_IDX_B32_V* opcode.
//
// The first variant whose bit threshold is >= VecSize is chosen. There are no
// variants between V12 and V16 or between V16 and V32, so sizes in (384, 512]
// map to V16 and sizes in (512, 1024] map to V32. Sizes above 1024 are
// unreachable.
// NOTE(review): the declarator line is absent from this listing.
1393const MCInstrDesc &
1395 bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
// Write variants: same size thresholds as the read variants above.
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
// Returns the descriptor of the MOVREL indirect register-write pseudo for a
// vector of VecSize bits with elements of EltSize bits.
//
// For SGPRs, EltSize selects between the 32-bit and 64-bit opcode helpers; any
// other element size is unreachable. For the non-SGPR path EltSize is asserted
// to be 32.
// NOTE(review): the statement after the assert (the non-SGPR return) is absent
// from this listing — confirm against the upstream file.
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1561 switch (Size) {
1562 case 4:
1563 return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
1570 : AMDGPU::SI_SPILL_S128_SAVE;
1571 case 20:
1572 return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
1573 : AMDGPU::SI_SPILL_S160_SAVE;
1574 case 24:
1575 return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
1576 : AMDGPU::SI_SPILL_S192_SAVE;
1577 case 28:
1578 return NeedsCFI ? AMDGPU::SI_SPILL_S224_CFI_SAVE
1579 : AMDGPU::SI_SPILL_S224_SAVE;
1580 case 32:
1581 return AMDGPU::SI_SPILL_S256_SAVE;
1582 case 36:
1583 return AMDGPU::SI_SPILL_S288_SAVE;
1584 case 40:
1585 return AMDGPU::SI_SPILL_S320_SAVE;
1586 case 44:
1587 return AMDGPU::SI_SPILL_S352_SAVE;
1588 case 48:
1589 return AMDGPU::SI_SPILL_S384_SAVE;
1590 case 64:
1591 return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
1592 : AMDGPU::SI_SPILL_S512_SAVE;
1593 case 128:
1594 return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
1595 : AMDGPU::SI_SPILL_S1024_SAVE;
1596 default:
1597 llvm_unreachable("unknown register size");
1598 }
1599}
1600
1601static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1602 switch (Size) {
1603 case 2:
1604 return AMDGPU::SI_SPILL_V16_SAVE;
1605 case 4:
1606 return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
1607 case 8:
1608 return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
1609 case 12:
1610 return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
1611 case 16:
1612 return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
1613 : AMDGPU::SI_SPILL_V128_SAVE;
1614 case 20:
1615 return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
1616 : AMDGPU::SI_SPILL_V160_SAVE;
1617 case 24:
1618 return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
1619 : AMDGPU::SI_SPILL_V192_SAVE;
1620 case 28:
1621 return NeedsCFI ? AMDGPU::SI_SPILL_V224_CFI_SAVE
1622 : AMDGPU::SI_SPILL_V224_SAVE;
1623 case 32:
1624 return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
1625 : AMDGPU::SI_SPILL_V256_SAVE;
1626 case 36:
1627 return NeedsCFI ? AMDGPU::SI_SPILL_V288_CFI_SAVE
1628 : AMDGPU::SI_SPILL_V288_SAVE;
1629 case 40:
1630 return NeedsCFI ? AMDGPU::SI_SPILL_V320_CFI_SAVE
1631 : AMDGPU::SI_SPILL_V320_SAVE;
1632 case 44:
1633 return NeedsCFI ? AMDGPU::SI_SPILL_V352_CFI_SAVE
1634 : AMDGPU::SI_SPILL_V352_SAVE;
1635 case 48:
1636 return NeedsCFI ? AMDGPU::SI_SPILL_V384_CFI_SAVE
1637 : AMDGPU::SI_SPILL_V384_SAVE;
1638 case 64:
1639 return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
1640 : AMDGPU::SI_SPILL_V512_SAVE;
1641 case 128:
1642 return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
1643 : AMDGPU::SI_SPILL_V1024_SAVE;
1644 default:
1645 llvm_unreachable("unknown register size");
1646 }
1647}
1648
1649static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1650 switch (Size) {
1651 case 4:
1652 return NeedsCFI ? AMDGPU::SI_SPILL_AV32_CFI_SAVE
1653 : AMDGPU::SI_SPILL_AV32_SAVE;
1654 case 8:
1655 return NeedsCFI ? AMDGPU::SI_SPILL_AV64_CFI_SAVE
1656 : AMDGPU::SI_SPILL_AV64_SAVE;
1657 case 12:
1658 return NeedsCFI ? AMDGPU::SI_SPILL_AV96_CFI_SAVE
1659 : AMDGPU::SI_SPILL_AV96_SAVE;
1660 case 16:
1661 return NeedsCFI ? AMDGPU::SI_SPILL_AV128_CFI_SAVE
1662 : AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return NeedsCFI ? AMDGPU::SI_SPILL_AV160_CFI_SAVE
1665 : AMDGPU::SI_SPILL_AV160_SAVE;
1666 case 24:
1667 return NeedsCFI ? AMDGPU::SI_SPILL_AV192_CFI_SAVE
1668 : AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return NeedsCFI ? AMDGPU::SI_SPILL_AV224_CFI_SAVE
1671 : AMDGPU::SI_SPILL_AV224_SAVE;
1672 case 32:
1673 return NeedsCFI ? AMDGPU::SI_SPILL_AV256_CFI_SAVE
1674 : AMDGPU::SI_SPILL_AV256_SAVE;
1675 case 36:
1676 return AMDGPU::SI_SPILL_AV288_SAVE;
1677 case 40:
1678 return AMDGPU::SI_SPILL_AV320_SAVE;
1679 case 44:
1680 return AMDGPU::SI_SPILL_AV352_SAVE;
1681 case 48:
1682 return AMDGPU::SI_SPILL_AV384_SAVE;
1683 case 64:
1684 return NeedsCFI ? AMDGPU::SI_SPILL_AV512_CFI_SAVE
1685 : AMDGPU::SI_SPILL_AV512_SAVE;
1686 case 128:
1687 return NeedsCFI ? AMDGPU::SI_SPILL_AV1024_CFI_SAVE
1688 : AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696 // Currently, there is only 32-bit WWM register spills needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
// Chooses the spill-save pseudo for a vector register of class RC and spill
// size Size (bytes): the WWM opcode on the WWM path, otherwise the AV opcode
// when the subtarget has MAI instructions, otherwise the VGPR opcode.
// NOTE(review): the declarator line and the condition guarding the WWM return
// are absent from this listing — confirm against the upstream file.
1707 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1708 const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1709 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1710
1711 // Choose the right opcode if spilling a WWM register.
1713 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1714
1715 // TODO: Check if AGPRs are available
1716 if (ST.hasMAIInsts())
1717 return getAVSpillSaveOpcode(Size, NeedsCFI);
1718
1719 return getVGPRSpillSaveOpcode(Size, NeedsCFI);
1720}
1721
// Emits a single spill pseudo before MI in MBB that stores SrcReg (of class
// RC) to stack slot FrameIndex. NeedsCFI selects the CFI flavour of the spill
// opcode where one exists. SGPR classes use the SGPR spill pseudos; everything
// else goes through getVectorRegSpillSaveOpcode, keyed on VReg when it is set
// and on SrcReg otherwise.
//
// NOTE(review): several lines are absent from this listing (the first
// parameter line, the definition of MFI, the start of the MMO construction and
// the last operand of the SGPR spill). Flags is not referenced in the lines
// that are visible. Confirm against the upstream file.
1722void SIInstrInfo::storeRegToStackSlotImpl(
1724 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1725 MachineInstr::MIFlag Flags, bool NeedsCFI) const {
1726 MachineFunction *MF = MBB.getParent();
1728 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1729 const DebugLoc &DL = MBB.findDebugLoc(MI);
1730
// Memory operand describing a store covering the whole frame object.
1731 MachinePointerInfo PtrInfo
1732 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1734 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1735 FrameInfo.getObjectAlign(FrameIndex));
1736 unsigned SpillSize = RI.getSpillSize(*RC);
1737
1738 MachineRegisterInfo &MRI = MF->getRegInfo();
1739 if (RI.isSGPRClass(RC)) {
1740 MFI->setHasSpilledSGPRs();
1741 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1742 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1743 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1744
1745 // We are only allowed to create one new instruction when spilling
1746 // registers, so we need to use pseudo instruction for spilling SGPRs.
1747 const MCInstrDesc &OpDesc =
1748 get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
1749
1750 // The SGPR spill/restore instructions only work on number sgprs, so we need
1751 // to make sure we are using the correct register class.
1752 if (SrcReg.isVirtual() && SpillSize == 4) {
1753 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1754 }
1755
1756 BuildMI(MBB, MI, DL, OpDesc)
1757 .addReg(SrcReg, getKillRegState(isKill)) // data
1758 .addFrameIndex(FrameIndex) // addr
1759 .addMemOperand(MMO)
1761
// Tag the slot as an SGPR spill when SGPRs are spilled to VGPRs.
1762 if (RI.spillSGPRToVGPR())
1763 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1764 return;
1765 }
1766
// Vector register path.
1767 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1768 SpillSize, *MFI, NeedsCFI);
1769 MFI->setHasSpilledVGPRs();
1770
1771 BuildMI(MBB, MI, DL, get(Opcode))
1772 .addReg(SrcReg, getKillRegState(isKill)) // data
1773 .addFrameIndex(FrameIndex) // addr
1774 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1775 .addImm(0) // offset
1776 .addMemOperand(MMO);
1777}
1778
// Forwards to storeRegToStackSlotImpl with NeedsCFI = false, i.e. a spill
// without the CFI flavour of the opcode.
// NOTE(review): the leading signature lines are absent from this listing.
1781 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1782 MachineInstr::MIFlag Flags) const {
1783 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg, Flags,
1784 false);
1785}
1786
// Forwards to storeRegToStackSlotImpl with NeedsCFI = true, passing a null
// VReg and MachineInstr::NoFlags.
// NOTE(review): the leading signature lines are absent from this listing.
1789 Register SrcReg, bool isKill,
1790 int FrameIndex,
1791 const TargetRegisterClass *RC) const {
1792 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, Register(),
1793 MachineInstr::NoFlags, true);
1794}
1795
1796static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1797 switch (Size) {
1798 case 4:
1799 return AMDGPU::SI_SPILL_S32_RESTORE;
1800 case 8:
1801 return AMDGPU::SI_SPILL_S64_RESTORE;
1802 case 12:
1803 return AMDGPU::SI_SPILL_S96_RESTORE;
1804 case 16:
1805 return AMDGPU::SI_SPILL_S128_RESTORE;
1806 case 20:
1807 return AMDGPU::SI_SPILL_S160_RESTORE;
1808 case 24:
1809 return AMDGPU::SI_SPILL_S192_RESTORE;
1810 case 28:
1811 return AMDGPU::SI_SPILL_S224_RESTORE;
1812 case 32:
1813 return AMDGPU::SI_SPILL_S256_RESTORE;
1814 case 36:
1815 return AMDGPU::SI_SPILL_S288_RESTORE;
1816 case 40:
1817 return AMDGPU::SI_SPILL_S320_RESTORE;
1818 case 44:
1819 return AMDGPU::SI_SPILL_S352_RESTORE;
1820 case 48:
1821 return AMDGPU::SI_SPILL_S384_RESTORE;
1822 case 64:
1823 return AMDGPU::SI_SPILL_S512_RESTORE;
1824 case 128:
1825 return AMDGPU::SI_SPILL_S1024_RESTORE;
1826 default:
1827 llvm_unreachable("unknown register size");
1828 }
1829}
1830
1831static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1832 switch (Size) {
1833 case 2:
1834 return AMDGPU::SI_SPILL_V16_RESTORE;
1835 case 4:
1836 return AMDGPU::SI_SPILL_V32_RESTORE;
1837 case 8:
1838 return AMDGPU::SI_SPILL_V64_RESTORE;
1839 case 12:
1840 return AMDGPU::SI_SPILL_V96_RESTORE;
1841 case 16:
1842 return AMDGPU::SI_SPILL_V128_RESTORE;
1843 case 20:
1844 return AMDGPU::SI_SPILL_V160_RESTORE;
1845 case 24:
1846 return AMDGPU::SI_SPILL_V192_RESTORE;
1847 case 28:
1848 return AMDGPU::SI_SPILL_V224_RESTORE;
1849 case 32:
1850 return AMDGPU::SI_SPILL_V256_RESTORE;
1851 case 36:
1852 return AMDGPU::SI_SPILL_V288_RESTORE;
1853 case 40:
1854 return AMDGPU::SI_SPILL_V320_RESTORE;
1855 case 44:
1856 return AMDGPU::SI_SPILL_V352_RESTORE;
1857 case 48:
1858 return AMDGPU::SI_SPILL_V384_RESTORE;
1859 case 64:
1860 return AMDGPU::SI_SPILL_V512_RESTORE;
1861 case 128:
1862 return AMDGPU::SI_SPILL_V1024_RESTORE;
1863 default:
1864 llvm_unreachable("unknown register size");
1865 }
1866}
1867
1868static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1869 switch (Size) {
1870 case 4:
1871 return AMDGPU::SI_SPILL_AV32_RESTORE;
1872 case 8:
1873 return AMDGPU::SI_SPILL_AV64_RESTORE;
1874 case 12:
1875 return AMDGPU::SI_SPILL_AV96_RESTORE;
1876 case 16:
1877 return AMDGPU::SI_SPILL_AV128_RESTORE;
1878 case 20:
1879 return AMDGPU::SI_SPILL_AV160_RESTORE;
1880 case 24:
1881 return AMDGPU::SI_SPILL_AV192_RESTORE;
1882 case 28:
1883 return AMDGPU::SI_SPILL_AV224_RESTORE;
1884 case 32:
1885 return AMDGPU::SI_SPILL_AV256_RESTORE;
1886 case 36:
1887 return AMDGPU::SI_SPILL_AV288_RESTORE;
1888 case 40:
1889 return AMDGPU::SI_SPILL_AV320_RESTORE;
1890 case 44:
1891 return AMDGPU::SI_SPILL_AV352_RESTORE;
1892 case 48:
1893 return AMDGPU::SI_SPILL_AV384_RESTORE;
1894 case 64:
1895 return AMDGPU::SI_SPILL_AV512_RESTORE;
1896 case 128:
1897 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1898 default:
1899 llvm_unreachable("unknown register size");
1900 }
1901}
1902
1903static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1904 bool IsVectorSuperClass) {
1905 // Currently, there is only 32-bit WWM register spills needed.
1906 if (Size != 4)
1907 llvm_unreachable("unknown wwm register spill size");
1908
1909 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1910 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1911
1912 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1913}
1914
// Chooses the spill-restore pseudo for a vector register of class RC and spill
// size Size (bytes); the WWM path returns getWWMRegSpillRestoreOpcode.
// NOTE(review): the declarator line, the condition guarding the WWM return,
// and the return statements after the hasMAIInsts() check and after the assert
// are all absent from this listing, so the remaining selection logic cannot be
// read from here — confirm against the upstream file.
1916 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1917 const SIMachineFunctionInfo &MFI) const {
1918 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1919
1920 // Choose the right opcode if restoring a WWM register.
1922 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1923
1924 // TODO: Check if AGPRs are available
1925 if (ST.hasMAIInsts())
1927
1928 assert(!RI.isAGPRClass(RC));
1930}
1931
1934 Register DestReg, int FrameIndex,
1935 const TargetRegisterClass *RC,
1936 Register VReg, unsigned SubReg,
1937 MachineInstr::MIFlag Flags) const {
1938 MachineFunction *MF = MBB.getParent();
1940 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1941 const DebugLoc &DL = MBB.findDebugLoc(MI);
1942 unsigned SpillSize = RI.getSpillSize(*RC);
1943
1944 MachinePointerInfo PtrInfo
1945 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1946
1948 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1949 FrameInfo.getObjectAlign(FrameIndex));
1950
1951 if (RI.isSGPRClass(RC)) {
1952 MFI->setHasSpilledSGPRs();
1953 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1954 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1955 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1956
1957 // FIXME: Maybe this should not include a memoperand because it will be
1958 // lowered to non-memory instructions.
1959 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1960 if (DestReg.isVirtual() && SpillSize == 4) {
1961 MachineRegisterInfo &MRI = MF->getRegInfo();
1962 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1963 }
1964
1965 if (RI.spillSGPRToVGPR())
1966 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1967 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1968 .addFrameIndex(FrameIndex) // addr
1969 .addMemOperand(MMO)
1971
1972 return;
1973 }
1974
1975 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1976 SpillSize, *MFI);
1977 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1978 .addFrameIndex(FrameIndex) // vaddr
1979 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1980 .addImm(0) // offset
1981 .addMemOperand(MMO);
1982}
1983
1988
1991 unsigned Quantity) const {
// Emits S_NOP instructions in front of MI until `Quantity` has been consumed.
// NOTE(review): the leading lines of this signature are absent from this
// extract; MBB and MI are presumably its first parameters -- confirm.
1992 DebugLoc DL = MBB.findDebugLoc(MI);
// Largest amount a single S_NOP is asked to cover here.
1993 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1994 while (Quantity > 0) {
// Cover as much as one S_NOP allows, then loop for the remainder.
1995 unsigned Arg = std::min(Quantity, MaxSNopCount);
1996 Quantity -= Arg;
// The immediate is the covered amount minus one.
1997 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1998 }
1999}
2000
2002 auto *MF = MBB.getParent();
2004
2005 assert(Info->isEntryFunction());
2006
2007 if (MBB.succ_empty()) {
2008 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2009 if (HasNoTerminator) {
2010 if (Info->returnsVoid()) {
2011 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2012 } else {
2013 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2014 }
2015 }
2016 }
2017}
2018
2022 const DebugLoc &DL) const {
2023 MachineFunction *MF = MBB.getParent();
2024 constexpr unsigned DoorbellIDMask = 0x3ff;
2025 constexpr unsigned ECQueueWaveAbort = 0x400;
2026
2027 MachineBasicBlock *TrapBB = &MBB;
2028 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2029
2030 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2031 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2032 TrapBB = MF->CreateMachineBasicBlock();
2033 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2034 MF->push_back(TrapBB);
2035 MBB.addSuccessor(TrapBB);
2036 }
2037 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2038 // will be a nop.
2039 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2040 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2041 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2042 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2043 DoorbellReg)
2045 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2046 .addUse(AMDGPU::M0);
2047 Register DoorbellRegMasked =
2048 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2049 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2050 .addUse(DoorbellReg)
2051 .addImm(DoorbellIDMask);
2052 Register SetWaveAbortBit =
2053 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2054 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2055 .addUse(DoorbellRegMasked)
2056 .addImm(ECQueueWaveAbort);
2057 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2058 .addUse(SetWaveAbortBit);
2059 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2061 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2062 .addUse(AMDGPU::TTMP2);
2063 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2064 TrapBB->addSuccessor(HaltLoopBB);
2065
2066 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2067 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2068 .addMBB(HaltLoopBB);
2069 MF->push_back(HaltLoopBB);
2070 HaltLoopBB->addSuccessor(HaltLoopBB);
2071
2072 return MBB.getNextNode();
2073}
2074
// Returns a wait-state count for MI, chosen by opcode.
// NOTE(review): the signature line is absent from this extract.
2076 switch (MI.getOpcode()) {
2077 default:
// Meta instructions count as zero; any other opcode not listed below counts
// as one.
2078 if (MI.isMetaInstruction())
2079 return 0;
2080 return 1; // FIXME: Do wait states equal cycles?
2081
2082 case AMDGPU::S_NOP:
// The S_NOP immediate holds the count minus one.
2083 return MI.getOperand(0).getImm() + 1;
2084 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2085 // hazard, even if one exists, won't really be visible. Should we handle it?
2086 }
2087}
2088
2090 MachineBasicBlock &MBB = *MI.getParent();
2091 DebugLoc DL = MBB.findDebugLoc(MI);
2093 switch (MI.getOpcode()) {
2094 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2095 case AMDGPU::S_MOV_B64_term:
2096 // This is only a terminator to get the correct spill code placement during
2097 // register allocation.
2098 MI.setDesc(get(AMDGPU::S_MOV_B64));
2099 break;
2100
2101 case AMDGPU::S_MOV_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(AMDGPU::S_MOV_B32));
2105 break;
2106
2107 case AMDGPU::S_XOR_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_XOR_B64));
2111 break;
2112
2113 case AMDGPU::S_XOR_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_XOR_B32));
2117 break;
2118 case AMDGPU::S_OR_B64_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_OR_B64));
2122 break;
2123 case AMDGPU::S_OR_B32_term:
2124 // This is only a terminator to get the correct spill code placement during
2125 // register allocation.
2126 MI.setDesc(get(AMDGPU::S_OR_B32));
2127 break;
2128
2129 case AMDGPU::S_ANDN2_B64_term:
2130 // This is only a terminator to get the correct spill code placement during
2131 // register allocation.
2132 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2133 break;
2134
2135 case AMDGPU::S_ANDN2_B32_term:
2136 // This is only a terminator to get the correct spill code placement during
2137 // register allocation.
2138 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2139 break;
2140
2141 case AMDGPU::S_AND_B64_term:
2142 // This is only a terminator to get the correct spill code placement during
2143 // register allocation.
2144 MI.setDesc(get(AMDGPU::S_AND_B64));
2145 break;
2146
2147 case AMDGPU::S_AND_B32_term:
2148 // This is only a terminator to get the correct spill code placement during
2149 // register allocation.
2150 MI.setDesc(get(AMDGPU::S_AND_B32));
2151 break;
2152
2153 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2154 // This is only a terminator to get the correct spill code placement during
2155 // register allocation.
2156 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2157 break;
2158
2159 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2160 // This is only a terminator to get the correct spill code placement during
2161 // register allocation.
2162 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2163 break;
2164
2165 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2166 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2167 break;
2168
2169 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2170 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2171 break;
2172 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2173 Register Dst = MI.getOperand(0).getReg();
2174 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2175 MI.setDesc(
2176 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2177 break;
2178 }
2179 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2180 Register Dst = MI.getOperand(0).getReg();
2181 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2182 int64_t Imm = MI.getOperand(1).getImm();
2183
2184 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2185 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2187 .addImm(SignExtend64<32>(Imm));
2188 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2189 .addImm(SignExtend64<32>(Imm >> 32));
2190 MI.eraseFromParent();
2191 break;
2192 }
2193
2194 [[fallthrough]];
2195 }
2196 case AMDGPU::V_MOV_B64_PSEUDO: {
2197 Register Dst = MI.getOperand(0).getReg();
2198 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2199 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2200
2201 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2202 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2203
2204 const MachineOperand &SrcOp = MI.getOperand(1);
2205 // FIXME: Will this work for 64-bit floating point immediates?
2206 assert(!SrcOp.isFPImm());
2207 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2208 MI.setDesc(Mov64Desc);
2209 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2210 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2211 break;
2212 }
2213 if (SrcOp.isImm()) {
2214 APInt Imm(64, SrcOp.getImm());
2215 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2216 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2217 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2218 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2219
2220 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2221 PkMovRC->contains(Dst)) {
2222 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2224 .addImm(Lo.getSExtValue())
2226 .addImm(Lo.getSExtValue())
2227 .addImm(0) // op_sel_lo
2228 .addImm(0) // op_sel_hi
2229 .addImm(0) // neg_lo
2230 .addImm(0) // neg_hi
2231 .addImm(0); // clamp
2232 } else {
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2234 .addImm(Lo.getSExtValue());
2235 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2236 .addImm(Hi.getSExtValue());
2237 }
2238 } else {
2239 assert(SrcOp.isReg());
2240 if (ST.hasPkMovB32() &&
2241 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2242 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2243 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2244 .addReg(SrcOp.getReg())
2246 .addReg(SrcOp.getReg())
2247 .addImm(0) // op_sel_lo
2248 .addImm(0) // op_sel_hi
2249 .addImm(0) // neg_lo
2250 .addImm(0) // neg_hi
2251 .addImm(0); // clamp
2252 } else {
2253 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2254 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2255 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2256 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2257 }
2258 }
2259 MI.eraseFromParent();
2260 break;
2261 }
2262 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2264 break;
2265 }
2266 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2267 const MachineOperand &SrcOp = MI.getOperand(1);
2268 assert(!SrcOp.isFPImm());
2269
2270 if (ST.has64BitLiterals()) {
2271 MI.setDesc(get(AMDGPU::S_MOV_B64));
2272 break;
2273 }
2274
2275 APInt Imm(64, SrcOp.getImm());
2276 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2277 MI.setDesc(get(AMDGPU::S_MOV_B64));
2278 break;
2279 }
2280
2281 Register Dst = MI.getOperand(0).getReg();
2282 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2283 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2284
2285 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2286 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2287 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2288 .addImm(Lo.getSExtValue());
2289 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2290 .addImm(Hi.getSExtValue());
2291 MI.eraseFromParent();
2292 break;
2293 }
2294 case AMDGPU::V_SET_INACTIVE_B32: {
2295 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2296 Register DstReg = MI.getOperand(0).getReg();
2297 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2298 .add(MI.getOperand(3))
2299 .add(MI.getOperand(4))
2300 .add(MI.getOperand(1))
2301 .add(MI.getOperand(2))
2302 .add(MI.getOperand(5));
2303 MI.eraseFromParent();
2304 break;
2305 }
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2337 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2338 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2339 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2340
2341 unsigned Opc;
2342 if (RI.hasVGPRs(EltRC)) {
2343 Opc = AMDGPU::V_MOVRELD_B32_e32;
2344 } else {
2345 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2346 : AMDGPU::S_MOVRELD_B32;
2347 }
2348
2349 const MCInstrDesc &OpDesc = get(Opc);
2350 Register VecReg = MI.getOperand(0).getReg();
2351 bool IsUndef = MI.getOperand(1).isUndef();
2352 unsigned SubReg = MI.getOperand(3).getImm();
2353 assert(VecReg == MI.getOperand(1).getReg());
2354
2356 BuildMI(MBB, MI, DL, OpDesc)
2357 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2358 .add(MI.getOperand(2))
2360 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2361
2362 const int ImpDefIdx =
2363 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2364 const int ImpUseIdx = ImpDefIdx + 1;
2365 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2366 MI.eraseFromParent();
2367 break;
2368 }
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2380 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2381 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2382 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2383 assert(ST.useVGPRIndexMode());
2384 Register VecReg = MI.getOperand(0).getReg();
2385 bool IsUndef = MI.getOperand(1).isUndef();
2386 MachineOperand &Idx = MI.getOperand(3);
2387 Register SubReg = MI.getOperand(4).getImm();
2388
2389 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2390 .add(Idx)
2392 SetOn->getOperand(3).setIsUndef();
2393
2394 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2396 BuildMI(MBB, MI, DL, OpDesc)
2397 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2398 .add(MI.getOperand(2))
2400 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2401
2402 const int ImpDefIdx =
2403 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2404 const int ImpUseIdx = ImpDefIdx + 1;
2405 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2406
2407 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2408
2409 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2410
2411 MI.eraseFromParent();
2412 break;
2413 }
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2424 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2425 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2426 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2427 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2428 assert(ST.useVGPRIndexMode());
2429 Register Dst = MI.getOperand(0).getReg();
2430 Register VecReg = MI.getOperand(1).getReg();
2431 bool IsUndef = MI.getOperand(1).isUndef();
2432 Register SubReg = MI.getOperand(3).getImm();
2433
2434 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2435 .add(MI.getOperand(2))
2437 SetOn->getOperand(3).setIsUndef();
2438
2439 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2440 .addDef(Dst)
2441 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2442 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2443
2444 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2445
2446 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2447
2448 MI.eraseFromParent();
2449 break;
2450 }
2451 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2452 MachineFunction &MF = *MBB.getParent();
2453 Register Reg = MI.getOperand(0).getReg();
2454 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2455 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2456 MachineOperand OpLo = MI.getOperand(1);
2457 MachineOperand OpHi = MI.getOperand(2);
2458
2459 // Create a bundle so these instructions won't be re-ordered by the
2460 // post-RA scheduler.
2461 MIBundleBuilder Bundler(MBB, MI);
2462 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2463
2464 // What we want here is an offset from the value returned by s_getpc (which
2465 // is the address of the s_add_u32 instruction) to the global variable, but
2466 // since the encoding of $symbol starts 4 bytes after the start of the
2467 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2468 // small. This requires us to add 4 to the global variable offset in order
2469 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2470 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2471 // instruction.
2472
2473 int64_t Adjust = 0;
2474 if (ST.hasGetPCZeroExtension()) {
2475 // Fix up hardware that does not sign-extend the 48-bit PC value by
2476 // inserting: s_sext_i32_i16 reghi, reghi
2477 Bundler.append(
2478 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2479 Adjust += 4;
2480 }
2481
2482 if (OpLo.isGlobal())
2483 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2484 Bundler.append(
2485 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2486
2487 if (OpHi.isGlobal())
2488 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2489 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2490 .addReg(RegHi)
2491 .add(OpHi));
2492
2493 finalizeBundle(MBB, Bundler.begin());
2494
2495 MI.eraseFromParent();
2496 break;
2497 }
2498 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2499 MachineFunction &MF = *MBB.getParent();
2500 Register Reg = MI.getOperand(0).getReg();
2501 MachineOperand Op = MI.getOperand(1);
2502
2503 // Create a bundle so these instructions won't be re-ordered by the
2504 // post-RA scheduler.
2505 MIBundleBuilder Bundler(MBB, MI);
2506 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2507 if (Op.isGlobal())
2508 Op.setOffset(Op.getOffset() + 4);
2509 Bundler.append(
2510 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2511
2512 finalizeBundle(MBB, Bundler.begin());
2513
2514 MI.eraseFromParent();
2515 break;
2516 }
2517 case AMDGPU::ENTER_STRICT_WWM: {
2518 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2519 // Whole Wave Mode is entered.
2520 MI.setDesc(get(LMC.OrSaveExecOpc));
2521 break;
2522 }
2523 case AMDGPU::ENTER_STRICT_WQM: {
2524 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2525 // STRICT_WQM is entered.
2526 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2527 .addReg(LMC.ExecReg);
2528 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2529
2530 MI.eraseFromParent();
2531 break;
2532 }
2533 case AMDGPU::EXIT_STRICT_WWM:
2534 case AMDGPU::EXIT_STRICT_WQM: {
2535 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2536 // WWM/STRICT_WQM is exited.
2537 MI.setDesc(get(LMC.MovOpc));
2538 break;
2539 }
2540 case AMDGPU::SI_RETURN: {
2541 const MachineFunction *MF = MBB.getParent();
2542 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2543 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2544 // Hiding the return address use with SI_RETURN may lead to extra kills in
2545 // the function and missing live-ins. We are fine in practice because callee
2546 // saved register handling ensures the register value is restored before
2547 // RET, but we need the undef flag here to appease the MachineVerifier
2548 // liveness checks.
2550 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2551 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2552
2553 MIB.copyImplicitOps(MI);
2554 MI.eraseFromParent();
2555 break;
2556 }
2557
2558 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2559 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2560 MI.setDesc(get(AMDGPU::S_MUL_U64));
2561 break;
2562
2563 case AMDGPU::S_GETPC_B64_pseudo:
2564 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2565 if (ST.hasGetPCZeroExtension()) {
2566 Register Dst = MI.getOperand(0).getReg();
2567 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2568 // Fix up hardware that does not sign-extend the 48-bit PC value by
2569 // inserting: s_sext_i32_i16 dsthi, dsthi
2570 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2571 DstHi)
2572 .addReg(DstHi);
2573 }
2574 break;
2575
2576 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2577 assert(ST.hasBF16PackedInsts());
2578 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2579 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2580 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2581 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2582 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2583 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2584 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2585 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2586 break;
2587 }
2588
2589 case AMDGPU::GET_STACK_BASE:
2590 // The stack starts at offset 0 unless we need to reserve some space at the
2591 // bottom.
2592 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2593 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2594 // some of the VGPRs. The size of the required scratch space has already
2595 // been computed by prolog epilog insertion.
2596 const SIMachineFunctionInfo *MFI =
2597 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2598 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2599 Register DestReg = MI.getOperand(0).getReg();
2600 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2603 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2604 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2605 // SCC, so we need to check for 0 manually.
2606 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2607 // Change the implicit-def of SCC to an explicit use (but first remove
2608 // the dead flag if present).
2609 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2610 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2611 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2612 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2613 } else {
2614 MI.setDesc(get(AMDGPU::S_MOV_B32));
2615 MI.addOperand(MachineOperand::CreateImm(0));
2616 MI.removeOperand(
2617 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2618 }
2619 break;
2620 }
2621
2622 return true;
2623}
2624
2627 unsigned SubIdx, const MachineInstr &Orig,
2628 LaneBitmask UsedLanes) const {
2629
2630 // Try shrinking the instruction to remat only the part needed for current
2631 // context.
2632 // TODO: Handle more cases.
2633 unsigned Opcode = Orig.getOpcode();
2634 switch (Opcode) {
2635 case AMDGPU::S_MOV_B64:
2636 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2637 if (SubIdx != 0)
2638 break;
2639
2640 if (!Orig.getOperand(1).isImm())
2641 break;
2642
2643 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2644 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2645 if (UsedLanes.all())
2646 break;
2647
2648 // Determine which half of the 64-bit immediate corresponds to the use.
2649 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2650 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2651 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2652
2653 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2654 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2655
2656 if (NeedLo && NeedHi)
2657 break;
2658
2659 int64_t Imm64 = Orig.getOperand(1).getImm();
2660 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2661
2662 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2663
2664 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2665 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2666 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2667 .addImm(Imm32);
2668 return;
2669 }
2670
2671 case AMDGPU::S_LOAD_DWORDX16_IMM:
2672 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2673 if (SubIdx != 0)
2674 break;
2675
2676 if (I == MBB.end())
2677 break;
2678
2679 if (I->isBundled())
2680 break;
2681
2682 // Look for a single use of the register that is also a subreg.
2683 Register RegToFind = Orig.getOperand(0).getReg();
2684 MachineOperand *UseMO = nullptr;
2685 for (auto &CandMO : I->operands()) {
2686 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2687 continue;
2688 if (UseMO) {
2689 UseMO = nullptr;
2690 break;
2691 }
2692 UseMO = &CandMO;
2693 }
2694 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2695 break;
2696
2697 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2698 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2699
2700 MachineFunction *MF = MBB.getParent();
2701 MachineRegisterInfo &MRI = MF->getRegInfo();
2702 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2703
2704 unsigned NewOpcode = -1;
2705 if (SubregSize == 256)
2706 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2707 else if (SubregSize == 128)
2708 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2709 else
2710 break;
2711
2712 const MCInstrDesc &TID = get(NewOpcode);
2713 const TargetRegisterClass *NewRC =
2714 RI.getAllocatableClass(getRegClass(TID, 0));
2715 MRI.setRegClass(DestReg, NewRC);
2716
2717 UseMO->setReg(DestReg);
2718 UseMO->setSubReg(AMDGPU::NoSubRegister);
2719
2720 // Use a smaller load with the desired size, possibly with updated offset.
2721 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2722 MI->setDesc(TID);
2723 MI->getOperand(0).setReg(DestReg);
2724 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2725 if (Offset) {
2726 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2727 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2728 OffsetMO->setImm(FinalOffset);
2729 }
2731 for (const MachineMemOperand *MemOp : Orig.memoperands())
2732 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2733 SubregSize / 8));
2734 MI->setMemRefs(*MF, NewMMOs);
2735
2736 MBB.insert(I, MI);
2737 return;
2738 }
2739
2740 default:
2741 break;
2742 }
2743
2744 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2745}
2746
2747std::pair<MachineInstr*, MachineInstr*>
2749 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2750
2751 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2753 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2754 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2755 return std::pair(&MI, nullptr);
2756 }
2757
2758 MachineBasicBlock &MBB = *MI.getParent();
2759 DebugLoc DL = MBB.findDebugLoc(MI);
2760 MachineFunction *MF = MBB.getParent();
2761 MachineRegisterInfo &MRI = MF->getRegInfo();
2762 Register Dst = MI.getOperand(0).getReg();
2763 unsigned Part = 0;
2764 MachineInstr *Split[2];
2765
2766 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2767 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2768 if (Dst.isPhysical()) {
2769 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2770 } else {
2771 assert(MRI.isSSA());
2772 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2773 MovDPP.addDef(Tmp);
2774 }
2775
2776 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2777 const MachineOperand &SrcOp = MI.getOperand(I);
2778 assert(!SrcOp.isFPImm());
2779 if (SrcOp.isImm()) {
2780 APInt Imm(64, SrcOp.getImm());
2781 Imm.ashrInPlace(Part * 32);
2782 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2783 } else {
2784 assert(SrcOp.isReg());
2785 Register Src = SrcOp.getReg();
2786 if (Src.isPhysical())
2787 MovDPP.addReg(RI.getSubReg(Src, Sub));
2788 else
2789 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2790 }
2791 }
2792
2793 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2794 MovDPP.addImm(MO.getImm());
2795
2796 Split[Part] = MovDPP;
2797 ++Part;
2798 }
2799
2800 if (Dst.isVirtual())
2801 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2802 .addReg(Split[0]->getOperand(0).getReg())
2803 .addImm(AMDGPU::sub0)
2804 .addReg(Split[1]->getOperand(0).getReg())
2805 .addImm(AMDGPU::sub1);
2806
2807 MI.eraseFromParent();
2808 return std::pair(Split[0], Split[1]);
2809}
2810
// Reports the destination/source operand pair (operands 0 and 1) when MI is a
// WWM_COPY; yields std::nullopt for every other opcode.
// NOTE(review): the line carrying the function name and parameters is absent
// from this extract.
2811std::optional<DestSourcePair>
2813 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2814 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2815
2816 return std::nullopt;
2817}
2818
2820 AMDGPU::OpName Src0OpName,
2821 MachineOperand &Src1,
2822 AMDGPU::OpName Src1OpName) const {
// Exchange the immediate values held by the two operands of MI that are named
// Src0OpName and Src1OpName.
//
// Returns false, leaving MI untouched, when MI has no operand named
// Src0OpName. Otherwise the two immediates are swapped and true is returned.
2823 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2824 if (!Src0Mods)
2825 return false;
2826
// Once the first named operand exists, the second one is required to exist as
// well; this is only checked by the assert, not handled at run time.
2827 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2828 assert(Src1Mods &&
2829 "All commutable instructions have both src0 and src1 modifiers");
2830
// Read both values before writing either so the exchange does not lose one.
2831 int Src0ModsVal = Src0Mods->getImm();
2832 int Src1ModsVal = Src1Mods->getImm();
2833
2834 Src1Mods->setImm(Src0ModsVal);
2835 Src0Mods->setImm(Src1ModsVal);
2836 return true;
2837}
2838
2840 MachineOperand &RegOp,
2841 MachineOperand &NonRegOp) {
// Exchange the contents of a register operand (RegOp) and a non-register
// operand (NonRegOp) in place.
//
// Returns &MI on success. Returns nullptr, with neither operand modified, when
// NonRegOp is not an immediate, a frame index or a global address.

// Snapshot everything we need from the register operand first: the
// ChangeTo* calls below overwrite RegOp.
2842 Register Reg = RegOp.getReg();
2843 unsigned SubReg = RegOp.getSubReg();
2844 bool IsKill = RegOp.isKill();
2845 bool IsDead = RegOp.isDead();
2846 bool IsUndef = RegOp.isUndef();
2847 bool IsDebug = RegOp.isDebug();
2848
// Turn RegOp into a copy of NonRegOp, according to NonRegOp's kind.
2849 if (NonRegOp.isImm())
2850 RegOp.ChangeToImmediate(NonRegOp.getImm());
2851 else if (NonRegOp.isFI())
2852 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2853 else if (NonRegOp.isGlobal()) {
2854 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2855 NonRegOp.getTargetFlags());
2856 } else
// Any other operand kind is not supported by this helper.
2857 return nullptr;
2858
2859 // Make sure we don't reinterpret a subreg index in the target flags.
2860 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2861
// Now turn NonRegOp into the saved register, restoring its flags.
// NOTE(review): the two literal `false` arguments are presumably the
// isDef/isImp parameters of MachineOperand::ChangeToRegister -- confirm
// against its declaration.
2862 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2863 NonRegOp.setSubReg(SubReg);
2864
2865 return &MI;
2866}
2867
2869 MachineOperand &NonRegOp1,
2870 MachineOperand &NonRegOp2) {
// Exchange two immediate operands in place: both the immediate values and the
// target flags are swapped. Always returns &MI.

// Save operand 1's state before it is overwritten with operand 2's.
2871 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2872 int64_t NonRegVal = NonRegOp1.getImm();
2873
2874 NonRegOp1.setImm(NonRegOp2.getImm());
2875 NonRegOp2.setImm(NonRegVal);
2876 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2877 NonRegOp2.setTargetFlags(TargetFlags);
2878 return &MI;
2879}
2880
/// Decide whether the operands at \p OpIdx0 and \p OpIdx1 of \p MI may trade
/// places.
///
/// The checks run in this order:
///  1. For VALU instructions, a non-register operand that currently sits in
///     src0 must be an inline constant for the slot it would move to.
///  2. If a register operand would land in a slot other than src0, the
///     register operand(s) must be legal in their new slots. A destination
///     slot that has no register class (RegClass == -1) is accepted only when
///     its operand type is MCOI::OPERAND_UNKNOWN.
///  3. Otherwise the operand at \p OpIdx0 is validated as an immediate for
///     slot \p OpIdx1.
///
/// \returns true if the swap passes the checks above.
2881bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2882 unsigned OpIdx1) const {
2883 const MCInstrDesc &InstDesc = MI.getDesc();
2884 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2885 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2886
2887 unsigned Opc = MI.getOpcode();
2888 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2889
2890 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2891 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2892
2893 // The swap must not breach constant bus or literal limits.
2894 // It may move a literal to a position other than src0, which is not allowed
2895 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2896 // FIXME: After gfx9, a literal can be in a place other than src0.
2897 if (isVALU(MI)) {
2898 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2899 !isInlineConstant(MO0, OpInfo1))
2900 return false;
2901 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2902 !isInlineConstant(MO1, OpInfo0))
2903 return false;
2904 }
2905
// MO0 is a register that would move into a non-src0 slot (OpIdx1).
2906 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2907 if (OpInfo1.RegClass == -1)
2908 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2909 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2910 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2911 }
// Mirror case: MO1 is a register that would move into a non-src0 slot
// (OpIdx0).
2912 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2913 if (OpInfo0.RegClass == -1)
2914 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2915 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2916 isLegalRegOperand(MI, OpIdx0, MO1);
2917 }
2918
2919 // No need to check 64-bit literals since swapping does not bring new
2920 // 64-bit literals into current instruction to fold to 32-bit
2921
2922 return isImmOperandLegal(MI, OpIdx1, MO0);
2923}
2924
2926 unsigned Src0Idx,
2927 unsigned Src1Idx) const {
2928 assert(!NewMI && "this should never be used");
2929
2930 unsigned Opc = MI.getOpcode();
2931 int CommutedOpcode = commuteOpcode(Opc);
2932 if (CommutedOpcode == -1)
2933 return nullptr;
2934
2935 if (Src0Idx > Src1Idx)
2936 std::swap(Src0Idx, Src1Idx);
2937
2938 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2939 static_cast<int>(Src0Idx) &&
2940 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2941 static_cast<int>(Src1Idx) &&
2942 "inconsistency with findCommutedOpIndices");
2943
2944 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2945 return nullptr;
2946
2947 MachineInstr *CommutedMI = nullptr;
2948 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2949 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2950 if (Src0.isReg() && Src1.isReg()) {
2951 // Be sure to copy the source modifiers to the right place.
2952 CommutedMI =
2953 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2954 } else if (Src0.isReg() && !Src1.isReg()) {
2955 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2956 } else if (!Src0.isReg() && Src1.isReg()) {
2957 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2958 } else if (Src0.isImm() && Src1.isImm()) {
2959 CommutedMI = swapImmOperands(MI, Src0, Src1);
2960 } else {
2961 // FIXME: Found two non registers to commute. This does happen.
2962 return nullptr;
2963 }
2964
2965 if (CommutedMI) {
2966 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2967 Src1, AMDGPU::OpName::src1_modifiers);
2968
2969 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2970 AMDGPU::OpName::src1_sel);
2971
2972 CommutedMI->setDesc(get(CommutedOpcode));
2973 }
2974
2975 return CommutedMI;
2976}
2977
2978// This needs to be implemented because the source modifiers may be inserted
2979// between the true commutable operands, and the base
2980// TargetInstrInfo::commuteInstruction uses it.
2982 unsigned &SrcOpIdx0,
2983 unsigned &SrcOpIdx1) const {
2984 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2985}
2986
2988 unsigned &SrcOpIdx0,
2989 unsigned &SrcOpIdx1) const {
2990 if (!Desc.isCommutable())
2991 return false;
2992
2993 unsigned Opc = Desc.getOpcode();
2994 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2995 if (Src0Idx == -1)
2996 return false;
2997
2998 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2999 if (Src1Idx == -1)
3000 return false;
3001
3002 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
3003}
3004
3006 int64_t BrOffset) const {
3007 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
3008 // because its dest block is unanalyzable.
3009 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
3010
3011 // Convert to dwords.
3012 BrOffset /= 4;
3013
3014 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
3015 // from the next instruction.
3016 BrOffset -= 1;
3017
3018 return isIntN(BranchOffsetBits, BrOffset);
3019}
3020
3023 return MI.getOperand(0).getMBB();
3024}
3025
3027 for (const MachineInstr &MI : MBB->terminators()) {
3028 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
3029 MI.getOpcode() == AMDGPU::SI_LOOP)
3030 return true;
3031 }
3032 return false;
3033}
3034
3036 MachineBasicBlock &DestBB,
3037 MachineBasicBlock &RestoreBB,
3038 const DebugLoc &DL, int64_t BrOffset,
3039 RegScavenger *RS) const {
3040 assert(MBB.empty() &&
3041 "new block should be inserted for expanding unconditional branch");
3042 assert(MBB.pred_size() == 1);
3043 assert(RestoreBB.empty() &&
3044 "restore block should be inserted for restoring clobbered registers");
3045
3046 MachineFunction *MF = MBB.getParent();
3047 MachineRegisterInfo &MRI = MF->getRegInfo();
3049 auto I = MBB.end();
3050 auto &MCCtx = MF->getContext();
3051
3052 if (ST.useAddPC64Inst()) {
3053 MCSymbol *Offset =
3054 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3055 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3057 MCSymbol *PostAddPCLabel =
3058 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3059 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
3060 auto *OffsetExpr = MCBinaryExpr::createSub(
3061 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3062 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3063 Offset->setVariableValue(OffsetExpr);
3064 return;
3065 }
3066
3067 assert(RS && "RegScavenger required for long branching");
3068
3069 // FIXME: Virtual register workaround for RegScavenger not working with empty
3070 // blocks.
3071 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3072
3073 // Note: as this is used after hazard recognizer we need to apply some hazard
3074 // workarounds directly.
3075 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3076 ST.hasVALUReadSGPRHazard();
3077 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3078 if (FlushSGPRWrites)
3079 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3081 };
3082
3083 // We need to compute the offset relative to the instruction immediately after
3084 // s_getpc_b64. Insert pc arithmetic code before last terminator.
3085 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3086 ApplyHazardWorkarounds();
3087
3088 MCSymbol *PostGetPCLabel =
3089 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3090 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3091
3092 MCSymbol *OffsetLo =
3093 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3094 MCSymbol *OffsetHi =
3095 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3096 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3097 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3098 .addReg(PCReg, {}, AMDGPU::sub0)
3099 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3100 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3101 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3102 .addReg(PCReg, {}, AMDGPU::sub1)
3103 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3104 ApplyHazardWorkarounds();
3105
3106 // Insert the indirect branch after the other terminator.
3107 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3108 .addReg(PCReg);
3109
3110 // If a spill is needed for the pc register pair, we need to insert a spill
3111 // restore block right before the destination block, and insert a short branch
3112 // into the old destination block's fallthrough predecessor.
3113 // e.g.:
3114 //
3115 // s_cbranch_scc0 skip_long_branch:
3116 //
3117 // long_branch_bb:
3118 // spill s[8:9]
3119 // s_getpc_b64 s[8:9]
3120 // s_add_u32 s8, s8, restore_bb
3121 // s_addc_u32 s9, s9, 0
3122 // s_setpc_b64 s[8:9]
3123 //
3124 // skip_long_branch:
3125 // foo;
3126 //
3127 // .....
3128 //
3129 // dest_bb_fallthrough_predecessor:
3130 // bar;
3131 // s_branch dest_bb
3132 //
3133 // restore_bb:
3134 // restore s[8:9]
3135 // fallthrough dest_bb
3136 //
3137 // dest_bb:
3138 // buzz;
3139
3140 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3141 Register Scav;
3142
3143 // If we've previously reserved a register for long branches
3144 // avoid running the scavenger and just use those registers
3145 if (LongBranchReservedReg) {
3146 RS->enterBasicBlock(MBB);
3147 Scav = LongBranchReservedReg;
3148 } else {
3149 RS->enterBasicBlockEnd(MBB);
3150 Scav = RS->scavengeRegisterBackwards(
3151 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3152 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3153 }
3154 if (Scav) {
3155 RS->setRegUsed(Scav);
3156 MRI.replaceRegWith(PCReg, Scav);
3157 MRI.clearVirtRegs();
3158 } else {
3159 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3160 // SGPR spill.
3161 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3162 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3163 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3164 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3165 MRI.clearVirtRegs();
3166 }
3167
3168 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3169 // Now, the distance could be defined.
3171 MCSymbolRefExpr::create(DestLabel, MCCtx),
3172 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3173 // Add offset assignments.
3174 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3175 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3176 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3177 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3178}
3179
3180unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3181 switch (Cond) {
3182 case SIInstrInfo::SCC_TRUE:
3183 return AMDGPU::S_CBRANCH_SCC1;
3184 case SIInstrInfo::SCC_FALSE:
3185 return AMDGPU::S_CBRANCH_SCC0;
3186 case SIInstrInfo::VCCNZ:
3187 return AMDGPU::S_CBRANCH_VCCNZ;
3188 case SIInstrInfo::VCCZ:
3189 return AMDGPU::S_CBRANCH_VCCZ;
3190 case SIInstrInfo::EXECNZ:
3191 return AMDGPU::S_CBRANCH_EXECNZ;
3192 case SIInstrInfo::EXECZ:
3193 return AMDGPU::S_CBRANCH_EXECZ;
3194 default:
3195 llvm_unreachable("invalid branch predicate");
3196 }
3197}
3198
3199SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3200 switch (Opcode) {
3201 case AMDGPU::S_CBRANCH_SCC0:
3202 return SCC_FALSE;
3203 case AMDGPU::S_CBRANCH_SCC1:
3204 return SCC_TRUE;
3205 case AMDGPU::S_CBRANCH_VCCNZ:
3206 return VCCNZ;
3207 case AMDGPU::S_CBRANCH_VCCZ:
3208 return VCCZ;
3209 case AMDGPU::S_CBRANCH_EXECNZ:
3210 return EXECNZ;
3211 case AMDGPU::S_CBRANCH_EXECZ:
3212 return EXECZ;
3213 default:
3214 return INVALID_BR;
3215 }
3216}
3217
3221 MachineBasicBlock *&FBB,
3223 bool AllowModify) const {
// Inspect the terminator at I (and, for a conditional branch, the one after
// it) and report the branch structure through TBB, FBB and Cond.
//
// Every path that returns false has filled in TBB; every path that returns
// true has given up on the terminator sequence.
//   - S_BRANCH alone:              TBB = target, Cond left as it was.
//   - cond branch, then block end: TBB = target, Cond = {predicate, operand 1}.
//   - cond branch, then S_BRANCH:  TBB = cond target, FBB = S_BRANCH target.
// AllowModify is not read anywhere in this body.
3224 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3225 // Unconditional Branch
3226 TBB = I->getOperand(0).getMBB();
3227 return false;
3228 }
3229
3230 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3231 if (Pred == INVALID_BR)
3232 return true;
3233
// Cond is encoded as two operands: the predicate as an immediate, followed by
// a copy of the branch's operand 1.
3234 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3235 Cond.push_back(MachineOperand::CreateImm(Pred));
3236 Cond.push_back(I->getOperand(1)); // Save the branch register.
3237
3238 ++I;
3239
3240 if (I == MBB.end()) {
3241 // Conditional branch followed by fall-through.
3242 TBB = CondBB;
3243 return false;
3244 }
3245
3246 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3247 TBB = CondBB;
3248 FBB = I->getOperand(0).getMBB();
3249 return false;
3250 }
3251
// Something other than S_BRANCH follows the conditional branch. Note that
// Cond has already been appended to at this point.
3252 return true;
3253}
3254
3256 MachineBasicBlock *&FBB,
3258 bool AllowModify) const {
3259 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3260 auto E = MBB.end();
3261 if (I == E)
3262 return false;
3263
3264 // Skip over the instructions that are artificially terminators for special
3265 // exec management.
3266 while (I != E && !I->isBranch() && !I->isReturn()) {
3267 switch (I->getOpcode()) {
3268 case AMDGPU::S_MOV_B64_term:
3269 case AMDGPU::S_XOR_B64_term:
3270 case AMDGPU::S_OR_B64_term:
3271 case AMDGPU::S_ANDN2_B64_term:
3272 case AMDGPU::S_AND_B64_term:
3273 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3274 case AMDGPU::S_MOV_B32_term:
3275 case AMDGPU::S_XOR_B32_term:
3276 case AMDGPU::S_OR_B32_term:
3277 case AMDGPU::S_ANDN2_B32_term:
3278 case AMDGPU::S_AND_B32_term:
3279 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3280 break;
3281 case AMDGPU::SI_IF:
3282 case AMDGPU::SI_ELSE:
3283 case AMDGPU::SI_KILL_I1_TERMINATOR:
3284 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3285 // FIXME: It's messy that these need to be considered here at all.
3286 return true;
3287 default:
3288 llvm_unreachable("unexpected non-branch terminator inst");
3289 }
3290
3291 ++I;
3292 }
3293
3294 if (I == E)
3295 return false;
3296
3297 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3298}
3299
3301 int *BytesRemoved) const {
3302 unsigned Count = 0;
3303 unsigned RemovedSize = 0;
3304 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3305 // Skip over artificial terminators when removing instructions.
3306 if (MI.isBranch() || MI.isReturn()) {
3307 RemovedSize += getInstSizeInBytes(MI);
3308 MI.eraseFromParent();
3309 ++Count;
3310 }
3311 }
3312
3313 if (BytesRemoved)
3314 *BytesRemoved = RemovedSize;
3315
3316 return Count;
3317}
3318
3319// Copy the flags onto the implicit condition register operand.
3321 const MachineOperand &OrigCond) {
3322 CondReg.setIsUndef(OrigCond.isUndef());
3323 CondReg.setIsKill(OrigCond.isKill());
3324}
3325
3328 MachineBasicBlock *FBB,
3330 const DebugLoc &DL,
3331 int *BytesAdded) const {
3332 if (!FBB && Cond.empty()) {
3333 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3334 .addMBB(TBB);
3335 if (BytesAdded)
3336 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3337 return 1;
3338 }
3339
3340 assert(TBB && Cond[0].isImm());
3341
3342 unsigned Opcode
3343 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3344
3345 if (!FBB) {
3346 MachineInstr *CondBr =
3347 BuildMI(&MBB, DL, get(Opcode))
3348 .addMBB(TBB);
3349
3350 // Copy the flags onto the implicit condition register operand.
3351 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3352 fixImplicitOperands(*CondBr);
3353
3354 if (BytesAdded)
3355 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3356 return 1;
3357 }
3358
3359 assert(TBB && FBB);
3360
3361 MachineInstr *CondBr =
3362 BuildMI(&MBB, DL, get(Opcode))
3363 .addMBB(TBB);
3364 fixImplicitOperands(*CondBr);
3365 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3366 .addMBB(FBB);
3367
3368 MachineOperand &CondReg = CondBr->getOperand(1);
3369 CondReg.setIsUndef(Cond[1].isUndef());
3370 CondReg.setIsKill(Cond[1].isKill());
3371
3372 if (BytesAdded)
3373 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3374
3375 return 2;
3376}
3377
3380 if (Cond.size() != 2) {
3381 return true;
3382 }
3383
3384 if (Cond[0].isImm()) {
3385 Cond[0].setImm(-Cond[0].getImm());
3386 return false;
3387 }
3388
3389 return true;
3390}
3391
3394 Register DstReg, Register TrueReg,
3395 Register FalseReg, int &CondCycles,
3396 int &TrueCycles, int &FalseCycles) const {
3397 switch (Cond[0].getImm()) {
3398 case VCCNZ:
3399 case VCCZ: {
3400 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3401 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3402 if (MRI.getRegClass(FalseReg) != RC)
3403 return false;
3404
3405 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3406 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3407
3408 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3409 return RI.hasVGPRs(RC) && NumInsts <= 6;
3410 }
3411 case SCC_TRUE:
3412 case SCC_FALSE: {
3413 // FIXME: We could insert for VGPRs if we could replace the original compare
3414 // with a vector one.
3415 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3416 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3417 if (MRI.getRegClass(FalseReg) != RC)
3418 return false;
3419
3420 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3421
3422 // An even number of 32-bit pieces can be selected pairwise with s_cselect_b64.
3423 if (NumInsts % 2 == 0)
3424 NumInsts /= 2;
3425
3426 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3427 return RI.isSGPRClass(RC);
3428 }
3429 default:
3430 return false;
3431 }
3432}
3433
3437 Register TrueReg, Register FalseReg) const {
3438 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3439 if (Pred == VCCZ || Pred == SCC_FALSE) {
3440 Pred = static_cast<BranchPredicate>(-Pred);
3441 std::swap(TrueReg, FalseReg);
3442 }
3443
3444 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3445 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3446 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3447
3448 if (DstSize == 32) {
3450 if (Pred == SCC_TRUE) {
3451 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3452 .addReg(TrueReg)
3453 .addReg(FalseReg);
3454 } else {
3455 // Instruction's operands are backwards from what is expected.
3456 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3457 .addReg(FalseReg)
3458 .addReg(TrueReg);
3459 }
3460
3461 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3462 return;
3463 }
3464
3465 if (DstSize == 64 && Pred == SCC_TRUE) {
3467 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3468 .addReg(TrueReg)
3469 .addReg(FalseReg);
3470
3471 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3472 return;
3473 }
3474
3475 static const int16_t Sub0_15[] = {
3476 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3477 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3478 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3479 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3480 };
3481
3482 static const int16_t Sub0_15_64[] = {
3483 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3484 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3485 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3486 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3487 };
3488
3489 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3490 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3491 const int16_t *SubIndices = Sub0_15;
3492 int NElts = DstSize / 32;
3493
3494 // 64-bit select is only available for SALU.
3495 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3496 if (Pred == SCC_TRUE) {
3497 if (NElts % 2) {
3498 SelOp = AMDGPU::S_CSELECT_B32;
3499 EltRC = &AMDGPU::SGPR_32RegClass;
3500 } else {
3501 SelOp = AMDGPU::S_CSELECT_B64;
3502 EltRC = &AMDGPU::SGPR_64RegClass;
3503 SubIndices = Sub0_15_64;
3504 NElts /= 2;
3505 }
3506 }
3507
3509 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3510
3511 I = MIB->getIterator();
3512
3514 for (int Idx = 0; Idx != NElts; ++Idx) {
3515 Register DstElt = MRI.createVirtualRegister(EltRC);
3516 Regs.push_back(DstElt);
3517
3518 unsigned SubIdx = SubIndices[Idx];
3519
3521 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3522 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3523 .addReg(FalseReg, {}, SubIdx)
3524 .addReg(TrueReg, {}, SubIdx);
3525 } else {
3526 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3527 .addReg(TrueReg, {}, SubIdx)
3528 .addReg(FalseReg, {}, SubIdx);
3529 }
3530
3531 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3533
3534 MIB.addReg(DstElt)
3535 .addImm(SubIdx);
3536 }
3537}
3538
3540
3541 if (MI.isBranch() || MI.isCall() || MI.isReturn() || MI.isIndirectBranch())
3542 return true;
3543
3544 switch (MI.getOpcode()) {
3545 case AMDGPU::S_ENDPGM:
3546 case AMDGPU::S_ENDPGM_SAVED:
3547 case AMDGPU::S_TRAP:
3548 case AMDGPU::S_GETREG_B32:
3549 case AMDGPU::S_SETREG_B32:
3550 case AMDGPU::S_SETREG_B32_mode:
3551 case AMDGPU::S_SETREG_IMM32_B32:
3552 case AMDGPU::S_SETREG_IMM32_B32_mode:
3553 case AMDGPU::S_SENDMSG:
3554 case AMDGPU::S_SENDMSGHALT:
3555 case AMDGPU::S_SENDMSG_RTN_B32:
3556 case AMDGPU::S_SENDMSG_RTN_B64:
3557 case AMDGPU::S_BARRIER_WAIT:
3558 case AMDGPU::S_BARRIER_SIGNAL_M0:
3559 case AMDGPU::S_BARRIER_SIGNAL_IMM:
3560 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
3561 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
3562 return true;
3563 default:
3564 return false;
3565 }
3566}
3567
3569 switch (MI.getOpcode()) {
3570 case AMDGPU::V_MOV_B16_t16_e32:
3571 case AMDGPU::V_MOV_B16_t16_e64:
3572 case AMDGPU::V_MOV_B32_e32:
3573 case AMDGPU::V_MOV_B32_e64:
3574 case AMDGPU::V_MOV_B64_PSEUDO:
3575 case AMDGPU::V_MOV_B64_e32:
3576 case AMDGPU::V_MOV_B64_e64:
3577 case AMDGPU::S_MOV_B32:
3578 case AMDGPU::S_MOV_B64:
3579 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3580 case AMDGPU::COPY:
3581 case AMDGPU::WWM_COPY:
3582 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3583 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3584 case AMDGPU::V_ACCVGPR_MOV_B32:
3585 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3586 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3587 return true;
3588 default:
3589 return false;
3590 }
3591}
3592
3594 switch (MI.getOpcode()) {
3595 case AMDGPU::V_MOV_B16_t16_e32:
3596 case AMDGPU::V_MOV_B16_t16_e64:
3597 return 2;
3598 case AMDGPU::V_MOV_B32_e32:
3599 case AMDGPU::V_MOV_B32_e64:
3600 case AMDGPU::V_MOV_B64_PSEUDO:
3601 case AMDGPU::V_MOV_B64_e32:
3602 case AMDGPU::V_MOV_B64_e64:
3603 case AMDGPU::S_MOV_B32:
3604 case AMDGPU::S_MOV_B64:
3605 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3606 case AMDGPU::COPY:
3607 case AMDGPU::WWM_COPY:
3608 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3609 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3610 case AMDGPU::V_ACCVGPR_MOV_B32:
3611 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3612 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3613 return 1;
3614 default:
3615 llvm_unreachable("MI is not a foldable copy");
3616 }
3617}
3618
// Operand names (source modifiers, clamp, omod, op_sel) walked in reverse by
// the operand-removal loop that follows: each one that the opcode actually
// has is looked up by name and removed from the instruction.
3619static constexpr AMDGPU::OpName ModifierOpNames[] = {
3620 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3621 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3622 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3623
3625 unsigned Opc = MI.getOpcode();
3626 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3627 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3628 if (Idx >= 0)
3629 MI.removeOperand(Idx);
3630 }
3631}
3632
3634 const MCInstrDesc &NewDesc) const {
3635 MI.setDesc(NewDesc);
3636
3637 // Remove any leftover implicit operands from mutating the instruction. e.g.
3638 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3639 // anymore.
3640 const MCInstrDesc &Desc = MI.getDesc();
3641 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3642 Desc.implicit_defs().size();
3643
3644 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3645 MI.removeOperand(I);
3646}
3647
3648std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3649 unsigned SubRegIndex) {
3650 switch (SubRegIndex) {
3651 case AMDGPU::NoSubRegister:
3652 return Imm;
3653 case AMDGPU::sub0:
3654 return SignExtend64<32>(Imm);
3655 case AMDGPU::sub1:
3656 return SignExtend64<32>(Imm >> 32);
3657 case AMDGPU::lo16:
3658 return SignExtend64<16>(Imm);
3659 case AMDGPU::hi16:
3660 return SignExtend64<16>(Imm >> 16);
3661 case AMDGPU::sub1_lo16:
3662 return SignExtend64<16>(Imm >> 32);
3663 case AMDGPU::sub1_hi16:
3664 return SignExtend64<16>(Imm >> 48);
3665 default:
3666 return std::nullopt;
3667 }
3668
3669 llvm_unreachable("covered subregister switch");
3670}
3671
3672static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3673 switch (Opc) {
3674 case AMDGPU::V_MAC_F16_e32:
3675 case AMDGPU::V_MAC_F16_e64:
3676 case AMDGPU::V_MAD_F16_e64:
3677 return AMDGPU::V_MADAK_F16;
3678 case AMDGPU::V_MAC_F32_e32:
3679 case AMDGPU::V_MAC_F32_e64:
3680 case AMDGPU::V_MAD_F32_e64:
3681 return AMDGPU::V_MADAK_F32;
3682 case AMDGPU::V_FMAC_F32_e32:
3683 case AMDGPU::V_FMAC_F32_e64:
3684 case AMDGPU::V_FMA_F32_e64:
3685 return AMDGPU::V_FMAAK_F32;
3686 case AMDGPU::V_FMAC_F16_e32:
3687 case AMDGPU::V_FMAC_F16_e64:
3688 case AMDGPU::V_FMAC_F16_t16_e64:
3689 case AMDGPU::V_FMAC_F16_fake16_e64:
3690 case AMDGPU::V_FMAC_F16_t16_e32:
3691 case AMDGPU::V_FMAC_F16_fake16_e32:
3692 case AMDGPU::V_FMA_F16_e64:
3693 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3694 ? AMDGPU::V_FMAAK_F16_t16
3695 : AMDGPU::V_FMAAK_F16_fake16
3696 : AMDGPU::V_FMAAK_F16;
3697 case AMDGPU::V_FMAC_F64_e32:
3698 case AMDGPU::V_FMAC_F64_e64:
3699 case AMDGPU::V_FMA_F64_e64:
3700 return AMDGPU::V_FMAAK_F64;
3701 default:
3702 llvm_unreachable("invalid instruction");
3703 }
3704}
3705
3706static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3707 switch (Opc) {
3708 case AMDGPU::V_MAC_F16_e32:
3709 case AMDGPU::V_MAC_F16_e64:
3710 case AMDGPU::V_MAD_F16_e64:
3711 return AMDGPU::V_MADMK_F16;
3712 case AMDGPU::V_MAC_F32_e32:
3713 case AMDGPU::V_MAC_F32_e64:
3714 case AMDGPU::V_MAD_F32_e64:
3715 return AMDGPU::V_MADMK_F32;
3716 case AMDGPU::V_FMAC_F32_e32:
3717 case AMDGPU::V_FMAC_F32_e64:
3718 case AMDGPU::V_FMA_F32_e64:
3719 return AMDGPU::V_FMAMK_F32;
3720 case AMDGPU::V_FMAC_F16_e32:
3721 case AMDGPU::V_FMAC_F16_e64:
3722 case AMDGPU::V_FMAC_F16_t16_e64:
3723 case AMDGPU::V_FMAC_F16_fake16_e64:
3724 case AMDGPU::V_FMAC_F16_t16_e32:
3725 case AMDGPU::V_FMAC_F16_fake16_e32:
3726 case AMDGPU::V_FMA_F16_e64:
3727 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3728 ? AMDGPU::V_FMAMK_F16_t16
3729 : AMDGPU::V_FMAMK_F16_fake16
3730 : AMDGPU::V_FMAMK_F16;
3731 case AMDGPU::V_FMAC_F64_e32:
3732 case AMDGPU::V_FMAC_F64_e64:
3733 case AMDGPU::V_FMA_F64_e64:
3734 return AMDGPU::V_FMAMK_F64;
3735 default:
3736 llvm_unreachable("invalid instruction");
3737 }
3738}
3739
3741 Register Reg, MachineRegisterInfo *MRI) const {
3742 int64_t Imm;
3743 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3744 return false;
3745
3746 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3747
3748 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3749
3750 unsigned Opc = UseMI.getOpcode();
3751 if (Opc == AMDGPU::COPY) {
3752 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3753
3754 Register DstReg = UseMI.getOperand(0).getReg();
3755 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3756
3757 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3758
3759 if (HasMultipleUses) {
3760 // TODO: This should fold in more cases with multiple use, but we need to
3761 // more carefully consider what those uses are.
3762 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3763
3764 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3765 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3766 return false;
3767
3768 // Most of the time folding a 32-bit inline constant is free (though this
3769 // might not be true if we can't later fold it into a real user).
3770 //
3771 // FIXME: This isInlineConstant check is imprecise if
3772 // getConstValDefinedInReg handled the tricky non-mov cases.
3773 if (ImmDefSize == 32 &&
3775 return false;
3776 }
3777
3778 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3779 RI.getSubRegIdxSize(UseSubReg) == 16;
3780
3781 if (Is16Bit) {
3782 if (RI.hasVGPRs(DstRC))
3783 return false; // Do not clobber vgpr_hi16
3784
3785 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3786 return false;
3787 }
3788
3789 MachineFunction *MF = UseMI.getMF();
3790
3791 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3792 MCRegister MovDstPhysReg =
3793 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3794
3795 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3796
3797 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3798 for (unsigned MovOp :
3799 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3800 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3801 const MCInstrDesc &MovDesc = get(MovOp);
3802
3803 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3804 if (Is16Bit) {
3805 // We just need to find a correctly sized register class, so the
3806 // subregister index compatibility doesn't matter since we're statically
3807 // extracting the immediate value.
3808 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3809 if (!MovDstRC)
3810 continue;
3811
3812 if (MovDstPhysReg) {
3813 // FIXME: We probably should not do this. If there is a live value in
3814 // the high half of the register, it will be corrupted.
3815 MovDstPhysReg =
3816 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3817 if (!MovDstPhysReg)
3818 continue;
3819 }
3820 }
3821
3822 // Result class isn't the right size, try the next instruction.
3823 if (MovDstPhysReg) {
3824 if (!MovDstRC->contains(MovDstPhysReg))
3825 return false;
3826 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3827 // TODO: This will be overly conservative in the case of 16-bit virtual
3828 // SGPRs. We could hack up the virtual register uses to use a compatible
3829 // 32-bit class.
3830 continue;
3831 }
3832
3833 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3834
3835 // Ensure the interpreted immediate value is a valid operand in the new
3836 // mov.
3837 //
3838 // FIXME: isImmOperandLegal should have form that doesn't require existing
3839 // MachineInstr or MachineOperand
3840 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3841 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3842 break;
3843
3844 NewOpc = MovOp;
3845 break;
3846 }
3847
3848 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3849 return false;
3850
3851 if (Is16Bit) {
3852 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3853 if (MovDstPhysReg)
3854 UseMI.getOperand(0).setReg(MovDstPhysReg);
3855 assert(UseMI.getOperand(1).getReg().isVirtual());
3856 }
3857
3858 const MCInstrDesc &NewMCID = get(NewOpc);
3859 UseMI.setDesc(NewMCID);
3860 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3861 UseMI.addImplicitDefUseOperands(*MF);
3862 return true;
3863 }
3864
3865 if (HasMultipleUses)
3866 return false;
3867
3868 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3869 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3870 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3871 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3872 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3873 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3874 Opc == AMDGPU::V_FMAC_F64_e64) {
3875 // Don't fold if we are using source or output modifiers. The new VOP2
3876 // instructions don't have them.
3878 return false;
3879
3880 // If this is a free constant, there's no reason to do this.
3881 // TODO: We could fold this here instead of letting SIFoldOperands do it
3882 // later.
3883 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3884
3885 // Any src operand can be used for the legality check.
3886 if (isInlineConstant(UseMI, Src0Idx, Imm))
3887 return false;
3888
3889 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3890
3891 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3892 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3893
3894 auto CopyRegOperandToNarrowerRC =
3895 [MRI, this](MachineInstr &MI, unsigned OpNo,
3896 const TargetRegisterClass *NewRC) -> void {
3897 if (!MI.getOperand(OpNo).isReg())
3898 return;
3899 Register Reg = MI.getOperand(OpNo).getReg();
3900 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3901 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3902 return;
3903 Register Tmp = MRI->createVirtualRegister(NewRC);
3904 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3905 get(AMDGPU::COPY), Tmp)
3906 .addReg(Reg);
3907 MI.getOperand(OpNo).setReg(Tmp);
3908 MI.getOperand(OpNo).setIsKill();
3909 };
3910
3911 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3912 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3913 (Src1->isReg() && Src1->getReg() == Reg)) {
3914 MachineOperand *RegSrc =
3915 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3916 if (!RegSrc->isReg())
3917 return false;
3918 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3919 ST.getConstantBusLimit(Opc) < 2)
3920 return false;
3921
3922 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3923 return false;
3924
3925 // If src2 is also a literal constant then we have to choose which one to
3926 // fold. In general it is better to choose madak so that the other literal
3927 // can be materialized in an sgpr instead of a vgpr:
3928 // s_mov_b32 s0, literal
3929 // v_madak_f32 v0, s0, v0, literal
3930 // Instead of:
3931 // v_mov_b32 v1, literal
3932 // v_madmk_f32 v0, v0, literal, v1
3933 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3934 if (Def && Def->isMoveImmediate() &&
3935 !isInlineConstant(Def->getOperand(1)))
3936 return false;
3937
3938 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3939 if (pseudoToMCOpcode(NewOpc) == -1)
3940 return false;
3941
3942 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3943 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3944
3945 // FIXME: This would be a lot easier if we could return a new instruction
3946 // instead of having to modify in place.
3947
3948 Register SrcReg = RegSrc->getReg();
3949 unsigned SrcSubReg = RegSrc->getSubReg();
3950 Src0->setReg(SrcReg);
3951 Src0->setSubReg(SrcSubReg);
3952 Src0->setIsKill(RegSrc->isKill());
3953
3954 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3955 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3956 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3957 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3958 UseMI.untieRegOperand(
3959 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3960
3961 Src1->ChangeToImmediate(*SubRegImm);
3962
3964 UseMI.setDesc(get(NewOpc));
3965
3966 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3967 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3968 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3969 Register Tmp = MRI->createVirtualRegister(NewRC);
3970 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3971 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3972 UseMI.getOperand(0).getReg())
3973 .addReg(Tmp, RegState::Kill);
3974 UseMI.getOperand(0).setReg(Tmp);
3975 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3976 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3977 }
3978
3979 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3980 if (DeleteDef)
3981 DefMI.eraseFromParent();
3982
3983 return true;
3984 }
3985
3986 // Added part is the constant: Use v_madak_{f16, f32}.
3987 if (Src2->isReg() && Src2->getReg() == Reg) {
3988 if (ST.getConstantBusLimit(Opc) < 2) {
3989 // Not allowed to use constant bus for another operand.
3990 // We can however allow an inline immediate as src0.
3991 bool Src0Inlined = false;
3992 if (Src0->isReg()) {
3993 // Try to inline constant if possible.
3994 // If the def is a move-immediate with a single use, folding it here
3995 // saves a VGPR.
3996 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3997 if (Def && Def->isMoveImmediate() &&
3998 isInlineConstant(Def->getOperand(1)) &&
3999 MRI->hasOneNonDBGUse(Src0->getReg())) {
4000 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
4001 Src0Inlined = true;
4002 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
4003 RI.isSGPRReg(*MRI, Src0->getReg())) {
4004 return false;
4005 }
4006 // VGPR is okay as Src0 - fallthrough
4007 }
4008
4009 if (Src1->isReg() && !Src0Inlined) {
4010 // We have one slot for inlinable constant so far - try to fill it
4011 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
4012 if (Def && Def->isMoveImmediate() &&
4013 isInlineConstant(Def->getOperand(1)) &&
4014 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
4015 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
4016 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
4017 return false;
4018 // VGPR is okay as Src1 - fallthrough
4019 }
4020 }
4021
4022 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4023 if (pseudoToMCOpcode(NewOpc) == -1)
4024 return false;
4025
4026 // FIXME: This would be a lot easier if we could return a new instruction
4027 // instead of having to modify in place.
4028
4029 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
4030 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
4031 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
4032 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
4033 UseMI.untieRegOperand(
4034 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
4035
4036 const std::optional<int64_t> SubRegImm =
4037 extractSubregFromImm(Imm, Src2->getSubReg());
4038
4039 // ChangingToImmediate adds Src2 back to the instruction.
4040 Src2->ChangeToImmediate(*SubRegImm);
4041
4042 // These come before src2.
4044 UseMI.setDesc(get(NewOpc));
4045
4046 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
4047 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
4048 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
4049 Register Tmp = MRI->createVirtualRegister(NewRC);
4050 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
4051 UseMI.getDebugLoc(), get(AMDGPU::COPY),
4052 UseMI.getOperand(0).getReg())
4053 .addReg(Tmp, RegState::Kill);
4054 UseMI.getOperand(0).setReg(Tmp);
4055 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
4056 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
4057 }
4058
4059 // It might happen that UseMI was commuted
4060 // and we now have SGPR as SRC1. If so 2 inlined
4061 // constant and SGPR are illegal.
4063
4064 bool DeleteDef = MRI->use_nodbg_empty(Reg);
4065 if (DeleteDef)
4066 DefMI.eraseFromParent();
4067
4068 return true;
4069 }
4070 }
4071
4072 return false;
4073}
4074
// Compares two base-operand lists element by element. Returns true only when
// both lists have the same length and every pair of corresponding operands
// satisfies isIdenticalTo(); returns false at the first mismatch.
4075 static bool
4078 if (BaseOps1.size() != BaseOps2.size())
4079 return false;
// Lengths match, so a single index can walk both lists in lockstep.
4080 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4081 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4082 return false;
4083 }
4084 return true;
4085 }
4086
4087static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4088 LocationSize WidthB, int OffsetB) {
4089 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4090 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4091 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4092 return LowWidth.hasValue() &&
4093 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4094}
4095
4096bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4097 const MachineInstr &MIb) const {
4098 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4099 int64_t Offset0, Offset1;
4100 LocationSize Dummy0 = LocationSize::precise(0);
4101 LocationSize Dummy1 = LocationSize::precise(0);
4102 bool Offset0IsScalable, Offset1IsScalable;
4103 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4104 Dummy0, &RI) ||
4105 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4106 Dummy1, &RI))
4107 return false;
4108
4109 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4110 return false;
4111
4112 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4113 // FIXME: Handle ds_read2 / ds_write2.
4114 return false;
4115 }
4116 LocationSize Width0 = MIa.memoperands().front()->getSize();
4117 LocationSize Width1 = MIb.memoperands().front()->getSize();
4118 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4119}
4120
4122 const MachineInstr &MIb) const {
// Conservative disjointness query for two memory instructions: returns true
// only when the checks below establish that the accesses cannot overlap, and
// false whenever that cannot be shown. Both instructions must load or store.
4123 assert(MIa.mayLoadOrStore() &&
4124 "MIa must load from or modify a memory location");
4125 assert(MIb.mayLoadOrStore() &&
4126 "MIb must load from or modify a memory location");
4127
4129 return false;
4130
4131 // XXX - Can we relax this between address spaces?
4132 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4133 return false;
4134
// LDS-DMA instructions and bundles are never reported as disjoint.
4135 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4136 return false;
4137
4138 if (MIa.isBundle() || MIb.isBundle())
4139 return false;
4140
4141 // TODO: Should we check the address space from the MachineMemOperand? That
4142 // would allow us to distinguish objects we know don't alias based on the
4143 // underlying address space, even if it was lowered to a different one,
4144 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4145 // buffer.
// From here on the answer depends on the instruction class of MIa, then MIb.
// DS vs DS: compare offsets. DS vs anything else: disjoint unless MIb is a
// FLAT instruction that is not segment-specific.
4146 if (isDS(MIa)) {
4147 if (isDS(MIb))
4148 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4149
4150 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4151 }
4152
// Buffer (MUBUF/MTBUF) vs buffer: compare offsets. Buffer vs FLAT: disjoint
// only for FLAT scratch. Buffer vs the rest: disjoint unless MIb is SMRD.
4153 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4154 if (isMUBUF(MIb) || isMTBUF(MIb))
4155 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4156
4157 if (isFLAT(MIb))
4158 return isFLATScratch(MIb);
4159
4160 return !isSMRD(MIb);
4161 }
4162
// SMRD vs SMRD: compare offsets. SMRD vs FLAT: disjoint only for FLAT scratch.
// SMRD vs the rest: disjoint unless MIb is a buffer instruction.
4163 if (isSMRD(MIa)) {
4164 if (isSMRD(MIb))
4165 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4166
4167 if (isFLAT(MIb))
4168 return isFLATScratch(MIb);
4169
4170 return !isMUBUF(MIb) && !isMTBUF(MIb);
4171 }
4172
// FLAT vs FLAT: a scratch/global pairing is disjoint outright; otherwise
// compare offsets. FLAT vs any non-FLAT instruction is not proven disjoint.
4173 if (isFLAT(MIa)) {
4174 if (isFLAT(MIb)) {
4175 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4176 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4177 return true;
4178
4179 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4180 }
4181
4182 return false;
4183 }
4184
// Any other instruction class for MIa: make no claim.
4185 return false;
4186 }
4187
4189 int64_t &Imm, MachineInstr **DefMI = nullptr) {
// Looks for an immediate that defines the virtual register Reg. On success
// writes the value to Imm, stores the defining instruction through DefMI when
// DefMI is non-null, and returns true. Physical registers, registers without
// a unique def, and defs that are not a foldable copy of an immediate all
// return false and leave Imm untouched.
4190 if (Reg.isPhysical())
4191 return false;
4192 auto *Def = MRI.getUniqueVRegDef(Reg);
// Operand 1 of the def is the candidate source; it must be an immediate.
4193 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4194 Imm = Def->getOperand(1).getImm();
4195 if (DefMI)
4196 *DefMI = Def;
4197 return true;
4198 }
4199 return false;
4200 }
4201
4202static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4203 MachineInstr **DefMI = nullptr) {
4204 if (!MO->isReg())
4205 return false;
4206 const MachineFunction *MF = MO->getParent()->getMF();
4207 const MachineRegisterInfo &MRI = MF->getRegInfo();
4208 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4209}
4210
4212 MachineInstr &NewMI) {
// When live-variable info (LV) is available, hands every kill recorded on MI
// over to NewMI. Does nothing when LV is null.
4213 if (LV) {
4214 unsigned NumOps = MI.getNumOperands();
// The scan starts at operand 1, so operand 0 is never examined.
4215 for (unsigned I = 1; I < NumOps; ++I) {
4216 MachineOperand &Op = MI.getOperand(I);
4217 if (Op.isReg() && Op.isKill())
4218 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4219 }
4220 }
4221 }
4222
4223static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4224 switch (Opc) {
4225 case AMDGPU::V_MAC_F16_e32:
4226 case AMDGPU::V_MAC_F16_e64:
4227 return AMDGPU::V_MAD_F16_e64;
4228 case AMDGPU::V_MAC_F32_e32:
4229 case AMDGPU::V_MAC_F32_e64:
4230 return AMDGPU::V_MAD_F32_e64;
4231 case AMDGPU::V_MAC_LEGACY_F32_e32:
4232 case AMDGPU::V_MAC_LEGACY_F32_e64:
4233 return AMDGPU::V_MAD_LEGACY_F32_e64;
4234 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4235 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4236 return AMDGPU::V_FMA_LEGACY_F32_e64;
4237 case AMDGPU::V_FMAC_F16_e32:
4238 case AMDGPU::V_FMAC_F16_e64:
4239 case AMDGPU::V_FMAC_F16_t16_e64:
4240 case AMDGPU::V_FMAC_F16_fake16_e64:
4241 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4242 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4243 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4244 : AMDGPU::V_FMA_F16_gfx9_e64;
4245 case AMDGPU::V_FMAC_F32_e32:
4246 case AMDGPU::V_FMAC_F32_e64:
4247 return AMDGPU::V_FMA_F32_e64;
4248 case AMDGPU::V_FMAC_F64_e32:
4249 case AMDGPU::V_FMAC_F64_e64:
4250 return AMDGPU::V_FMA_F64_e64;
4251 default:
4252 llvm_unreachable("invalid instruction");
4253 }
4254}
4255
4256/// Helper struct for the implementation of 3-address conversion to communicate
4257/// updates made to instruction operands.
4259 /// Other instruction whose def is no longer used by the converted
4260 /// instruction.
4262};
4263
4265 LiveVariables *LV,
4266 LiveIntervals *LIS) const {
// Driver around convertToThreeAddressImpl. Converts MI (or, for a bundle, the
// instruction that follows the bundle header), then updates LV / LIS when
// they are provided and neutralises the instruction recorded in
// U.RemoveMIUse. Returns nullptr when MI is a bundle whose size is not 1 or
// when the Impl produces no replacement. Otherwise returns the bundle header
// MI for a bundle and the new instruction for a plain instruction.
4267 MachineBasicBlock &MBB = *MI.getParent();
4268 MachineInstr *CandidateMI = &MI;
4269
4270 if (MI.isBundle()) {
4271 // This is a temporary placeholder for bundle handling that enables us to
4272 // exercise the relevant code paths in the two-address instruction pass.
4273 if (MI.getBundleSize() != 1)
4274 return nullptr;
4275 CandidateMI = MI.getNextNode();
4276 }
4277
4279 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4280 if (!NewMI)
4281 return nullptr;
4282
// Bundle: drop the old bundled instruction and untie any tied defs on the
// header. Plain instruction: move kill flags and live-interval bookkeeping
// over to NewMI.
4283 if (MI.isBundle()) {
4284 CandidateMI->eraseFromBundle();
4285
4286 for (MachineOperand &MO : MI.all_defs()) {
4287 if (MO.isTied())
4288 MI.untieRegOperand(MO.getOperandNo());
4289 }
4290 } else {
4291 updateLiveVariables(LV, MI, *NewMI);
4292 if (LIS) {
4293 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4294 // SlotIndex of defs needs to be updated when converting to early-clobber
4295 MachineOperand &Def = NewMI->getOperand(0);
4296 if (Def.isEarlyClobber() && Def.isReg() &&
4297 LIS->hasInterval(Def.getReg())) {
4298 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4299 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4300 auto &LI = LIS->getInterval(Def.getReg());
// Retargets a segment that starts exactly at OldIndex (and its value number)
// to NewIndex; ranges without such a segment are left alone.
4301 auto UpdateDefIndex = [&](LiveRange &LR) {
4302 auto *S = LR.find(OldIndex);
4303 if (S != LR.end() && S->start == OldIndex) {
4304 assert(S->valno && S->valno->def == OldIndex);
4305 S->start = NewIndex;
4306 S->valno->def = NewIndex;
4307 }
4308 };
4309 UpdateDefIndex(LI);
4310 for (auto &SR : LI.subranges())
4311 UpdateDefIndex(SR);
4312 }
4313 }
4314 }
4315
// The Impl reports through U.RemoveMIUse an instruction whose result the
// converted instruction no longer reads.
4316 if (U.RemoveMIUse) {
4317 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4318 // The only user is the instruction which will be killed.
4319 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4320
4321 if (MRI.hasOneNonDBGUse(DefReg)) {
4322 // We cannot just remove the DefMI here, calling pass will crash.
4323 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4324 U.RemoveMIUse->getOperand(0).setIsDead(true);
// Strip every operand except operand 0, walking from the back.
4325 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4326 U.RemoveMIUse->removeOperand(I);
4327 if (LV)
4328 LV->getVarInfo(DefReg).AliveBlocks.clear();
4329 }
4330
4331 if (MI.isBundle()) {
4332 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
// If nothing inside the bundle touches DefReg any more, remove the first
// matching use operand from the bundle header as well.
4333 if (!VRI.Reads && !VRI.Writes) {
4334 for (MachineOperand &MO : MI.all_uses()) {
4335 if (MO.isReg() && MO.getReg() == DefReg) {
4336 assert(MO.getSubReg() == 0 &&
4337 "tied sub-registers in bundles currently not supported");
4338 MI.removeOperand(MO.getOperandNo());
4339 break;
4340 }
4341 }
4342
4343 if (LIS)
4344 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4345 }
4346 } else if (LIS) {
4347 LiveInterval &DefLI = LIS->getInterval(DefReg);
4348
4349 // We cannot delete the original instruction here, so hack out the use
4350 // in the original instruction with a dummy register so we can use
4351 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4352 // not have the complexity of deleting a use to consider here.
4353 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4354 for (MachineOperand &MIOp : MI.uses()) {
4355 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4356 MIOp.setIsUndef(true);
4357 MIOp.setReg(DummyReg);
4358 }
4359 }
4360
// NOTE(review): this branch is nested in the else-arm of the MI.isBundle()
// test above, so it looks unreachable as written - confirm and consider
// removing it.
4361 if (MI.isBundle()) {
4362 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4363 if (!VRI.Reads && !VRI.Writes) {
4364 for (MachineOperand &MIOp : MI.uses()) {
4365 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4366 MIOp.setIsUndef(true);
4367 MIOp.setReg(DummyReg);
4368 }
4369 }
4370 }
4371
4372 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4373 false, /*isUndef=*/true));
4374 }
4375
4376 LIS->shrinkToUses(&DefLI);
4377 }
4378 }
4379
4380 return MI.isBundle() ? &MI : NewMI;
4381 }
4382
// Builds the three-address replacement for MI with BuildMI at MI's position
// and returns it, or returns nullptr when no conversion applies. Three
// families are handled in order:
//  - opcodes with an MFMA early-clobber counterpart,
//  - WMMA instructions,
//  - MAC/FMAC opcodes, which become an FMAAK/FMAMK form when a foldable
//    immediate is found and the opcode chosen by getNewFMAInst otherwise.
// When an immediate is folded, U.RemoveMIUse is set to the instruction that
// defined it (nullptr when the immediate was a literal src0 of MI itself).
4384 SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4385 ThreeAddressUpdates &U) const {
4386 MachineBasicBlock &MBB = *MI.getParent();
4387 unsigned Opc = MI.getOpcode();
4388
4389 // Handle MFMA.
4390 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4391 if (NewMFMAOpc != -1) {
4393 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
// Only the explicit operands are copied over.
4394 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4395 MIB.add(MI.getOperand(I));
4396 return MIB;
4397 }
4398
4399 if (SIInstrInfo::isWMMA(MI)) {
4400 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4401 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4402 .setMIFlags(MI.getFlags());
4403 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4404 MIB->addOperand(MI.getOperand(I));
4405 return MIB;
4406 }
4407
4408 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4409 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4410 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4411 "present pre-RA");
4412
4413 // Handle MAC/FMAC.
4414 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4415 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4416 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4417 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4418 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
// Set when src0 is an immediate that is not an inline constant (e32 forms).
4419 bool Src0Literal = false;
4420
// Filter: anything that is not one of the listed MAC/FMAC opcodes is left
// alone. The e32 forms additionally require src0 to be a register or an
// immediate.
4421 switch (Opc) {
4422 default:
4423 return nullptr;
4424 case AMDGPU::V_MAC_F16_e64:
4425 case AMDGPU::V_FMAC_F16_e64:
4426 case AMDGPU::V_FMAC_F16_t16_e64:
4427 case AMDGPU::V_FMAC_F16_fake16_e64:
4428 case AMDGPU::V_MAC_F32_e64:
4429 case AMDGPU::V_MAC_LEGACY_F32_e64:
4430 case AMDGPU::V_FMAC_F32_e64:
4431 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4432 case AMDGPU::V_FMAC_F64_e64:
4433 break;
4434 case AMDGPU::V_MAC_F16_e32:
4435 case AMDGPU::V_FMAC_F16_e32:
4436 case AMDGPU::V_MAC_F32_e32:
4437 case AMDGPU::V_MAC_LEGACY_F32_e32:
4438 case AMDGPU::V_FMAC_F32_e32:
4439 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4440 case AMDGPU::V_FMAC_F64_e32: {
4441 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4442 AMDGPU::OpName::src0);
4443 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4444 if (!Src0->isReg() && !Src0->isImm())
4445 return nullptr;
4446
4447 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4448 Src0Literal = true;
4449
4450 break;
4451 }
4452 }
4453
// Named operand lookups; the modifier, clamp, omod and op_sel pointers are
// tested for null below before use.
4454 MachineInstrBuilder MIB;
4455 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4456 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4457 const MachineOperand *Src0Mods =
4458 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4459 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4460 const MachineOperand *Src1Mods =
4461 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4462 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4463 const MachineOperand *Src2Mods =
4464 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4465 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4466 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4467 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4468
// The immediate-folding forms are only tried when MI has no modifier, clamp
// or omod operands, is not a legacy opcode, and (for F64) the subtarget
// reports hasFmaakFmamkF64Insts().
4469 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4470 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4471 // If we have an SGPR input, we will violate the constant bus restriction.
4472 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4473 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4474 MachineInstr *DefMI;
4475
4476 int64_t Imm;
// Attempt 1: the addend (src2) is a foldable immediate.
4477 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4478 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4479 if (pseudoToMCOpcode(NewOpc) != -1) {
4480 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4481 .add(*Dst)
4482 .add(*Src0)
4483 .add(*Src1)
4484 .addImm(Imm)
4485 .setMIFlags(MI.getFlags());
4486 U.RemoveMIUse = DefMI;
4487 return MIB;
4488 }
4489 }
// Attempt 2: src1 is a foldable immediate.
4490 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4491 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4492 if (pseudoToMCOpcode(NewOpc) != -1) {
4493 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4494 .add(*Dst)
4495 .add(*Src0)
4496 .addImm(Imm)
4497 .add(*Src2)
4498 .setMIFlags(MI.getFlags());
4499 U.RemoveMIUse = DefMI;
4500 return MIB;
4501 }
4502 }
// Attempt 3: src0 is a literal or a foldable immediate; src1 then takes the
// first source slot of the new instruction.
4503 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4504 if (Src0Literal) {
4505 Imm = Src0->getImm();
4506 DefMI = nullptr;
4507 }
4508 if (pseudoToMCOpcode(NewOpc) != -1 &&
4510 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4511 Src1)) {
4512 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4513 .add(*Dst)
4514 .add(*Src1)
4515 .addImm(Imm)
4516 .add(*Src2)
4517 .setMIFlags(MI.getFlags());
4518 U.RemoveMIUse = DefMI;
4519 return MIB;
4520 }
4521 }
4522 }
4523
4524 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4525 // if VOP3 does not allow a literal operand.
4526 if (Src0Literal && !ST.hasVOP3Literal())
4527 return nullptr;
4528
4529 unsigned NewOpc = getNewFMAInst(ST, Opc);
4530
4531 if (pseudoToMCOpcode(NewOpc) == -1)
4532 return nullptr;
4533
// Fallback: emit the full form; operands MI does not have are filled with 0.
4534 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4535 .add(*Dst)
4536 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4537 .add(*Src0)
4538 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4539 .add(*Src1)
4540 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4541 .add(*Src2)
4542 .addImm(Clamp ? Clamp->getImm() : 0)
4543 .addImm(Omod ? Omod->getImm() : 0)
4544 .setMIFlags(MI.getFlags());
// op_sel is appended only when the new opcode declares that operand.
4545 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4546 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4547 return MIB;
4548 }
4549
4550 // It's not generally safe to move VALU instructions across these since it will
4551 // start using the register as a base index rather than directly.
4552 // XXX - Why isn't hasSideEffects sufficient for these?
// Returns true exactly for the three S_SET_GPR_IDX_* opcodes listed below and
// false for every other opcode.
4554 switch (MI.getOpcode()) {
4555 case AMDGPU::S_SET_GPR_IDX_ON:
4556 case AMDGPU::S_SET_GPR_IDX_MODE:
4557 case AMDGPU::S_SET_GPR_IDX_OFF:
4558 return true;
4559 default:
4560 return false;
4561 }
4562 }
4563
4565 const MachineBasicBlock *MBB,
4566 const MachineFunction &MF) const {
// Reports whether MI must be treated as a scheduling boundary. The checks
// below look only at MI itself: terminators, position markers, INLINEASM_BR,
// a SCHED_BARRIER whose immediate is 0, writes to EXEC, and the listed
// S_SETREG / S_SETPRIO opcodes.
4567 // Skipping the check for SP writes in the base implementation. The reason it
4568 // was added was apparently due to compile time concerns.
4569 //
4570 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4571 // but is probably avoidable.
4572
4573 // Copied from base implementation.
4574 // Terminators and labels can't be scheduled around.
4575 if (MI.isTerminator() || MI.isPosition())
4576 return true;
4577
4578 // INLINEASM_BR can jump to another block
4579 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4580 return true;
4581
// Only a SCHED_BARRIER with an immediate of 0 is a boundary here; other
// immediates fall through to the remaining checks.
4582 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4583 return true;
4584
4585 // Target-independent instructions do not have an implicit-use of EXEC, even
4586 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4587 // boundaries prevents incorrect movements of such instructions.
4588 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4589 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4590 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4591 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4592 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4594 }
4595
// True for DS_ORDERED_COUNT, for the two DS_*_GS_REG_RTN opcodes, and for any
// opcode that isGWS() accepts; false otherwise.
4597 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4598 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4599 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4600 }
4601
// Conservatively reports whether MI may access scratch memory, based on its
// encoding class, a function attribute, and its memory operands.
4603 // Instructions that access scratch use FLAT encoding or BUF encodings.
4604 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4605 return false;
4606
4607 // SCRATCH instructions always access scratch.
4608 if (isFLATScratch(MI))
4609 return true;
4610
4611 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4612 // via the aperture.
4613 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4614 return false;
4615
4616 // If there are no memory operands then conservatively assume the flat
4617 // operation may access scratch.
4618 if (MI.memoperands_empty())
4619 return true;
4620
4621 // See if any memory operand specifies an address space that involves scratch.
4622 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4623 unsigned AS = Memop->getAddrSpace();
// A FLAT-address operand counts as possibly-scratch unless its
// NoAliasAddrSpace metadata is present and covers PRIVATE_ADDRESS.
4624 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4625 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4626 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4627 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4628 }
4629 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4630 });
4631 }
4632
// Conservatively reports whether the FLAT instruction MI may access VMEM.
// MI must satisfy isFLAT().
4634 assert(isFLAT(MI));
4635
4636 // All flat instructions use the VMEM counter except prefetch.
4637 if (!usesVM_CNT(MI))
4638 return false;
4639
4640 // If there are no memory operands then conservatively assume the flat
4641 // operation may access VMEM.
4642 if (MI.memoperands_empty())
4643 return true;
4644
4645 // See if any memory operand specifies an address space that involves VMEM.
4646 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4647 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4648 // (GDS) address space is not supported by flat operations. Therefore, simply
4649 // return true unless only the LDS address space is found.
4650 for (const MachineMemOperand *Memop : MI.memoperands()) {
4651 unsigned AS = Memop->getAddrSpace();
4653 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4654 return true;
4655 }
4656
// Every memory operand was in the LOCAL address space.
4657 return false;
4658 }
4659
// Conservatively reports whether the FLAT instruction MI may access LDS.
// MI must satisfy isFLAT().
4661 assert(isFLAT(MI));
4662
4663 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4664 if (!usesLGKM_CNT(MI))
4665 return false;
4666
4667 // If in tgsplit mode then there can be no use of LDS.
4668 if (ST.isTgSplitEnabled())
4669 return false;
4670
4671 // If there are no memory operands then conservatively assume the flat
4672 // operation may access LDS.
4673 if (MI.memoperands_empty())
4674 return true;
4675
4676 // See if any memory operand specifies an address space that involves LDS.
4677 for (const MachineMemOperand *Memop : MI.memoperands()) {
4678 unsigned AS = Memop->getAddrSpace();
4680 return true;
4681 }
4682
4683 return false;
4684 }
4685
// True when AMDGPU::MODE appears among the implicit defs listed in MI's
// instruction descriptor. MI's actual operand list is not inspected.
4687 // Skip the full operand and register alias search modifiesRegister
4688 // does. There's only a handful of instructions that touch this, it's only an
4689 // implicit def, and doesn't alias any other registers.
4690 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4691 }
4692
// Classifies MI by opcode and instruction properties: returns true for the
// categories listed in the checks below and false for everything else.
4694 unsigned Opcode = MI.getOpcode();
4695
4696 if (MI.mayStore() && isSMRD(MI))
4697 return true; // scalar store or atomic
4698
4699 // This will terminate the function when other lanes may need to continue.
4700 if (MI.isReturn())
4701 return true;
4702
4703 // These instructions cause shader I/O that may cause hardware lockups
4704 // when executed with an empty EXEC mask.
4705 //
4706 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4707 // EXEC = 0, but checking for that case here seems not worth it
4708 // given the typical code patterns.
4709 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4710 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4711 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4712 Opcode == AMDGPU::S_SETHALT)
4713 return true;
4714
4715 if (MI.isCall() || MI.isInlineAsm())
4716 return true; // conservative assumption
4717
4718 // Assume that barrier interactions are only intended with active lanes.
4719 if (isBarrier(Opcode))
4720 return true;
4721
4722 // A mode change is a scalar operation that influences vector instructions.
4724 return true;
4725
4726 // These are like SALU instructions in terms of effects, so it's questionable
4727 // whether we should return true for those.
4728 //
4729 // However, executing them with EXEC = 0 causes them to operate on undefined
4730 // data, which we avoid by returning true here.
4731 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4732 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4733 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4734 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4735 return true;
4736
4737 return false;
4738 }
4739
4741 const MachineInstr &MI) const {
// Conservatively reports whether MI may read EXEC. Meta instructions never
// do; calls and non-target-specific opcodes are assumed to; copy-like
// instructions and SALU instructions are examined more closely below.
4742 if (MI.isMetaInstruction())
4743 return false;
4744
4745 // This won't read exec if this is an SGPR->SGPR copy.
4746 if (MI.isCopyLike()) {
// Only the destination (operand 0) is tested: a non-SGPR destination is
// treated as reading EXEC.
4747 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4748 return true;
4749
4750 // Make sure this isn't copying exec as a normal operand
4751 return MI.readsRegister(AMDGPU::EXEC, &RI);
4752 }
4753
4754 // Make a conservative assumption about the callee.
4755 if (MI.isCall())
4756 return true;
4757
4758 // Be conservative with any unhandled generic opcodes.
4759 if (!isTargetSpecificOpcode(MI.getOpcode()))
4760 return true;
4761
// Non-SALU instructions are assumed to read EXEC; SALU ones only when EXEC
// is actually read as an operand.
4762 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4763 }
4764
4765bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4766 switch (Imm.getBitWidth()) {
4767 case 1: // This likely will be a condition code mask.
4768 return true;
4769
4770 case 32:
4771 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4772 ST.hasInv2PiInlineImm());
4773 case 64:
4774 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4775 ST.hasInv2PiInlineImm());
4776 case 16:
4777 return ST.has16BitInsts() &&
4778 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4779 ST.hasInv2PiInlineImm());
4780 default:
4781 llvm_unreachable("invalid bitwidth");
4782 }
4783}
4784
// Floating-point overload: reinterprets Imm as its raw bit pattern and picks
// the inline-constant check from Imm's float semantics. Unknown semantics
// are a programming error.
4786 APInt IntImm = Imm.bitcastToAPInt();
4787 int64_t IntImmVal = IntImm.getSExtValue();
4788 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4789 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4790 default:
4791 llvm_unreachable("invalid fltSemantics");
// Delegates to the integer overload using the raw bits.
4794 return isInlineConstant(IntImm);
// Both 16-bit checks below also require a subtarget with 16-bit instructions.
4796 return ST.has16BitInsts() &&
4797 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4799 return ST.has16BitInsts() &&
4800 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4801 }
4802 }
4803
// Decides whether the raw 64-bit immediate Imm is an inline constant for an
// operand of kind OperandType.
// NOTE(review): all `case` labels of the switch below are missing from this
// extract, so the operand types that select each branch cannot be confirmed
// from here.
4804bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4805 // MachineOperand provides no way to tell the true operand size, since it only
4806 // records a 64-bit value. We need to know the size to determine if a 32-bit
4807 // floating point immediate bit pattern is legal for an integer immediate. It
4808 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4809 switch (OperandType) {
// Only the low 32 bits of the immediate are considered in this branch.
4819 int32_t Trunc = static_cast<int32_t>(Imm);
4820 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4821 }
4827 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4830 // We would expect inline immediates to not be concerned with an integer/fp
4831 // distinction. However, in the case of 16-bit integer operations, the
4832 // "floating point" values appear to not work. It seems to read the low 16 bits
4833 // of 32-bit immediates, which happens to always work for the integer
4834 // values.
4835 //
4836 // See llvm bugzilla 46302.
4837 //
4838 // TODO: Theoretically we could use op-sel to use the high bits of the
4839 // 32-bit FP values.
4848 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4853 return false;
// FP16 branch: the value must fit in 16 bits (signed or unsigned) before the
// truncated bit pattern is tested.
4856 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4857 // A few special case instructions have 16-bit operands on subtargets
4858 // where 16-bit instructions are not legal.
4859 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4860 // constants in these cases
4861 int16_t Trunc = static_cast<int16_t>(Imm);
4862 return ST.has16BitInsts() &&
4863 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4864 }
4865
4866 return false;
4867 }
// BF16 branch: same 16-bit range test as the FP16 branch above.
4870 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4871 int16_t Trunc = static_cast<int16_t>(Imm);
4872 return ST.has16BitInsts() &&
4873 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4874 }
4875 return false;
4876 }
4880 return false;
4882 return isLegalAV64PseudoImm(Imm);
4885 // Always embedded in the instruction for free.
4886 return true;
4896 // Just ignore anything else.
4897 return true;
4898 default:
4899 llvm_unreachable("invalid operand type");
4900 }
4901}
4902
// Returns true if Op0 and Op1 have the same operand type and the same
// register or immediate value. Operands of differing types never compare equal.
4903static bool compareMachineOp(const MachineOperand &Op0,
4904 const MachineOperand &Op1) {
4905 if (Op0.getType() != Op1.getType())
4906 return false;
4907
4908 switch (Op0.getType()) {
// NOTE(review): the `case` labels for the two returns below are missing from
// this extract; presumably they select register and immediate operands.
4910 return Op0.getReg() == Op1.getReg();
4912 return Op0.getImm() == Op1.getImm();
4913 default:
4914 llvm_unreachable("Didn't expect to be comparing these operand types");
4915 }
4916}
4917
// Decides whether a literal (non-inline) constant may be used for the operand
// described by OpInfo in an instruction described by InstDesc.
// NOTE(review): the opening line of this signature (return type, name and the
// InstDesc parameter referenced below) is missing from this extract.
4919 const MCOperandInfo &OpInfo) const {
// Plain immediate operands are accepted unconditionally.
4920 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4921 return true;
4922
4923 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4924 return false;
4925
// Outside of VOP3 source operands, no further restriction is applied.
4926 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4927 return true;
4928
// A VOP3 source operand may only take a literal if the subtarget supports it.
4929 return ST.hasVOP3Literal();
4930}
4931
4932bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4933 int64_t ImmVal) const {
4934 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4935 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4936 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4937 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4938 AMDGPU::OpName::src2))
4939 return false;
4940 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4941 }
4942
4943 return isLiteralOperandLegal(InstDesc, OpInfo);
4944}
4945
4946bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4947 const MachineOperand &MO) const {
4948 if (MO.isImm())
4949 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4950
4951 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4952 "unexpected imm-like operand kind");
4953 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4954 return isLiteralOperandLegal(InstDesc, OpInfo);
4955}
4956
// Accepts a 64-bit immediate only when both its low and high 32-bit halves
// are 32-bit inline constants.
// NOTE(review): the signature line (return type, name and the Imm parameter)
// is missing from this extract.
4958 // 2 32-bit inline constants packed into one.
4959 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4960 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4961}
4962
4963bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4964 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4965 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4966 return false;
4967
4968 int Op32 = AMDGPU::getVOPe32(Opcode);
4969 if (Op32 == -1)
4970 return false;
4971
4972 return pseudoToMCOpcode(Op32) != -1;
4973}
4974
4975bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4976 // The src0_modifier operand is present on all instructions
4977 // that have modifiers.
4978
4979 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4980}
4981
// Returns true if MI has the named operand OpName and its immediate value is
// non-zero. A missing operand counts as "not set".
// NOTE(review): the opening line of this signature (return type, name and the
// MI parameter) is missing from this extract.
4983 AMDGPU::OpName OpName) const {
4984 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4985 return Mods && Mods->getImm();
4986}
4987
// Returns true if at least one operand named in ModifierOpNames is set on MI,
// as judged by hasModifiersSet.
// NOTE(review): the signature line is missing from this extract, and
// ModifierOpNames is defined outside the visible source.
4989 return any_of(ModifierOpNames,
4990 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4991}
4992
// Decides whether MI can be rewritten into a 32-bit VALU encoding: operands
// must meet the register constraints checked below, no source or output
// modifiers may be set, and a 32-bit encoding of the opcode must exist.
// NOTE(review): the opening line of this signature (return type, name and the
// MI parameter) is missing from this extract.
4994 const MachineRegisterInfo &MRI) const {
4995 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4996 // Can't shrink instruction with three operands.
4997 if (Src2) {
// Only the opcodes listed here may be shrunk despite having a src2 operand.
4998 switch (MI.getOpcode()) {
4999 default: return false;
5000
5001 case AMDGPU::V_ADDC_U32_e64:
5002 case AMDGPU::V_SUBB_U32_e64:
5003 case AMDGPU::V_SUBBREV_U32_e64: {
5004 const MachineOperand *Src1
5005 = getNamedOperand(MI, AMDGPU::OpName::src1);
5006 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
5007 return false;
5008 // Additional verification is needed for sdst/src2.
5009 return true;
5010 }
5011 case AMDGPU::V_MAC_F16_e64:
5012 case AMDGPU::V_MAC_F32_e64:
5013 case AMDGPU::V_MAC_LEGACY_F32_e64:
5014 case AMDGPU::V_FMAC_F16_e64:
5015 case AMDGPU::V_FMAC_F16_t16_e64:
5016 case AMDGPU::V_FMAC_F16_fake16_e64:
5017 case AMDGPU::V_FMAC_F32_e64:
5018 case AMDGPU::V_FMAC_F64_e64:
5019 case AMDGPU::V_FMAC_LEGACY_F32_e64:
// For the MAC/FMAC family, src2 must be an unmodified VGPR.
5020 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
5021 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
5022 return false;
5023 break;
5024
5025 case AMDGPU::V_CNDMASK_B32_e64:
5026 break;
5027 }
5028 }
5029
// src1, when present, must be an unmodified VGPR.
5030 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
5031 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
5032 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
5033 return false;
5034
5035 // We don't need to check src0, all input types are legal, so just make sure
5036 // src0 isn't using any modifiers.
5037 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
5038 return false;
5039
5040 // Can it be shrunk to a valid 32 bit opcode?
5041 if (!hasVALU32BitEncoding(MI.getOpcode()))
5042 return false;
5043
5044 // Check output modifiers
5045 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
5046 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
5047 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
5048 // TODO: Can we avoid checking bound_ctrl/fi here?
5049 // They are only used by permlane*_swap special case.
5050 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
5051 !hasModifiersSet(MI, AMDGPU::OpName::fi);
5052}
5053
5054// Set VCC operand with all flags from \p Orig, except for setting it as
5055// implicit.
// Only the undef and kill flags are copied, and only onto the first implicit
// use of VCC or VCC_LO found in MI; if there is none, MI is left unchanged.
// NOTE(review): the opening line of this signature (return type, name and the
// MI parameter) is missing from this extract.
5057 const MachineOperand &Orig) {
5058
5059 for (MachineOperand &Use : MI.implicit_operands()) {
5060 if (Use.isUse() &&
5061 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
5062 Use.setIsUndef(Orig.isUndef());
5063 Use.setIsKill(Orig.isKill());
5064 return;
5065 }
5066 }
5067}
5068
// Builds a new instruction with opcode Op32 immediately before MI, carrying
// over MI's debug location, flags, leading defs and explicit uses, and returns
// the builder. MI itself is not erased here.
// NOTE(review): the opening line of this signature (return type, name and the
// MI parameter) is missing from this extract.
5070 unsigned Op32) const {
5071 MachineBasicBlock *MBB = MI.getParent();
5072
5073 const MCInstrDesc &Op32Desc = get(Op32);
5074 MachineInstrBuilder Inst32 =
5075 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5076 .setMIFlags(MI.getFlags());
5077
5078 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5079 // For VOPC instructions, this is replaced by an implicit def of vcc.
5080
5081 // We assume the defs of the shrunk opcode are in the same order, and the
5082 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5083 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5084 Inst32.add(MI.getOperand(I));
5085
5086 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5087
// Idx walks MI's operand descriptors in step with the explicit uses.
5088 int Idx = MI.getNumExplicitDefs();
5089 for (const MachineOperand &Use : MI.explicit_uses()) {
5090 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
// NOTE(review): the condition guarding the `continue` below is missing from
// this extract; presumably it skips certain operand types based on OpTy.
5092 continue;
5093
5094 if (&Use == Src2) {
5095 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5096 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5097 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5098 // of vcc was already added during the initial BuildMI, but we
5099 // 1) may need to change vcc to vcc_lo to preserve the original register
5100 // 2) have to preserve the original flags.
5101 copyFlagsToImplicitVCC(*Inst32, *Src2);
5102 continue;
5103 }
5104 }
5105
5106 Inst32.add(Use);
5107 }
5108
5109 // FIXME: Losing implicit operands
5110 fixImplicitOperands(*Inst32);
5111 return Inst32;
5112}
5113
// Returns true if the physical register in RegOp counts against the constant
// bus: implicit VCC, VCC_LO or M0, or an explicit register contained in
// SReg_32 or SReg_64. The null SGPRs never count.
// NOTE(review): the signature line (return type, name and the RegOp parameter)
// is missing from this extract.
5115 // Null is free
5116 Register Reg = RegOp.getReg();
5117 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5118 return false;
5119
5120 // SGPRs use the constant bus
5121
5122 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5123 // physical register operands should also count, except for exec.
5124 if (RegOp.isImplicit())
5125 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5126
5127 // SGPRs use the constant bus
5128 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5129 AMDGPU::SReg_64RegClass.contains(Reg);
5130}
5131
// Returns true if the register operand RegOp uses the constant bus: a virtual
// register is judged by whether its register class is an SGPR class, a
// physical register by physRegUsesConstantBus.
// NOTE(review): the opening line of this signature (return type, name and the
// RegOp parameter) is missing from this extract.
5133 const MachineRegisterInfo &MRI) const {
5134 Register Reg = RegOp.getReg();
5135 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5136 : physRegUsesConstantBus(RegOp);
5137}
5138
// Returns true if operand MO, described by OpInfo, uses the constant bus.
// Non-register operands use it unless they are inline constants.
// NOTE(review): the opening line of this signature (return type, name and the
// MRI parameter) and the second arm of the final conditional expression are
// missing from this extract.
5140 const MachineOperand &MO,
5141 const MCOperandInfo &OpInfo) const {
5142 // Literal constants use the constant bus.
5143 if (!MO.isReg())
5144 return !isInlineConstant(MO, OpInfo);
5145
5146 Register Reg = MO.getReg();
5147 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5149}
5150
// Scans MI's implicit operands and returns the first one that is a read of
// VCC, VCC_LO, VCC_HI, M0 or FLAT_SCR; returns a default-constructed Register
// if none is found.
// NOTE(review): the signature line (return type, name and the MI parameter) is
// missing from this extract.
5152 for (const MachineOperand &MO : MI.implicit_operands()) {
5153 // We only care about reads.
5154 if (MO.isDef())
5155 continue;
5156
5157 switch (MO.getReg()) {
5158 case AMDGPU::VCC:
5159 case AMDGPU::VCC_LO:
5160 case AMDGPU::VCC_HI:
5161 case AMDGPU::M0:
5162 case AMDGPU::FLAT_SCR:
5163 return MO.getReg();
5164
5165 default:
5166 break;
5167 }
5168 }
5169
5170 return Register();
5171}
5172
// Returns true if MI is expected to carry an implicit read of EXEC. VALU
// instructions are expected to, apart from the lane read/write and
// SGPR-to-VGPR spill/restore opcodes listed below.
5173static bool shouldReadExec(const MachineInstr &MI) {
5174 if (SIInstrInfo::isVALU(MI)) {
5175 switch (MI.getOpcode()) {
5176 case AMDGPU::V_READLANE_B32:
5177 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5178 case AMDGPU::V_WRITELANE_B32:
5179 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5180 return false;
5181 }
5182
5183 return true;
5184 }
5185
// NOTE(review): the remaining terms of this condition are missing from this
// extract; only the pre-ISel and generic-opcode checks are visible.
5186 if (MI.isPreISelOpcode() ||
5187 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5190 return false;
5191
5192 return true;
5193}
5194
5195static bool isRegOrFI(const MachineOperand &MO) {
5196 return MO.isReg() || MO.isFI();
5197}
5198
5199static bool isSubRegOf(const SIRegisterInfo &TRI,
5200 const MachineOperand &SuperVec,
5201 const MachineOperand &SubReg) {
5202 if (SubReg.getReg().isPhysical())
5203 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5204
5205 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5206 SubReg.getReg() == SuperVec.getReg();
5207}
5208
5209// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5210bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5211 const MachineRegisterInfo &MRI,
5212 StringRef &ErrInfo) const {
5213 Register DstReg = MI.getOperand(0).getReg();
5214 Register SrcReg = MI.getOperand(1).getReg();
5215 // This is a check for copy from vector register to SGPR
5216 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5217 ErrInfo = "illegal copy from vector register to SGPR";
5218 return false;
5219 }
5220 return true;
5221}
5222
5224 StringRef &ErrInfo) const {
5225 uint32_t Opcode = MI.getOpcode();
5226 const MachineFunction *MF = MI.getMF();
5227 const MachineRegisterInfo &MRI = MF->getRegInfo();
5228
5229 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5230 // Find a better property to recognize the point where instruction selection
5231 // is just done.
5232 // We can only enforce this check after SIFixSGPRCopies pass so that the
5233 // illegal copies are legalized and thereafter we don't expect a pass
5234 // inserting similar copies.
5235 if (!MRI.isSSA() && MI.isCopy())
5236 return verifyCopy(MI, MRI, ErrInfo);
5237
5238 if (SIInstrInfo::isGenericOpcode(Opcode))
5239 return true;
5240
5241 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5242 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5243 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5244 int Src3Idx = -1;
5245 if (Src0Idx == -1) {
5246 // VOPD V_DUAL_* instructions use different operand names.
5247 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5248 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5249 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5250 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5251 }
5252
5253 // Make sure the number of operands is correct.
5254 const MCInstrDesc &Desc = get(Opcode);
5255 if (!Desc.isVariadic() &&
5256 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5257 ErrInfo = "Instruction has wrong number of operands.";
5258 return false;
5259 }
5260
5261 if (MI.isInlineAsm()) {
5262 // Verify register classes for inlineasm constraints.
5263 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5264 I != E; ++I) {
5265 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5266 if (!RC)
5267 continue;
5268
5269 const MachineOperand &Op = MI.getOperand(I);
5270 if (!Op.isReg())
5271 continue;
5272
5273 Register Reg = Op.getReg();
5274 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5275 ErrInfo = "inlineasm operand has incorrect register class.";
5276 return false;
5277 }
5278 }
5279
5280 return true;
5281 }
5282
5283 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5284 ErrInfo = "missing memory operand from image instruction.";
5285 return false;
5286 }
5287
5288 // Make sure the register classes are correct.
5289 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5290 const MachineOperand &MO = MI.getOperand(i);
5291 if (MO.isFPImm()) {
5292 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5293 "all fp values to integers.";
5294 return false;
5295 }
5296
5297 const MCOperandInfo &OpInfo = Desc.operands()[i];
5298 int16_t RegClass = getOpRegClassID(OpInfo);
5299
5300 switch (OpInfo.OperandType) {
5302 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5303 ErrInfo = "Illegal immediate value for operand.";
5304 return false;
5305 }
5306 break;
5316 break;
5318 break;
5319 break;
5333 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5334 ErrInfo = "Illegal immediate value for operand.";
5335 return false;
5336 }
5337 break;
5338 }
5343 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5344 !isInlineConstant(MI, i) &&
5346 OpInfo.OperandType ==
5348 ErrInfo = "illegal 64-bit immediate value for operand.";
5349 return false;
5350 }
5351 break;
5354 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5355 ErrInfo = "Expected inline constant for operand.";
5356 return false;
5357 }
5358 break;
5361 break;
5366 // Check if this operand is an immediate.
5367 // FrameIndex operands will be replaced by immediates, so they are
5368 // allowed.
5369 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5370 ErrInfo = "Expected immediate, but got non-immediate";
5371 return false;
5372 }
5373 break;
5377 break;
5378 default:
5379 if (OpInfo.isGenericType())
5380 continue;
5381 break;
5382 }
5383
5384 if (!MO.isReg())
5385 continue;
5386 Register Reg = MO.getReg();
5387 if (!Reg)
5388 continue;
5389
5390 // FIXME: Ideally we would have separate instruction definitions with the
5391 // aligned register constraint.
5392 // FIXME: We do not verify inline asm operands, but custom inline asm
5393 // verification is broken anyway
5394 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5395 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5396 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5397 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5398 if (const TargetRegisterClass *SubRC =
5399 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5400 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5401 if (RC)
5402 RC = SubRC;
5403 }
5404 }
5405
5406 // Check that this is the aligned version of the class.
5407 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5408 ErrInfo = "Subtarget requires even aligned vector registers";
5409 return false;
5410 }
5411 }
5412
5413 if (RegClass != -1) {
5414 if (Reg.isVirtual())
5415 continue;
5416
5417 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5418 if (!RC->contains(Reg)) {
5419 ErrInfo = "Operand has incorrect register class.";
5420 return false;
5421 }
5422 }
5423 }
5424
5425 // Verify SDWA
5426 if (isSDWA(MI)) {
5427 if (!ST.hasSDWA()) {
5428 ErrInfo = "SDWA is not supported on this target";
5429 return false;
5430 }
5431
5432 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5433 AMDGPU::OpName::dst_sel}) {
5434 const MachineOperand *MO = getNamedOperand(MI, Op);
5435 if (!MO)
5436 continue;
5437 int64_t Imm = MO->getImm();
5438 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5439 ErrInfo = "Invalid SDWA selection";
5440 return false;
5441 }
5442 }
5443
5444 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5445
5446 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5447 if (OpIdx == -1)
5448 continue;
5449 const MachineOperand &MO = MI.getOperand(OpIdx);
5450
5451 if (!ST.hasSDWAScalar()) {
5452 // Only VGPRS on VI
5453 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5454 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5455 return false;
5456 }
5457 } else {
5458 // No immediates on GFX9
5459 if (!MO.isReg()) {
5460 ErrInfo =
5461 "Only reg allowed as operands in SDWA instructions on GFX9+";
5462 return false;
5463 }
5464 }
5465 }
5466
5467 if (!ST.hasSDWAOmod()) {
5468 // No omod allowed on VI
5469 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5470 if (OMod != nullptr &&
5471 (!OMod->isImm() || OMod->getImm() != 0)) {
5472 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5473 return false;
5474 }
5475 }
5476
5477 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5478 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5479 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5480 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5481 const MachineOperand *Src0ModsMO =
5482 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5483 unsigned Mods = Src0ModsMO->getImm();
5484 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5485 Mods & SISrcMods::SEXT) {
5486 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5487 return false;
5488 }
5489 }
5490
5491 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5492 if (isVOPC(BasicOpcode)) {
5493 if (!ST.hasSDWASdst() && DstIdx != -1) {
5494 // Only vcc allowed as dst on VI for VOPC
5495 const MachineOperand &Dst = MI.getOperand(DstIdx);
5496 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5497 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5498 return false;
5499 }
5500 } else if (!ST.hasSDWAOutModsVOPC()) {
5501 // No clamp allowed on GFX9 for VOPC
5502 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5503 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5504 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5505 return false;
5506 }
5507
5508 // No omod allowed on GFX9 for VOPC
5509 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5510 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5511 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5512 return false;
5513 }
5514 }
5515 }
5516
5517 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5518 if (DstUnused && DstUnused->isImm() &&
5519 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5520 const MachineOperand &Dst = MI.getOperand(DstIdx);
5521 if (!Dst.isReg() || !Dst.isTied()) {
5522 ErrInfo = "Dst register should have tied register";
5523 return false;
5524 }
5525
5526 const MachineOperand &TiedMO =
5527 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5528 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5529 ErrInfo =
5530 "Dst register should be tied to implicit use of preserved register";
5531 return false;
5532 }
5533 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5534 ErrInfo = "Dst register should use same physical register as preserved";
5535 return false;
5536 }
5537 }
5538 }
5539
5540 if (isDPP(MI) && !ST.hasDPPSrc1SGPR() && Src1Idx != -1) {
5541 const MachineOperand &Src1MO = MI.getOperand(Src1Idx);
5542 if (Src1MO.isReg() && RI.isSGPRReg(MRI, Src1MO.getReg())) {
5543 ErrInfo = "DPP src1 cannot be SGPR on this subtarget";
5544 return false;
5545 }
5546 }
5547
5548 // Verify MIMG / VIMAGE / VSAMPLE
5549 if (isImage(Opcode) && !MI.mayStore()) {
5550 // Ensure that the return type used is large enough for all the options
5551 // being used TFE/LWE require an extra result register.
5552 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5553 if (DMask) {
5554 uint64_t DMaskImm = DMask->getImm();
5555 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5556 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5557 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5558 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5559
5560 // Adjust for packed 16 bit values
5561 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5562 RegCount = divideCeil(RegCount, 2);
5563
5564 // Adjust if using LWE or TFE
5565 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5566 RegCount += 1;
5567
5568 const uint32_t DstIdx =
5569 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5570 const MachineOperand &Dst = MI.getOperand(DstIdx);
5571 if (Dst.isReg()) {
5572 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5573 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5574 if (RegCount > DstSize) {
5575 ErrInfo = "Image instruction returns too many registers for dst "
5576 "register class";
5577 return false;
5578 }
5579 }
5580 }
5581 }
5582
5583 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5584 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5585 unsigned ConstantBusCount = 0;
5586 bool UsesLiteral = false;
5587 const MachineOperand *LiteralVal = nullptr;
5588
5589 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5590 if (ImmIdx != -1) {
5591 ++ConstantBusCount;
5592 UsesLiteral = true;
5593 LiteralVal = &MI.getOperand(ImmIdx);
5594 }
5595
5596 SmallVector<Register, 2> SGPRsUsed;
5597 Register SGPRUsed;
5598
5599 // Only look at the true operands. Only a real operand can use the constant
5600 // bus, and we don't want to check pseudo-operands like the source modifier
5601 // flags.
5602 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5603 if (OpIdx == -1)
5604 continue;
5605 const MachineOperand &MO = MI.getOperand(OpIdx);
5606 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5607 if (MO.isReg()) {
5608 SGPRUsed = MO.getReg();
5609 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5610 ++ConstantBusCount;
5611 SGPRsUsed.push_back(SGPRUsed);
5612 }
5613 } else if (!MO.isFI()) { // Treat FI like a register.
5614 if (!UsesLiteral) {
5615 ++ConstantBusCount;
5616 UsesLiteral = true;
5617 LiteralVal = &MO;
5618 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5619 assert(isVOP2(MI) || isVOP3(MI));
5620 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5621 return false;
5622 }
5623 }
5624 }
5625 }
5626
5627 SGPRUsed = findImplicitSGPRRead(MI);
5628 if (SGPRUsed) {
5629 // Implicit uses may safely overlap true operands
5630 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5631 return !RI.regsOverlap(SGPRUsed, SGPR);
5632 })) {
5633 ++ConstantBusCount;
5634 SGPRsUsed.push_back(SGPRUsed);
5635 }
5636 }
5637
5638 // v_writelane_b32 is an exception from constant bus restriction:
5639 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5640 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5641 Opcode != AMDGPU::V_WRITELANE_B32) {
5642 ErrInfo = "VOP* instruction violates constant bus restriction";
5643 return false;
5644 }
5645
5646 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5647 ErrInfo = "VOP3 instruction uses literal";
5648 return false;
5649 }
5650 }
5651
5652 // Special case for writelane - this can break the multiple constant bus rule,
5653 // but still can't use more than one SGPR register
5654 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5655 unsigned SGPRCount = 0;
5656 Register SGPRUsed;
5657
5658 for (int OpIdx : {Src0Idx, Src1Idx}) {
5659 if (OpIdx == -1)
5660 break;
5661
5662 const MachineOperand &MO = MI.getOperand(OpIdx);
5663
5664 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5665 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5666 if (MO.getReg() != SGPRUsed)
5667 ++SGPRCount;
5668 SGPRUsed = MO.getReg();
5669 }
5670 }
5671 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5672 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5673 return false;
5674 }
5675 }
5676 }
5677
5678 // Verify misc. restrictions on specific instructions.
5679 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5680 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5681 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5682 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5683 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5684 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5685 if (!compareMachineOp(Src0, Src1) &&
5686 !compareMachineOp(Src0, Src2)) {
5687 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5688 return false;
5689 }
5690 }
5691 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5692 SISrcMods::ABS) ||
5693 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5694 SISrcMods::ABS) ||
5695 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5696 SISrcMods::ABS)) {
5697 ErrInfo = "ABS not allowed in VOP3B instructions";
5698 return false;
5699 }
5700 }
5701
5702 if (isSOP2(MI) || isSOPC(MI)) {
5703 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5704 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5705
5706 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5707 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5708 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5709 !Src0.isIdenticalTo(Src1)) {
5710 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5711 return false;
5712 }
5713 }
5714
5715 if (isSOPK(MI)) {
5716 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5717 if (Desc.isBranch()) {
5718 if (!Op->isMBB()) {
5719 ErrInfo = "invalid branch target for SOPK instruction";
5720 return false;
5721 }
5722 } else {
5723 uint64_t Imm = Op->getImm();
5724 if (sopkIsZext(Opcode)) {
5725 if (!isUInt<16>(Imm)) {
5726 ErrInfo = "invalid immediate for SOPK instruction";
5727 return false;
5728 }
5729 } else {
5730 if (!isInt<16>(Imm)) {
5731 ErrInfo = "invalid immediate for SOPK instruction";
5732 return false;
5733 }
5734 }
5735 }
5736 }
5737
5738 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5739 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5740 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5741 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5742 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5743 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5744
5745 const unsigned StaticNumOps =
5746 Desc.getNumOperands() + Desc.implicit_uses().size();
5747 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5748
5749 // Require additional implicit operands. This allows a fixup done by the
5750 // post RA scheduler where the main implicit operand is killed and
5751 // implicit-defs are added for sub-registers that remain live after this
5752 // instruction.
5753 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5754 ErrInfo = "missing implicit register operands";
5755 return false;
5756 }
5757
5758 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5759 if (IsDst) {
5760 if (!Dst->isUse()) {
5761 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5762 return false;
5763 }
5764
5765 unsigned UseOpIdx;
5766 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5767 UseOpIdx != StaticNumOps + 1) {
5768 ErrInfo = "movrel implicit operands should be tied";
5769 return false;
5770 }
5771 }
5772
5773 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5774 const MachineOperand &ImpUse
5775 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5776 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5777 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5778 ErrInfo = "src0 should be subreg of implicit vector use";
5779 return false;
5780 }
5781 }
5782
5783 // Make sure we aren't losing exec uses in the td files. This mostly requires
5784 // being careful when using let Uses to try to add other use registers.
5785 if (shouldReadExec(MI)) {
5786 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5787 ErrInfo = "VALU instruction does not implicitly read exec mask";
5788 return false;
5789 }
5790 }
5791
5792 if (isSMRD(MI)) {
5793 if (MI.mayStore() &&
5794 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5795 // The register offset form of scalar stores may only use m0 as the
5796 // soffset register.
5797 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5798 if (Soff && Soff->getReg() != AMDGPU::M0) {
5799 ErrInfo = "scalar stores must use m0 as offset register";
5800 return false;
5801 }
5802 }
5803 }
5804
5805 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5806 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5807 if (Offset->getImm() != 0) {
5808 ErrInfo = "subtarget does not support offsets in flat instructions";
5809 return false;
5810 }
5811 }
5812
5813 if (isDS(MI) && !ST.hasGDS()) {
5814 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5815 if (GDSOp && GDSOp->getImm() != 0) {
5816 ErrInfo = "GDS is not supported on this subtarget";
5817 return false;
5818 }
5819 }
5820
5821 if (isImage(MI)) {
5822 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5823 if (DimOp) {
5824 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5825 AMDGPU::OpName::vaddr0);
5826 AMDGPU::OpName RSrcOpName =
5827 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5828 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5829 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5830 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5831 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5832 const AMDGPU::MIMGDimInfo *Dim =
5834
5835 if (!Dim) {
5836 ErrInfo = "dim is out of range";
5837 return false;
5838 }
5839
5840 bool IsA16 = false;
5841 if (ST.hasR128A16()) {
5842 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5843 IsA16 = R128A16->getImm() != 0;
5844 } else if (ST.hasA16()) {
5845 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5846 IsA16 = A16->getImm() != 0;
5847 }
5848
5849 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5850
5851 unsigned AddrWords =
5852 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5853
5854 unsigned VAddrWords;
5855 if (IsNSA) {
5856 VAddrWords = RsrcIdx - VAddr0Idx;
5857 if (ST.hasPartialNSAEncoding() &&
5858 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5859 unsigned LastVAddrIdx = RsrcIdx - 1;
5860 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5861 }
5862 } else {
5863 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5864 if (AddrWords > 12)
5865 AddrWords = 16;
5866 }
5867
5868 if (VAddrWords != AddrWords) {
5869 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5870 << " but got " << VAddrWords << "\n");
5871 ErrInfo = "bad vaddr size";
5872 return false;
5873 }
5874 }
5875 }
5876
5877 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5878 if (DppCt) {
5879 using namespace AMDGPU::DPP;
5880
5881 unsigned DC = DppCt->getImm();
5882 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5883 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5884 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5885 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5886 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5887 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5888 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5889 ErrInfo = "Invalid dpp_ctrl value";
5890 return false;
5891 }
5892 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5893 !ST.hasDPPWavefrontShifts()) {
5894 ErrInfo = "Invalid dpp_ctrl value: "
5895 "wavefront shifts are not supported on GFX10+";
5896 return false;
5897 }
5898 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5899 !ST.hasDPPBroadcasts()) {
5900 ErrInfo = "Invalid dpp_ctrl value: "
5901 "broadcasts are not supported on GFX10+";
5902 return false;
5903 }
5904 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5905 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5906 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5907 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5908 !ST.hasGFX90AInsts()) {
5909 ErrInfo = "Invalid dpp_ctrl value: "
5910 "row_newbroadcast/row_share is not supported before "
5911 "GFX90A/GFX10";
5912 return false;
5913 }
5914 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5915 ErrInfo = "Invalid dpp_ctrl value: "
5916 "row_share and row_xmask are not supported before GFX10";
5917 return false;
5918 }
5919 }
5920
5921 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5923 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5924 ErrInfo = "Invalid dpp_ctrl value: "
5925 "DP ALU dpp only support row_newbcast";
5926 return false;
5927 }
5928 }
5929
5930 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5931 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5932 AMDGPU::OpName DataName =
5933 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5934 const MachineOperand *Data = getNamedOperand(MI, DataName);
5935 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5936 if (Data && !Data->isReg())
5937 Data = nullptr;
5938
5939 if (ST.hasGFX90AInsts()) {
5940 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5941 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5942 ErrInfo = "Invalid register class: "
5943 "vdata and vdst should be both VGPR or AGPR";
5944 return false;
5945 }
5946 if (Data && Data2 &&
5947 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5948 ErrInfo = "Invalid register class: "
5949 "both data operands should be VGPR or AGPR";
5950 return false;
5951 }
5952 } else {
5953 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5954 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5955 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5956 ErrInfo = "Invalid register class: "
5957 "agpr loads and stores not supported on this GPU";
5958 return false;
5959 }
5960 }
5961 }
5962
5963 if (ST.needsAlignedVGPRs()) {
5964 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5966 if (!Op)
5967 return true;
5968 Register Reg = Op->getReg();
5969 if (Reg.isPhysical())
5970 return !(RI.getHWRegIndex(Reg) & 1);
5971 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5972 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5973 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5974 };
5975
5976 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5977 Opcode == AMDGPU::DS_GWS_BARRIER) {
5978
5979 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5980 ErrInfo = "Subtarget requires even aligned vector registers "
5981 "for DS_GWS instructions";
5982 return false;
5983 }
5984 }
5985
5986 if (isMIMG(MI)) {
5987 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5988 ErrInfo = "Subtarget requires even aligned vector registers "
5989 "for vaddr operand of image instructions";
5990 return false;
5991 }
5992 }
5993 }
5994
5995 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5996 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5997 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5998 ErrInfo = "Invalid register class: "
5999 "v_accvgpr_write with an SGPR is not supported on this GPU";
6000 return false;
6001 }
6002 }
6003
6004 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
6005 const MachineOperand &SrcOp = MI.getOperand(1);
6006 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
6007 ErrInfo = "pseudo expects only physical SGPRs";
6008 return false;
6009 }
6010 }
6011
6012 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6013 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
6014 if (!ST.hasScaleOffset()) {
6015 ErrInfo = "Subtarget does not support offset scaling";
6016 return false;
6017 }
6018 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
6019 ErrInfo = "Instruction does not support offset scaling";
6020 return false;
6021 }
6022 }
6023 }
6024
6025 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6026 // information.
6027 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
6028 for (unsigned I = 0; I < 3; ++I) {
6030 return false;
6031 }
6032 }
6033
6034 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
6035 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
6036 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
6037 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
6038 &AMDGPU::SReg_64RegClass) ||
6039 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
6040 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
6041 return false;
6042 }
6043 }
6044
6045 return true;
6046}
6047
6049 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
6050 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6051 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
6052 ? AMDGPU::COPY
6053 : AMDGPU::V_MOV_B32_e32;
6054 }
6055 return getVALUOp(MI.getOpcode());
6056}
6057
6058// It is more readable to list mapped opcodes on the same line.
6059// clang-format off
6060
6061unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
6062 switch (Opc) {
6063 default: return AMDGPU::INSTRUCTION_LIST_END;
6064 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
6065 case AMDGPU::COPY: return AMDGPU::COPY;
6066 case AMDGPU::PHI: return AMDGPU::PHI;
6067 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
6068 case AMDGPU::WQM: return AMDGPU::WQM;
6069 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
6070 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
6071 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
6072 case AMDGPU::S_ADD_I32:
6073 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6074 case AMDGPU::S_ADDC_U32:
6075 return AMDGPU::V_ADDC_U32_e32;
6076 case AMDGPU::S_SUB_I32:
6077 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6078 // FIXME: These are not consistently handled, and selected when the carry is
6079 // used.
6080 case AMDGPU::S_ADD_U32:
6081 return AMDGPU::V_ADD_CO_U32_e32;
6082 case AMDGPU::S_SUB_U32:
6083 return AMDGPU::V_SUB_CO_U32_e32;
6084 case AMDGPU::S_ADD_U64_PSEUDO:
6085 return AMDGPU::V_ADD_U64_PSEUDO;
6086 case AMDGPU::S_SUB_U64_PSEUDO:
6087 return AMDGPU::V_SUB_U64_PSEUDO;
6088 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6089 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6090 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6091 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6092 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6093 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6094 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6095 case AMDGPU::S_XNOR_B32:
6096 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6097 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6098 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6099 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6100 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6101 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6102 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6103 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6104 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6105 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6106 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6107 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6108 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6109 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6110 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6111 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6112 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6113 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6114 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6115 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6116 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6117 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6118 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6119 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6120 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6121 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6122 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6123 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6124 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6125 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6126 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6127 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6128 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6129 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6130 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6131 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6132 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6133 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6134 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6135 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6136 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6137 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6138 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6139 case AMDGPU::S_CVT_F32_F16:
6140 case AMDGPU::S_CVT_HI_F32_F16:
6141 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6142 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6143 case AMDGPU::S_CVT_F16_F32:
6144 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6145 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6146 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6147 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6148 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6149 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6150 case AMDGPU::S_CEIL_F16:
6151 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6152 : AMDGPU::V_CEIL_F16_fake16_e64;
6153 case AMDGPU::S_FLOOR_F16:
6154 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6155 : AMDGPU::V_FLOOR_F16_fake16_e64;
6156 case AMDGPU::S_TRUNC_F16:
6157 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6158 : AMDGPU::V_TRUNC_F16_fake16_e64;
6159 case AMDGPU::S_RNDNE_F16:
6160 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6161 : AMDGPU::V_RNDNE_F16_fake16_e64;
6162 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6163 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6164 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6165 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6166 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6167 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6168 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6169 case AMDGPU::S_ADD_F16:
6170 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6171 : AMDGPU::V_ADD_F16_fake16_e64;
6172 case AMDGPU::S_SUB_F16:
6173 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6174 : AMDGPU::V_SUB_F16_fake16_e64;
6175 case AMDGPU::S_MIN_F16:
6176 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6177 : AMDGPU::V_MIN_F16_fake16_e64;
6178 case AMDGPU::S_MAX_F16:
6179 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6180 : AMDGPU::V_MAX_F16_fake16_e64;
6181 case AMDGPU::S_MINIMUM_F16:
6182 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6183 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6184 case AMDGPU::S_MAXIMUM_F16:
6185 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6186 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6187 case AMDGPU::S_MUL_F16:
6188 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6189 : AMDGPU::V_MUL_F16_fake16_e64;
6190 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6191 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6192 case AMDGPU::S_FMAC_F16:
6193 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6194 : AMDGPU::V_FMAC_F16_fake16_e64;
6195 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6196 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6197 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6198 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6199 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6200 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6201 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6202 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6203 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6204 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6205 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6206 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6207 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6208 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6209 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6210 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6211 case AMDGPU::S_CMP_LT_F16:
6212 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6213 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6214 case AMDGPU::S_CMP_EQ_F16:
6215 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6216 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6217 case AMDGPU::S_CMP_LE_F16:
6218 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6219 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6220 case AMDGPU::S_CMP_GT_F16:
6221 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6222 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6223 case AMDGPU::S_CMP_LG_F16:
6224 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6225 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6226 case AMDGPU::S_CMP_GE_F16:
6227 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6228 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6229 case AMDGPU::S_CMP_O_F16:
6230 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6231 : AMDGPU::V_CMP_O_F16_fake16_e64;
6232 case AMDGPU::S_CMP_U_F16:
6233 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6234 : AMDGPU::V_CMP_U_F16_fake16_e64;
6235 case AMDGPU::S_CMP_NGE_F16:
6236 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6237 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6238 case AMDGPU::S_CMP_NLG_F16:
6239 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6240 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6241 case AMDGPU::S_CMP_NGT_F16:
6242 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6243 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6244 case AMDGPU::S_CMP_NLE_F16:
6245 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6246 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6247 case AMDGPU::S_CMP_NEQ_F16:
6248 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6249 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6250 case AMDGPU::S_CMP_NLT_F16:
6251 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6252 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6253 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6254 case AMDGPU::V_S_EXP_F16_e64:
6255 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6256 : AMDGPU::V_EXP_F16_fake16_e64;
6257 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6258 case AMDGPU::V_S_LOG_F16_e64:
6259 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6260 : AMDGPU::V_LOG_F16_fake16_e64;
6261 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6262 case AMDGPU::V_S_RCP_F16_e64:
6263 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6264 : AMDGPU::V_RCP_F16_fake16_e64;
6265 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6266 case AMDGPU::V_S_RSQ_F16_e64:
6267 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6268 : AMDGPU::V_RSQ_F16_fake16_e64;
6269 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6270 case AMDGPU::V_S_SQRT_F16_e64:
6271 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6272 : AMDGPU::V_SQRT_F16_fake16_e64;
6273 }
6275 "Unexpected scalar opcode without corresponding vector one!");
6276}
6277
6278// clang-format on
6279
6283 const DebugLoc &DL, Register Reg,
6284 bool IsSCCLive,
6285 SlotIndexes *Indexes) const {
6286 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6287 const SIInstrInfo *TII = ST.getInstrInfo();
6289 if (IsSCCLive) {
6290 // Insert two move instructions, one to save the original value of EXEC and
6291 // the other to turn on all bits in EXEC. This is required as we can't use
6292 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6293 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6295 auto FlipExecMI =
6296 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6297 if (Indexes) {
6298 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6299 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6300 }
6301 } else {
6302 auto SaveExec =
6303 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6304 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6305 if (Indexes)
6306 Indexes->insertMachineInstrInMaps(*SaveExec);
6307 }
6308}
6309
6312 const DebugLoc &DL, Register Reg,
6313 SlotIndexes *Indexes) const {
6315 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6316 .addReg(Reg, RegState::Kill);
6317 if (Indexes)
6318 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6319}
6320
6324 "Not a whole wave func");
6325 MachineBasicBlock &MBB = *MF.begin();
6326 for (MachineInstr &MI : MBB)
6327 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6328 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6329 return &MI;
6330
6331 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6332}
6333
6335 unsigned OpNo) const {
6336 const MCInstrDesc &Desc = get(MI.getOpcode());
6337 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6338 Desc.operands()[OpNo].RegClass == -1) {
6339 Register Reg = MI.getOperand(OpNo).getReg();
6340
6341 if (Reg.isVirtual()) {
6342 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6343 return MRI.getRegClass(Reg);
6344 }
6345 return RI.getPhysRegBaseClass(Reg);
6346 }
6347
6348 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6349 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6350}
6351
6354 MachineBasicBlock *MBB = MI.getParent();
6355 MachineOperand &MO = MI.getOperand(OpIdx);
6356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6357 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6358 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6359 unsigned Size = RI.getRegSizeInBits(*RC);
6360 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6361 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6362 : AMDGPU::V_MOV_B32_e32;
6363 if (MO.isReg())
6364 Opcode = AMDGPU::COPY;
6365 else if (RI.isSGPRClass(RC))
6366 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6367
6368 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6369 Register Reg = MRI.createVirtualRegister(VRC);
6370 DebugLoc DL = MBB->findDebugLoc(I);
6371 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6372 MO.ChangeToRegister(Reg, false);
6373}
6374
6377 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6378 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6379 if (!SuperReg.getReg().isVirtual())
6380 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6381
6382 MachineBasicBlock *MBB = MI->getParent();
6383 const DebugLoc &DL = MI->getDebugLoc();
6384 Register SubReg = MRI.createVirtualRegister(SubRC);
6385
6386 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6387 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6388 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6389 return SubReg;
6390}
6391
6394 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6395 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6396 if (Op.isImm()) {
6397 if (SubIdx == AMDGPU::sub0)
6398 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6399 if (SubIdx == AMDGPU::sub1)
6400 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6401
6402 llvm_unreachable("Unhandled register index for immediate");
6403 }
6404
6405 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6406 SubIdx, SubRC);
6407 return MachineOperand::CreateReg(SubReg, false);
6408}
6409
6410// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6411void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6412 assert(Inst.getNumExplicitOperands() == 3);
6413 MachineOperand Op1 = Inst.getOperand(1);
6414 Inst.removeOperand(1);
6415 Inst.addOperand(Op1);
6416}
6417
6419 const MCOperandInfo &OpInfo,
6420 const MachineOperand &MO) const {
6421 if (!MO.isReg())
6422 return false;
6423
6424 Register Reg = MO.getReg();
6425
6426 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6427 if (Reg.isPhysical())
6428 return DRC->contains(Reg);
6429
6430 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6431
6432 if (MO.getSubReg()) {
6433 const MachineFunction *MF = MO.getParent()->getMF();
6434 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6435 if (!SuperRC)
6436 return false;
6437 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6438 }
6439
6440 return RI.getCommonSubClass(DRC, RC) != nullptr;
6441}
6442
6444 const MachineOperand &MO) const {
6445 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6446 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6447 unsigned Opc = MI.getOpcode();
6448
6449 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6450 // information.
6451 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6452 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6453 constexpr AMDGPU::OpName OpNames[] = {
6454 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6455
6456 for (auto [I, OpName] : enumerate(OpNames)) {
6457 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6458 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6460 return false;
6461 }
6462 }
6463
6464 if (!isLegalRegOperand(MRI, OpInfo, MO))
6465 return false;
6466
6467 // check Accumulate GPR operand
6468 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6469 if (IsAGPR && !ST.hasMAIInsts())
6470 return false;
6471 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6472 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6473 return false;
6474 // Atomics should have both vdst and vdata either vgpr or agpr.
6475 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6476 const int DataIdx = AMDGPU::getNamedOperandIdx(
6477 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6478 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6479 MI.getOperand(DataIdx).isReg() &&
6480 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6481 return false;
6482 if ((int)OpIdx == DataIdx) {
6483 if (VDstIdx != -1 &&
6484 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6485 return false;
6486 // DS instructions with 2 src operands also must have tied RC.
6487 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6488 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6489 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6490 return false;
6491 }
6492
6493 // Check V_ACCVGPR_WRITE_B32_e64
6494 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6495 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6496 RI.isSGPRReg(MRI, MO.getReg()))
6497 return false;
6498
6499 if (ST.hasFlatScratchHiInB64InstHazard() &&
6500 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6501 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6502 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6503 64)
6504 return false;
6505 }
6506 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6507 return false;
6508 }
6509 if (!ST.hasDPPSrc1SGPR() && isDPP(MI) && RI.isSGPRReg(MRI, MO.getReg()) &&
6510 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1))
6511 return false;
6512
6513 return true;
6514}
6515
6517 const MCOperandInfo &OpInfo,
6518 const MachineOperand &MO) const {
6519 if (MO.isReg())
6520 return isLegalRegOperand(MRI, OpInfo, MO);
6521
6522 // Handle non-register types that are treated like immediates.
6523 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6524 return true;
6525}
6526
6528 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6529 const MachineOperand *MO) const {
6530 constexpr unsigned NumOps = 3;
6531 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6532 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6533 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6534 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6535
6536 assert(SrcN < NumOps);
6537
6538 if (!MO) {
6539 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6540 if (SrcIdx == -1)
6541 return true;
6542 MO = &MI.getOperand(SrcIdx);
6543 }
6544
6545 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6546 return true;
6547
6548 int ModsIdx =
6549 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6550 if (ModsIdx == -1)
6551 return true;
6552
6553 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6554 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6555 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6556
6557 return !OpSel && !OpSelHi;
6558}
6559
6561 const MachineOperand *MO) const {
6562 const MachineFunction &MF = *MI.getMF();
6563 const MachineRegisterInfo &MRI = MF.getRegInfo();
6564 const MCInstrDesc &InstDesc = MI.getDesc();
6565 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6566 int64_t RegClass = getOpRegClassID(OpInfo);
6567 const TargetRegisterClass *DefinedRC =
6568 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6569 if (!MO)
6570 MO = &MI.getOperand(OpIdx);
6571
6572 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6573
6574 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6575 const MachineOperand *UsedLiteral = nullptr;
6576
6577 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6578 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6579
6580 // TODO: Be more permissive with frame indexes.
6581 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6582 if (!LiteralLimit--)
6583 return false;
6584
6585 UsedLiteral = MO;
6586 }
6587
6589 if (MO->isReg())
6590 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6591
6592 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6593 if (i == OpIdx)
6594 continue;
6595 const MachineOperand &Op = MI.getOperand(i);
6596 if (Op.isReg()) {
6597 if (Op.isUse()) {
6598 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6599 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6600 if (--ConstantBusLimit <= 0)
6601 return false;
6602 }
6603 }
6604 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6605 !isInlineConstant(Op, InstDesc.operands()[i])) {
6606 // The same literal may be used multiple times.
6607 if (!UsedLiteral)
6608 UsedLiteral = &Op;
6609 else if (UsedLiteral->isIdenticalTo(Op))
6610 continue;
6611
6612 if (!LiteralLimit--)
6613 return false;
6614 if (--ConstantBusLimit <= 0)
6615 return false;
6616 }
6617 }
6618 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6619 // There can be at most one literal operand, but it can be repeated.
6620 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6621 if (i == OpIdx)
6622 continue;
6623 const MachineOperand &Op = MI.getOperand(i);
6624 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6625 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6626 !Op.isIdenticalTo(*MO))
6627 return false;
6628
6629 // Do not fold a non-inlineable and non-register operand into an
6630 // instruction that already has a frame index. The frame index handling
6631 // code could not handle well when a frame index co-exists with another
6632 // non-register operand, unless that operand is an inlineable immediate.
6633 if (Op.isFI())
6634 return false;
6635 }
6636 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6637 isF16PseudoScalarTrans(MI.getOpcode())) {
6638 return false;
6639 }
6640
6641 if (MO->isReg()) {
6642 if (!DefinedRC)
6643 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6644 return isLegalRegOperand(MI, OpIdx, *MO);
6645 }
6646
6647 if (MO->isImm()) {
6648 uint64_t Imm = MO->getImm();
6649 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6650 bool Is64BitOp = Is64BitFPOp ||
6651 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6652 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6653 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6654 if (Is64BitOp &&
6655 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6656 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6657 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6658 return false;
6659
6660 // FIXME: We can use sign extended 64-bit literals, but only for signed
6661 // operands. At the moment we do not know if an operand is signed.
6662 // Such operand will be encoded as its low 32 bits and then either
6663 // correctly sign extended or incorrectly zero extended by HW.
6664 // If 64-bit literals are supported and the literal will be encoded
6665 // as full 64 bit we still can use it.
6666 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6667 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6668 return false;
6669 }
6670 }
6671
6672 // Handle non-register types that are treated like immediates.
6673 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6674
6675 if (!DefinedRC) {
6676 // This operand expects an immediate.
6677 return true;
6678 }
6679
6680 return isImmOperandLegal(MI, OpIdx, *MO);
6681}
6682
// Opcode/subtarget classification predicate over MI.
//
// Returns false unless the subtarget reports GFX940 or GFX950 instructions
// and MI is a VALU instruction. For such instructions it returns true for
// TRANS, DOT and MFMA operations and for the V_CVT_PK_*, V_(M)QSAD_* and
// V_PK_* opcodes listed in the switch below; everything else yields false.
//
// NOTE(review): the declaration line of this definition is not visible here;
// presumably this is the "never co-issue" query -- confirm against the header.
// Despite the "Only" suffix, the two flags below simply hold the results of
// hasGFX950Insts() / hasGFX940Insts().
6684 bool IsGFX950Only = ST.hasGFX950Insts();
6685 bool IsGFX940Only = ST.hasGFX940Insts();
6686
6687 if (!IsGFX950Only && !IsGFX940Only)
6688 return false;
6689
6690 if (!isVALU(MI))
6691 return false;
6692
6693 // V_COS, V_EXP, V_RCP, etc.
6694 if (isTRANS(MI))
6695 return true;
6696
6697 // DOT2, DOT2C, DOT4, etc.
6698 if (isDOT(MI))
6699 return true;
6700
6701 // MFMA, SMFMA
6702 if (isMFMA(MI))
6703 return true;
6704
// Remaining opcodes are matched individually.
6705 unsigned Opcode = MI.getOpcode();
6706 switch (Opcode) {
6707 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6708 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6709 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6710 case AMDGPU::V_MQSAD_U32_U8_e64:
6711 case AMDGPU::V_PK_ADD_F16:
6712 case AMDGPU::V_PK_ADD_F32:
6713 case AMDGPU::V_PK_ADD_I16:
6714 case AMDGPU::V_PK_ADD_U16:
6715 case AMDGPU::V_PK_ASHRREV_I16:
6716 case AMDGPU::V_PK_FMA_F16:
6717 case AMDGPU::V_PK_FMA_F32:
6718 case AMDGPU::V_PK_FMAC_F16_e32:
6719 case AMDGPU::V_PK_FMAC_F16_e64:
6720 case AMDGPU::V_PK_LSHLREV_B16:
6721 case AMDGPU::V_PK_LSHRREV_B16:
6722 case AMDGPU::V_PK_MAD_I16:
6723 case AMDGPU::V_PK_MAD_U16:
6724 case AMDGPU::V_PK_MAX_F16:
6725 case AMDGPU::V_PK_MAX_I16:
6726 case AMDGPU::V_PK_MAX_U16:
6727 case AMDGPU::V_PK_MIN_F16:
6728 case AMDGPU::V_PK_MIN_I16:
6729 case AMDGPU::V_PK_MIN_U16:
6730 case AMDGPU::V_PK_MOV_B32:
6731 case AMDGPU::V_PK_MUL_F16:
6732 case AMDGPU::V_PK_MUL_F32:
6733 case AMDGPU::V_PK_MUL_LO_U16:
6734 case AMDGPU::V_PK_SUB_I16:
6735 case AMDGPU::V_PK_SUB_U16:
6736 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6737 return true;
6738 default:
6739 return false;
6740 }
6741}
6742
// Legalize the src0/src1 operands (and src2 for the e32 FMAC forms) of MI.
//
// The steps, in order:
//  1. With an implicit SGPR read and a constant bus limit of at most one, an
//     SGPR src0 is replaced through legalizeOpWithMove.
//  2. V_WRITELANE_B32: VGPR src0/src1 are routed through V_READFIRSTLANE_B32.
//  3. V_FMAC_F32_e32 / V_FMAC_F16_e32: a non-VGPR src2 is moved.
//  4. If src1 is already a legal register operand, nothing more is done.
//  5. V_READLANE_B32: a VGPR lane select is routed through
//     V_READFIRSTLANE_B32.
//  6. Otherwise src0 and src1 are swapped in place (switching MI to the
//     commuted opcode) when that makes src1 legal; if not, src1 is moved.
//
// MI is modified in place; new instructions are inserted before it.
6744 MachineInstr &MI) const {
6745 unsigned Opc = MI.getOpcode();
6746 const MCInstrDesc &InstrDesc = get(Opc);
6747
6748 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6749 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6750
6751 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6752 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6753
6754 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6755 // we need to only have one constant bus use before GFX10.
6756 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6757 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6758 RI.isSGPRReg(MRI, Src0.getReg()))
6759 legalizeOpWithMove(MI, Src0Idx);
6760
6761 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6762 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6763 // src0/src1 with V_READFIRSTLANE.
6764 if (Opc == AMDGPU::V_WRITELANE_B32) {
6765 const DebugLoc &DL = MI.getDebugLoc();
6766 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6767 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6768 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6769 .add(Src0);
6770 Src0.ChangeToRegister(Reg, false);
6771 }
6772 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6773 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6774 const DebugLoc &DL = MI.getDebugLoc();
6775 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6776 .add(Src1);
6777 Src1.ChangeToRegister(Reg, false);
6778 }
6779 return;
6780 }
6781
6782 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6783 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6784 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6785 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6786 legalizeOpWithMove(MI, Src2Idx);
6787 }
6788
6789 // VOP2 src0 instructions support all operand types, so we don't need to check
6790 // their legality. If src1 is already legal, we don't need to do anything.
6791 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6792 return;
6793
6794 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6795 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6796 // select is uniform.
6797 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6798 RI.isVGPR(MRI, Src1.getReg())) {
6799 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6800 const DebugLoc &DL = MI.getDebugLoc();
6801 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6802 .add(Src1);
6803 Src1.ChangeToRegister(Reg, false);
6804 return;
6805 }
6806
6807 // We do not use commuteInstruction here because it is too aggressive and will
6808 // commute if it is possible. We only want to commute here if it improves
6809 // legality. This can be called a fairly large number of times so don't waste
6810 // compile time pointlessly swapping and checking legality again.
6811 if (HasImplicitSGPR || !MI.isCommutable()) {
6812 legalizeOpWithMove(MI, Src1Idx);
6813 return;
6814 }
6815
6816 // If src0 can be used as src1, commuting will make the operands legal.
6817 // Otherwise we have to give up and insert a move.
6818 //
6819 // TODO: Other immediate-like operand kinds could be commuted if there was a
6820 // MachineOperand::ChangeTo* for them.
6821 if ((!Src1.isImm() && !Src1.isReg()) ||
6822 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6823 legalizeOpWithMove(MI, Src1Idx);
6824 return;
6825 }
6826
// A result of -1 is treated as "no commuted opcode"; fall back to a move.
6827 int CommutedOpc = commuteOpcode(MI);
6828 if (CommutedOpc == -1) {
6829 legalizeOpWithMove(MI, Src1Idx);
6830 return;
6831 }
6832
6833 MI.setDesc(get(CommutedOpc));
6834
// Remember src0's register, subregister and kill flag before src0 is
// overwritten with src1's contents, then write them into src1.
6835 Register Src0Reg = Src0.getReg();
6836 unsigned Src0SubReg = Src0.getSubReg();
6837 bool Src0Kill = Src0.isKill();
6838
6839 if (Src1.isImm())
6840 Src0.ChangeToImmediate(Src1.getImm());
6841 else if (Src1.isReg()) {
6842 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6843 Src0.setSubReg(Src1.getSubReg());
6844 } else
6845 llvm_unreachable("Should only have register or immediate operands");
6846
6847 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6848 Src1.setSubReg(Src0SubReg);
6850}
6851
6852// Legalize VOP3 operands. All operand types are supported for any operand
6853// but only one literal constant and only starting from GFX10.
//
// The routine works on the src0/src1/src2 operands of MI in place:
//  - For the V_PERMLANE* opcodes listed below, non-SGPR src1/src2 registers
//    are first routed through V_READFIRSTLANE_B32.
//  - Each source is then checked against the remaining constant bus budget
//    (ST.getConstantBusLimit) and literal budget (one if ST.hasVOP3Literal(),
//    otherwise zero); operands that do not fit are replaced through
//    legalizeOpWithMove.
//  - src2 of V_FMAC_F32_e64 / V_FMAC_F16_e64 is moved if it is not a VGPR.
//  - Packed FP32 operands rejected by isLegalGFX12PlusPackedMathFP32Operand
//    are moved.
6855 MachineInstr &MI) const {
6856 unsigned Opc = MI.getOpcode();
6857
// Operand indices of src0..src2; -1 marks a source the opcode does not have.
6858 int VOP3Idx[3] = {
6859 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6860 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6861 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6862 };
6863
6864 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6865 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6866 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6867 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6868 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6869 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6870 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6871 // src1 and src2 must be scalar
6872 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6873 const DebugLoc &DL = MI.getDebugLoc();
6874 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6875 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6876 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6877 .add(Src1);
6878 Src1.ChangeToRegister(Reg, false);
6879 }
6880 if (VOP3Idx[2] != -1) {
6881 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6882 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6883 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6884 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6885 .add(Src2);
6886 Src2.ChangeToRegister(Reg, false);
6887 }
6888 }
6889 }
6890
6891 // Find the one SGPR operand we are allowed to use.
6892 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6893 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6894 SmallDenseSet<unsigned> SGPRsUsed;
6895 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6896 if (SGPRReg) {
6897 SGPRsUsed.insert(SGPRReg);
6898 --ConstantBusLimit;
6899 }
6900
6901 for (int Idx : VOP3Idx) {
// Stop at the first absent source; later entries are not examined.
6902 if (Idx == -1)
6903 break;
6904 MachineOperand &MO = MI.getOperand(Idx);
6905
6906 if (!MO.isReg()) {
6907 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6908 continue;
6909
// A non-inline immediate is kept only while both budgets are positive.
6910 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6911 --LiteralLimit;
6912 --ConstantBusLimit;
6913 continue;
6914 }
6915
// Both budgets are decremented on this path as well, before the operand
// is replaced by a move.
6916 --LiteralLimit;
6917 --ConstantBusLimit;
6918 legalizeOpWithMove(MI, Idx);
6919 continue;
6920 }
6921
6922 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6923 continue; // VGPRs are legal
6924
6925 // We can use one SGPR in each VOP3 instruction prior to GFX10
6926 // and two starting from GFX10.
// An SGPR that was already counted is not charged a second time.
6927 if (SGPRsUsed.count(MO.getReg()))
6928 continue;
6929 if (ConstantBusLimit > 0) {
6930 SGPRsUsed.insert(MO.getReg());
6931 --ConstantBusLimit;
6932 continue;
6933 }
6934
6935 // If we make it this far, then the operand is not legal and we must
6936 // legalize it.
6937 legalizeOpWithMove(MI, Idx);
6938 }
6939
6940 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6941 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6942 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6943 legalizeOpWithMove(MI, VOP3Idx[2]);
6944
6945 // Fix the register class of packed FP32 instructions on gfx12+. See
6946 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
// NOTE(review): the condition that opens the block closed after this loop is
// not visible here; presumably it restricts the loop to packed FP32
// instructions -- confirm.
6948 for (unsigned I = 0; I < 3; ++I) {
6949 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6950 legalizeOpWithMove(MI, VOP3Idx[I]);
6951 }
6952 }
6953}
6954
// Copy the value of SrcReg into a freshly created SGPR virtual register using
// V_READFIRSTLANE_B32 and return that register. All new instructions are
// inserted before UseMI with UseMI's debug location.
//
// The destination class is the SGPR class equivalent to SrcReg's class; when
// DstRC is non-null it is narrowed to the common subclass of that class and
// DstRC. A source whose class has AGPRs is first copied into the equivalent
// VGPR class. A 32-bit source needs a single readfirstlane; wider sources are
// read one 32-bit channel at a time and reassembled with REG_SEQUENCE.
6957 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6958 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6959 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6960 if (DstRC)
6961 SRC = RI.getCommonSubClass(SRC, DstRC);
6962
6963 Register DstReg = MRI.createVirtualRegister(SRC);
// Number of 32-bit channels in the source register.
6964 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6965
6966 if (RI.hasAGPRs(VRC)) {
6967 VRC = RI.getEquivalentVGPRClass(VRC);
6968 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6969 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6970 get(TargetOpcode::COPY), NewSrcReg)
6971 .addReg(SrcReg);
6972 SrcReg = NewSrcReg;
6973 }
6974
6975 if (SubRegs == 1) {
6976 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6977 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6978 .addReg(SrcReg);
6979 return DstReg;
6980 }
6981
// Read every 32-bit channel into its own SGPR and collect the pieces.
6983 for (unsigned i = 0; i < SubRegs; ++i) {
6984 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6985 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6986 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6987 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6988 SRegs.push_back(SGPR);
6989 }
6990
// Reassemble the pieces into DstReg, channel i going to subregister i.
6992 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6993 get(AMDGPU::REG_SEQUENCE), DstReg);
6994 for (unsigned i = 0; i < SubRegs; ++i) {
6995 MIB.addReg(SRegs[i]);
6996 MIB.addImm(RI.getSubRegFromChannel(i));
6997 }
6998 return DstReg;
6999}
7000
// Legalize the sbase and soffset operands of MI: each one that is present and
// not held in an SGPR is replaced by the result of readlaneVGPRToSGPR.
7002 MachineInstr &MI) const {
7003
7004 // If the pointer is stored in VGPRs, then we need to move them to
7005 // SGPRs using v_readfirstlane. This is safe because we only select
7006 // loads with uniform pointers to SMRD instruction so we know the
7007 // pointer value is uniform.
7008 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
7009 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
7010 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
7011 SBase->setReg(SGPR);
7012 }
// The offset operand, when present, gets the same treatment.
7013 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
7014 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
7015 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
7016 SOff->setReg(SGPR);
7017 }
7018}
7019
// Try to rewrite Inst, in place, from its saddr form to the corresponding
// vaddr form so that the address currently in the saddr operand ends up in
// the vaddr operand.
//
// Returns false and leaves Inst untouched when: Inst has no saddr operand, no
// replacement opcode is found, saddr is already an SGPR, the replacement
// opcode has no vaddr operand, or an existing vaddr operand is not uniquely
// defined by a move-immediate of zero. Returns true after a successful
// rewrite; the zero-defining instruction is erased if it has no remaining
// non-debug uses.
7021 unsigned Opc = Inst.getOpcode();
7022 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
7023 if (OldSAddrIdx < 0)
7024 return false;
7025
7026 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
7027
// NOTE(review): as listed, the two NewOpc checks below are redundant; a
// fallback opcode lookup presumably belongs between them -- confirm against
// the upstream source.
7028 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
7029 if (NewOpc < 0)
7031 if (NewOpc < 0)
7032 return false;
7033
7034 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
7035 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
7036 if (RI.isSGPRReg(MRI, SAddr.getReg()))
7037 return false;
7038
7039 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
7040 if (NewVAddrIdx < 0)
7041 return false;
7042
7043 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
7044
7045 // Check vaddr, it shall be zero or absent.
7046 MachineInstr *VAddrDef = nullptr;
7047 if (OldVAddrIdx >= 0) {
7048 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
7049 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
7050 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
7051 !VAddrDef->getOperand(1).isImm() ||
7052 VAddrDef->getOperand(1).getImm() != 0)
7053 return false;
7054 }
7055
// All checks passed; from here on Inst is mutated.
7056 const MCInstrDesc &NewDesc = get(NewOpc);
7057 Inst.setDesc(NewDesc);
7058
7059 // Callers expect iterator to be valid after this call, so modify the
7060 // instruction in place.
7061 if (OldVAddrIdx == NewVAddrIdx) {
7062 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
7063 // Clear use list from the old vaddr holding a zero register.
7064 MRI.removeRegOperandFromUseList(&NewVAddr);
7065 MRI.moveOperands(&NewVAddr, &SAddr, 1);
7066 Inst.removeOperand(OldSAddrIdx);
7067 // Update the use list with the pointer we have just moved from the saddr
7068 // to the vaddr position. Otherwise new vaddr will be missing from the use list.
7069 MRI.removeRegOperandFromUseList(&NewVAddr);
7070 MRI.addRegOperandToUseList(&NewVAddr);
7071 } else {
// The old saddr slot becomes the new vaddr slot, so only the old vaddr
// operand (if any) has to be dropped.
7072 assert(OldSAddrIdx == NewVAddrIdx);
7073
7074 if (OldVAddrIdx >= 0) {
7075 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
7076 AMDGPU::OpName::vdst_in);
7077
7078 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
7079 // it asserts. Untie the operands for now and retie them afterwards.
7080 if (NewVDstIn != -1) {
7081 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
7082 Inst.untieRegOperand(OldVDstIn);
7083 }
7084
7085 Inst.removeOperand(OldVAddrIdx);
7086
7087 if (NewVDstIn != -1) {
7088 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
7089 Inst.tieOperands(NewVDst, NewVDstIn);
7090 }
7091 }
7092 }
7093
// Drop the zero-defining instruction once nothing but debug uses remain.
7094 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7095 VAddrDef->eraseFromParent();
7096
7097 return true;
7098}
7099
7100// FIXME: Remove this when SelectionDAG is obsoleted.
//
// Legalize the saddr operand of MI. Nothing is done unless MI is a
// segment-specific FLAT instruction or the subtarget has flat GVS mode, and
// MI has an saddr operand that is not in an SGPR class. In that case saddr is
// replaced by the result of readlaneVGPRToSGPR, constrained to the register
// class the instruction description declares for that operand.
7102 MachineInstr &MI) const {
7103 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7104 return;
7105
7106 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7107 // thinks they are uniform, so a readfirstlane should be valid.
7108 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7109 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7110 return;
7111
// NOTE(review): the condition guarding this return is not visible here;
// presumably it first tries to convert MI to its vaddr form -- confirm.
7113 return;
7114
7115 const TargetRegisterClass *DeclaredRC =
7116 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7117
7118 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7119 SAddr->setReg(ToSGPR);
7120}
7121
// Make register operand Op use a register of class DstRC.
//
// If Op's register class (taking its subregister index into account) already
// equals DstRC, nothing happens. Otherwise a COPY into a new DstRC virtual
// register is inserted at I in InsertMBB with debug location DL, and Op is
// rewritten to use the new register. When the original register is defined by
// a move-immediate, foldImmediate is attempted on the new copy (except for
// VReg_1 destinations). Finally an implicit EXEC use is added to the copy
// when DstRC is not an SGPR class, the copy does not already read EXEC, and
// the chain of virtual-register copies feeding it does not end in an
// implicit def.
7124 const TargetRegisterClass *DstRC,
7127 const DebugLoc &DL) const {
7128 Register OpReg = Op.getReg();
7129 unsigned OpSubReg = Op.getSubReg();
7130
7131 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7132 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7133
7134 // Check if operand is already the correct register class.
7135 if (DstRC == OpRC)
7136 return;
7137
7138 Register DstReg = MRI.createVirtualRegister(DstRC);
7139 auto Copy =
7140 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7141 Op.setReg(DstReg);
7142
7143 MachineInstr *Def = MRI.getVRegDef(OpReg);
7144 if (!Def)
7145 return;
7146
7147 // Try to eliminate the copy if it is copying an immediate value.
7148 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7149 foldImmediate(*Copy, *Def, OpReg, &MRI);
7150
// Walk backwards through copies of virtual registers, stopping at a copy
// from a physical register, to find out whether the value ultimately comes
// from an implicit def.
7151 bool ImpDef = Def->isImplicitDef();
7152 while (!ImpDef && Def && Def->isCopy()) {
7153 if (Def->getOperand(1).getReg().isPhysical())
7154 break;
7155 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7156 ImpDef = Def && Def->isImplicitDef();
7157 }
7158 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7159 !ImpDef)
7160 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7161}
7162
7163// Emit the actual waterfall loop, executing the wrapped instruction for each
7164// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7165// iteration, in the worst case we execute 64 (once per lane).
//
// For every operand in \p ScalarOps the loop header (\p LoopBB) reads the
// value of the first active lane with V_READFIRSTLANE_B32, compares it against
// the per-lane value, and ANDs the comparison results of all operands
// together. The combined mask is applied to EXEC with the and-saveexec opcode.
// Each operand is rewritten to the scalar value, or to the matching entry of
// \p PhySGPRs when that entry is valid, in which case a COPY into the
// physical register is inserted before the operand's instruction. The loop
// terminators are appended to the end of \p BodyBB.
//
// Operands are either 32 bits wide or an even number of 32-bit pieces (at
// most 32); wide operands are compared 64 bits at a time.
7168 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7169 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7170 MachineFunction &MF = *LoopBB.getParent();
7172 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7174 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7175
// Running AND of the per-operand comparison masks; invalid until the first
// comparison has been emitted.
7177 Register CondReg;
7178 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7179 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7180 unsigned NumSubRegs = RegSize / 32;
7181 Register VScalarOp = ScalarOp->getReg();
7182
// Register class required for the source operand of V_READFIRSTLANE_B32.
7183 const TargetRegisterClass *RFLSrcRC =
7184 TII.getRegClass(TII.get(AMDGPU::V_READFIRSTLANE_B32), 1);
7185
7186 if (NumSubRegs == 1) {
// Copy the operand into a compatible class first if its own class is not
// already within what V_READFIRSTLANE_B32 accepts.
7187 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(VScalarOp);
7188 if (const TargetRegisterClass *Common =
7189 TRI->getCommonSubClass(VScalarOpRC, RFLSrcRC);
7190 Common != VScalarOpRC) {
7191 Register VRReg = MRI.createVirtualRegister(Common);
7192 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::COPY), VRReg).addReg(VScalarOp);
7193 VScalarOp = VRReg;
7194 }
7195 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7196
7197 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7198 .addReg(VScalarOp);
7199
7200 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7201
7202 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7203 .addReg(CurReg)
7204 .addReg(VScalarOp);
7205
7206 // Combine the comparison results with AND.
7207 if (!CondReg) // First.
7208 CondReg = NewCondReg;
7209 else { // If not the first, we create an AND.
7210 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7211 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7212 .addReg(CondReg)
7213 .addReg(NewCondReg);
7214 CondReg = AndReg;
7215 }
7216
7217 // Update ScalarOp operand to use the SGPR ScalarOp.
7218 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7219 ScalarOp->setReg(CurReg);
7220 else {
7221 // Insert into the same block of use
7222 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7223 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7224 .addReg(CurReg);
7225 ScalarOp->setReg(PhySGPRs[Idx]);
7226 }
7227 ScalarOp->setIsKill();
7228 } else {
7229 SmallVector<Register, 8> ReadlanePieces;
7230 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7231 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7232 "Unhandled register size");
7233
// Process the operand two 32-bit channels (one 64-bit chunk) at a time.
7234 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7235 Register CurRegLo =
7236 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7237 Register CurRegHi =
7238 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7239
7240 // Read the next variant <- also loop target.
7241 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7242 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7243
7244 // Read the next variant <- also loop target.
7245 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7246 .addReg(VScalarOp, VScalarOpUndef,
7247 TRI->getSubRegFromChannel(Idx + 1));
7248
7249 ReadlanePieces.push_back(CurRegLo);
7250 ReadlanePieces.push_back(CurRegHi);
7251
7252 // Comparison is to be done as 64-bit.
7253 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7254 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7255 .addReg(CurRegLo)
7256 .addImm(AMDGPU::sub0)
7257 .addReg(CurRegHi)
7258 .addImm(AMDGPU::sub1);
7259
7260 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7261 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7262 NewCondReg)
7263 .addReg(CurReg);
// A 64-bit operand is compared whole; wider operands are compared
// against the matching two-channel subregister.
7264 if (NumSubRegs <= 2)
7265 Cmp.addReg(VScalarOp);
7266 else
7267 Cmp.addReg(VScalarOp, VScalarOpUndef,
7268 TRI->getSubRegFromChannel(Idx, 2));
7269
7270 // Combine the comparison results with AND.
7271 if (!CondReg) // First.
7272 CondReg = NewCondReg;
7273 else { // If not the first, we create an AND.
7274 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7275 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7276 .addReg(CondReg)
7277 .addReg(NewCondReg);
7278 CondReg = AndReg;
7279 }
7280 } // End for loop.
7281
7282 const auto *SScalarOpRC =
7283 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7284 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7285
7286 // Build scalar ScalarOp.
7287 auto Merge =
7288 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7289 unsigned Channel = 0;
7290 for (Register Piece : ReadlanePieces) {
7291 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7292 }
7293
7294 // Update ScalarOp operand to use the SGPR ScalarOp.
7295 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7296 ScalarOp->setReg(SScalarOp);
7297 else {
7298 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7299 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7300 .addReg(SScalarOp);
7301 ScalarOp->setReg(PhySGPRs[Idx]);
7302 }
7303 ScalarOp->setIsKill();
7304 }
7305 }
7306
7307 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7308 MRI.setSimpleHint(SaveExec, CondReg);
7309
7310 // Update EXEC to matching lanes, saving original to SaveExec.
7311 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7312 .addReg(CondReg, RegState::Kill);
7313
7314 // The original instruction is here; we insert the terminators after it.
7315 I = BodyBB.end();
7316
7317 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7318 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7319 .addReg(LMC.ExecReg)
7320 .addReg(SaveExec);
7321
7322 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7323}
7324
7325// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7326// with SGPRs by iterating over all unique values across all lanes.
7327// Returns the loop basic block that now contains \p MI.
//
// The instructions in [\p Begin, \p End) are moved into the loop body; when
// the iterators are not valid they default to \p MI and the instruction
// following it. The parent block is split into a loop header, a loop body and
// a remainder block. EXEC is saved before the loop and restored at the start
// of the remainder block; SCC is saved and restored as well when the liveness
// query below sets SCCNotDead. \p PhySGPRs must be empty or have one entry per
// scalar operand. When \p MDT is non-null the dominator tree is updated for
// the new blocks.
7328static MachineBasicBlock *
7332 MachineBasicBlock::iterator Begin = nullptr,
7333 MachineBasicBlock::iterator End = nullptr,
7334 ArrayRef<Register> PhySGPRs = {}) {
7335 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7336 "Physical SGPRs must be empty or match the number of scalar operands");
7337 MachineBasicBlock &MBB = *MI.getParent();
7338 MachineFunction &MF = *MBB.getParent();
7340 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7341 MachineRegisterInfo &MRI = MF.getRegInfo();
7342 if (!Begin.isValid())
7343 Begin = &MI;
7344 if (!End.isValid()) {
7345 End = &MI;
7346 ++End;
7347 }
7348 const DebugLoc &DL = MI.getDebugLoc();
7350 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7351
7352 // Save SCC. Waterfall Loop may overwrite SCC.
7353 Register SaveSCCReg;
7354
7355 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7356 // rather than unlimited scan everywhere
7357 bool SCCNotDead =
7358 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7359 std::numeric_limits<unsigned>::max()) !=
7361 if (SCCNotDead) {
// Materialize SCC as 1 or 0 in an SGPR; it is turned back into SCC by the
// S_CMP_LG_U32 emitted in the remainder block.
7362 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7363 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7364 .addImm(1)
7365 .addImm(0);
7366 }
7367
7368 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7369
7370 // Save the EXEC mask
7371 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7372
7373 // Killed uses in the instruction we are waterfalling around will be
7374 // incorrect due to the added control-flow.
7376 ++AfterMI;
7377 for (auto I = Begin; I != AfterMI; I++) {
7378 for (auto &MO : I->all_uses())
7379 MRI.clearKillFlags(MO.getReg());
7380 }
7381
7382 // To insert the loop we need to split the block. Move everything after this
7383 // point to a new block, and insert a new empty block between the two.
7386 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7388 ++MBBI;
7389
7390 MF.insert(MBBI, LoopBB);
7391 MF.insert(MBBI, BodyBB);
7392 MF.insert(MBBI, RemainderBB);
7393
// CFG: MBB -> LoopBB -> BodyBB, with BodyBB branching back to LoopBB or on to
// RemainderBB.
7394 LoopBB->addSuccessor(BodyBB);
7395 BodyBB->addSuccessor(LoopBB);
7396 BodyBB->addSuccessor(RemainderBB);
7397
7398 // Move Begin to MI to the BodyBB, and the remainder of the block to
7399 // RemainderBB.
7400 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7401 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7402 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7403
7404 MBB.addSuccessor(LoopBB);
7405
7406 // Update dominators. We know that MBB immediately dominates LoopBB, that
7407 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7408 // RemainderBB. RemainderBB immediately dominates all of the successors
7409 // transferred to it from MBB that MBB used to properly dominate.
7410 if (MDT) {
7411 MDT->addNewBlock(LoopBB, &MBB);
7412 MDT->addNewBlock(BodyBB, LoopBB);
7413 MDT->addNewBlock(RemainderBB, BodyBB);
7414 for (auto &Succ : RemainderBB->successors()) {
7415 if (MDT->properlyDominates(&MBB, Succ)) {
7416 MDT->changeImmediateDominator(Succ, RemainderBB);
7417 }
7418 }
7419 }
7420
7421 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7422 PhySGPRs);
7423
7424 MachineBasicBlock::iterator First = RemainderBB->begin();
7425 // Restore SCC
7426 if (SCCNotDead) {
7427 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7428 .addReg(SaveSCCReg, RegState::Kill)
7429 .addImm(0);
7430 }
7431
7432 // Restore the EXEC mask
7433 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7434 .addReg(SaveExec);
7435 return BodyBB;
7436}
7437
7438// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
//
// All instructions are inserted before MI. The first tuple element is a
// 64-bit VGPR holding the sub0_sub1 part of Rsrc. The second is a new 128-bit
// SGPR descriptor whose low 64 bits are zero and whose sub2/sub3 dwords hold
// the low/high halves of TII.getDefaultRsrcDataFormat().
7439static std::tuple<unsigned, unsigned>
7441 MachineBasicBlock &MBB = *MI.getParent();
7442 MachineFunction &MF = *MBB.getParent();
7443 MachineRegisterInfo &MRI = MF.getRegInfo();
7444
7445 // Extract the ptr from the resource descriptor.
7446 unsigned RsrcPtr =
7447 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7448 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7449
7450 // Create an empty resource descriptor
7451 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7452 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7453 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7454 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7455 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7456
7457 // Zero64 = 0
7458 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7459 .addImm(0);
7460
7461 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7462 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7463 .addImm(Lo_32(RsrcDataFormat));
7464
7465 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7466 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7467 .addImm(Hi_32(RsrcDataFormat));
7468
7469 // NewSRsrc = {Zero64, SRsrcFormat}
7470 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7471 .addReg(Zero64)
7472 .addImm(AMDGPU::sub0_sub1)
7473 .addReg(SRsrcFormatLo)
7474 .addImm(AMDGPU::sub2)
7475 .addReg(SRsrcFormatHi)
7476 .addImm(AMDGPU::sub3);
7477
7478 return std::tuple(RsrcPtr, NewSRsrc);
7479}
7480
7483 MachineDominatorTree *MDT) const {
7484 MachineFunction &MF = *MI.getMF();
7485 MachineRegisterInfo &MRI = MF.getRegInfo();
7486 MachineBasicBlock *CreatedBB = nullptr;
7487
7488 // Legalize VOP2
7489 if (isVOP2(MI) || isVOPC(MI)) {
7491 return CreatedBB;
7492 }
7493
7494 // Legalize VOP3
7495 if (isVOP3(MI)) {
7497 return CreatedBB;
7498 }
7499
7500 // Legalize SMRD
7501 if (isSMRD(MI)) {
7503 return CreatedBB;
7504 }
7505
7506 // Legalize FLAT
7507 if (isFLAT(MI)) {
7509 return CreatedBB;
7510 }
7511
7512 // Legalize PHI
7513 // The register class of the operands must be the same type as the register
7514 // class of the output.
7515 if (MI.getOpcode() == AMDGPU::PHI) {
7516 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7517 assert(!RI.isSGPRClass(VRC));
7518
7519 // Update all the operands so they have the same type.
7520 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7521 MachineOperand &Op = MI.getOperand(I);
7522 if (!Op.isReg() || !Op.getReg().isVirtual())
7523 continue;
7524
7525 // MI is a PHI instruction.
7526 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7528
7529 // Avoid creating no-op copies with the same src and dst reg class. These
7530 // confuse some of the machine passes.
7531 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7532 }
7533 }
7534
7535 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7536 // VGPR dest type and SGPR sources, insert copies so all operands are
7537 // VGPRs. This seems to help operand folding / the register coalescer.
7538 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7539 MachineBasicBlock *MBB = MI.getParent();
7540 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7541 if (RI.hasVGPRs(DstRC)) {
7542 // Update all the operands so they are VGPR register classes. These may
7543 // not be the same register class because REG_SEQUENCE supports mixing
7544 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7545 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7546 MachineOperand &Op = MI.getOperand(I);
7547 if (!Op.isReg() || !Op.getReg().isVirtual())
7548 continue;
7549
7550 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7551 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7552 if (VRC == OpRC)
7553 continue;
7554
7555 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7556 Op.setIsKill();
7557 }
7558 }
7559
7560 return CreatedBB;
7561 }
7562
7563 // Legalize INSERT_SUBREG
7564 // src0 must have the same register class as dst
7565 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7566 Register Dst = MI.getOperand(0).getReg();
7567 Register Src0 = MI.getOperand(1).getReg();
7568 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7569 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7570 if (DstRC != Src0RC) {
7571 MachineBasicBlock *MBB = MI.getParent();
7572 MachineOperand &Op = MI.getOperand(1);
7573 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7574 }
7575 return CreatedBB;
7576 }
7577
7578 // Legalize SI_INIT_M0
7579 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7580 MachineOperand &Src = MI.getOperand(0);
7581 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7582 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7583 return CreatedBB;
7584 }
7585
7586 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7587 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7588 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7589 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7590 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7591 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7592 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7593 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7594 MachineOperand &Src = MI.getOperand(1);
7595 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7596 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7597 return CreatedBB;
7598 }
7599
7600 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7601 //
7602 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7603 // scratch memory access. In both cases, the legalization never involves
7604 // conversion to the addr64 form.
7606 (isMUBUF(MI) || isMTBUF(MI)))) {
7607 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7608 ? AMDGPU::OpName::rsrc
7609 : AMDGPU::OpName::srsrc;
7610 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7611 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7612 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7613
7614 AMDGPU::OpName SampOpName =
7615 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7616 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7617 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7618 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7619
7620 return CreatedBB;
7621 }
7622
7623 // Legalize SI_CALL
7624 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7625 MachineOperand *Dest = &MI.getOperand(0);
7626 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7627 createWaterFallForSiCall(&MI, MDT, {Dest});
7628 }
7629 }
7630
7631 // Legalize s_sleep_var.
7632 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7633 const DebugLoc &DL = MI.getDebugLoc();
7634 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7635 int Src0Idx =
7636 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7637 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7638 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7639 .add(Src0);
7640 Src0.ChangeToRegister(Reg, false);
7641 return nullptr;
7642 }
7643
7644 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7645 // operands are scalar.
7646 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7647 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7648 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7649 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7650 for (MachineOperand &Src : MI.explicit_operands()) {
7651 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7652 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7653 }
7654 return CreatedBB;
7655 }
7656
7657 // Legalize MUBUF instructions.
7658 bool isSoffsetLegal = true;
7659 int SoffsetIdx =
7660 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7661 if (SoffsetIdx != -1) {
7662 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7663 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7664 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7665 isSoffsetLegal = false;
7666 }
7667 }
7668
7669 bool isRsrcLegal = true;
7670 int RsrcIdx =
7671 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7672 if (RsrcIdx != -1) {
7673 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7674 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7675 isRsrcLegal = false;
7676 }
7677
7678 // The operands are legal.
7679 if (isRsrcLegal && isSoffsetLegal)
7680 return CreatedBB;
7681
7682 if (!isRsrcLegal) {
7683 // Legalize a VGPR Rsrc
7684 //
7685 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7686 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7687 // a zero-value SRsrc.
7688 //
7689 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7690 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7691 // above.
7692 //
7693 // Otherwise we are on non-ADDR64 hardware, and/or we have
7694 // idxen/offen/bothen and we fall back to a waterfall loop.
7695
7696 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7697 MachineBasicBlock &MBB = *MI.getParent();
7698
7699 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7700 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7701 // This is already an ADDR64 instruction so we need to add the pointer
7702 // extracted from the resource descriptor to the current value of VAddr.
7703 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7704 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7705 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7706
7707 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7708 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7709 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7710
7711 unsigned RsrcPtr, NewSRsrc;
7712 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7713
7714 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7715 const DebugLoc &DL = MI.getDebugLoc();
7716 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7717 .addDef(CondReg0)
7718 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7719 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7720 .addImm(0);
7721
7722 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7723 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7724 .addDef(CondReg1, RegState::Dead)
7725 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7726 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7727 .addReg(CondReg0, RegState::Kill)
7728 .addImm(0);
7729
7730 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7731 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7732 .addReg(NewVAddrLo)
7733 .addImm(AMDGPU::sub0)
7734 .addReg(NewVAddrHi)
7735 .addImm(AMDGPU::sub1);
7736
7737 VAddr->setReg(NewVAddr);
7738 Rsrc->setReg(NewSRsrc);
7739 } else if (!VAddr && ST.hasAddr64()) {
7740 // This instruction is the _OFFSET variant, so we need to convert it to
7741 // ADDR64.
7742 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7743 "FIXME: Need to emit flat atomics here");
7744
7745 unsigned RsrcPtr, NewSRsrc;
7746 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7747
7748 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7749 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7750 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7751 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7752 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7753
7754 // Atomics with return have an additional tied operand and are
7755 // missing some of the special bits.
7756 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7757 MachineInstr *Addr64;
7758
7759 if (!VDataIn) {
7760 // Regular buffer load / store.
7762 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7763 .add(*VData)
7764 .addReg(NewVAddr)
7765 .addReg(NewSRsrc)
7766 .add(*SOffset)
7767 .add(*Offset);
7768
7769 if (const MachineOperand *CPol =
7770 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7771 MIB.addImm(CPol->getImm());
7772 }
7773
7774 if (const MachineOperand *TFE =
7775 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7776 MIB.addImm(TFE->getImm());
7777 }
7778
7779 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7780
7781 MIB.cloneMemRefs(MI);
7782 Addr64 = MIB;
7783 } else {
7784 // Atomics with return.
7785 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7786 .add(*VData)
7787 .add(*VDataIn)
7788 .addReg(NewVAddr)
7789 .addReg(NewSRsrc)
7790 .add(*SOffset)
7791 .add(*Offset)
7792 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7793 .cloneMemRefs(MI);
7794 }
7795
7796 MI.removeFromParent();
7797
7798 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7799 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7800 NewVAddr)
7801 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7802 .addImm(AMDGPU::sub0)
7803 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7804 .addImm(AMDGPU::sub1);
7805 } else {
7806 // Legalize a VGPR Rsrc and soffset together.
7807 if (!isSoffsetLegal) {
7808 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7809 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7810 return CreatedBB;
7811 }
7812 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7813 return CreatedBB;
7814 }
7815 }
7816
7817 // Legalize a VGPR soffset.
7818 if (!isSoffsetLegal) {
7819 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7820 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7821 return CreatedBB;
7822 }
7823 return CreatedBB;
7824}
7825
7827 InstrList.insert(MI);
7828 // Add MBUF instructions to deferred list.
7829 int RsrcIdx =
7830 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7831 if (RsrcIdx != -1) {
7832 DeferredList.insert(MI);
7833 }
7834}
7835
7837 return DeferredList.contains(MI);
7838}
7839
7840 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7841 // lowering (changing SGPRs to VGPRs).
7842 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7843 // different sizes. The operand sizes need to be legalized along the VGPR
7844 // lowering chain. This can be removed once sgpr16 is in place.
7846 MachineRegisterInfo &MRI) const {
7847 if (!ST.useRealTrue16Insts())
7848 return;
7849
7850 unsigned Opcode = MI.getOpcode();
7851 MachineBasicBlock *MBB = MI.getParent();
7852 // Legalize operands and check for size mismatch
7853 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7854 OpIdx >= get(Opcode).getNumOperands() ||
7855 get(Opcode).operands()[OpIdx].RegClass == -1)
7856 return;
7857
7858 MachineOperand &Op = MI.getOperand(OpIdx);
7859 if (!Op.isReg() || !Op.getReg().isVirtual())
7860 return;
7861
7862 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7863 if (!RI.isVGPRClass(CurrRC))
7864 return;
7865
7866 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7867 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7868 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7869 Op.setSubReg(AMDGPU::lo16);
7870 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7871 const DebugLoc &DL = MI.getDebugLoc();
7872 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7873 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7874 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7875 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7876 .addReg(Op.getReg())
7877 .addImm(AMDGPU::lo16)
7878 .addReg(Undef)
7879 .addImm(AMDGPU::hi16);
7880 Op.setReg(NewDstReg);
7881 }
7882}
7884 MachineRegisterInfo &MRI) const {
7885 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7887}
7888
7892 ArrayRef<Register> PhySGPRs) const {
7893 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7894 "This only handle waterfall for SI_CALL_ISEL");
7895 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
7896 // copies that follow, into the loop block. Copies from and to physical
7897 // registers are part of that range, so they need to be moved into the
7898 // loop block as well.
7899 MachineBasicBlock &MBB = *MI->getParent();
7901 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7902 --Start;
7904 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7905 ++End;
7906
7907 // Also include following copies of the return value
7908 ++End;
7909 while (End != MBB.end() && End->isCopy() &&
7910 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7911 ++End;
7912
7913 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7914}
7915
7917 MachineDominatorTree *MDT) const {
7919 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
7920 while (!Worklist.empty()) {
7921 MachineInstr &Inst = *Worklist.top();
7922 Worklist.erase_top();
7923 // Skip MachineInstr in the deferred list.
7924 if (Worklist.isDeferred(&Inst))
7925 continue;
7926 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7927 }
7928
7929 // Deferred list of instructions will be processed once
7930 // all the MachineInstr in the worklist are done.
7931 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7932 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7933 assert(Worklist.empty() &&
7934 "Deferred MachineInstr are not supposed to re-populate worklist");
7935 }
7936
7937 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7938 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7939 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7940 Entry.second.SGPRs);
7941 }
7942
7943 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7944 if (Entry.second)
7945 Entry.first->eraseFromParent();
7946}
7948 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7949 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7950 // hope for the best.
7951 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
7952 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
7953 if (SubRegIndices.size() <= 1) {
7954 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7955 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7956 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7957 .add(Inst.getOperand(1));
7958 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7959 DstReg)
7960 .addReg(NewDst);
7961 } else {
7963 for (int16_t Indice : SubRegIndices) {
7964 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7965 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7966 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7967 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7968
7969 DstRegs.push_back(NewDst);
7970 }
7972 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7973 get(AMDGPU::REG_SEQUENCE), DstReg);
7974 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7975 MIB.addReg(DstRegs[i]);
7976 MIB.addImm(RI.getSubRegFromChannel(i));
7977 }
7978 }
7979}
7980
7982 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7985 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7986 if (DstReg == AMDGPU::M0) {
7987 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7988 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7989 return;
7990 }
7991 Register SrcReg = Inst.getOperand(1).getReg();
7994 // Only search the current block since a physreg's def and use cannot cross
7995 // blocks when MF.NoPhi = false.
7996 while (++I != E) {
7997 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7998 // and record the operand for later waterfall loop generation.
7999 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
8000 MachineInstr *UseMI = &*I;
8001 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
8002 if (UseMI->getOperand(i).isReg() &&
8003 UseMI->getOperand(i).getReg() == DstReg) {
8004 MachineOperand *MO = &UseMI->getOperand(i);
8005 MO->setReg(SrcReg);
8006 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
8007 V2SCopyInfo.MOs.push_back(MO);
8008 V2SCopyInfo.SGPRs.push_back(DstReg);
8009 V2SPhyCopiesToErase.try_emplace(&Inst, true);
8010 }
8011 }
8012 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
8013 I->getOperand(0).isReg() &&
8014 I->getOperand(0).getReg() == DstReg) {
8015 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
8016 V2SPhyCopiesToErase.try_emplace(&Inst, true);
8017 } else if (I->readsRegister(DstReg, &RI)) {
8018 // COPY cannot be erased if another type of instruction uses it.
8019 V2SPhyCopiesToErase[&Inst] = false;
8020 }
8021 if (I->findRegisterDefOperand(DstReg, &RI))
8022 break;
8023 }
8024}
8025
8027 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
8029 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
8030
8032 if (!MBB)
8033 return;
8034 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
8035 unsigned Opcode = Inst.getOpcode();
8036 unsigned NewOpcode = getVALUOp(Inst);
8037 const DebugLoc &DL = Inst.getDebugLoc();
8038
8039 // Handle some special cases
8040 switch (Opcode) {
8041 default:
8042 break;
8043 case AMDGPU::S_ADD_I32:
8044 case AMDGPU::S_SUB_I32: {
8045 // FIXME: The u32 versions currently selected use the carry.
8046 bool Changed;
8047 MachineBasicBlock *CreatedBBTmp = nullptr;
8048 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
8049 if (Changed)
8050 return;
8051
8052 // Default handling
8053 break;
8054 }
8055
8056 case AMDGPU::S_MUL_U64:
8057 if (ST.hasVMulU64Inst()) {
8058 NewOpcode = AMDGPU::V_MUL_U64_e64;
8059 break;
8060 }
8061 // Split s_mul_u64 into 32-bit vector multiplications.
8062 splitScalarSMulU64(Worklist, Inst, MDT);
8063 Inst.eraseFromParent();
8064 return;
8065
8066 case AMDGPU::S_MUL_U64_U32_PSEUDO:
8067 case AMDGPU::S_MUL_I64_I32_PSEUDO:
8068 // This is a special case of s_mul_u64 where all the operands are either
8069 // zero extended or sign extended.
8070 splitScalarSMulPseudo(Worklist, Inst, MDT);
8071 Inst.eraseFromParent();
8072 return;
8073
8074 case AMDGPU::S_AND_B64:
8075 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
8076 Inst.eraseFromParent();
8077 return;
8078
8079 case AMDGPU::S_OR_B64:
8080 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
8081 Inst.eraseFromParent();
8082 return;
8083
8084 case AMDGPU::S_XOR_B64:
8085 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
8086 Inst.eraseFromParent();
8087 return;
8088
8089 case AMDGPU::S_NAND_B64:
8090 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
8091 Inst.eraseFromParent();
8092 return;
8093
8094 case AMDGPU::S_NOR_B64:
8095 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
8096 Inst.eraseFromParent();
8097 return;
8098
8099 case AMDGPU::S_XNOR_B64:
8100 if (ST.hasDLInsts())
8101 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
8102 else
8103 splitScalar64BitXnor(Worklist, Inst, MDT);
8104 Inst.eraseFromParent();
8105 return;
8106
8107 case AMDGPU::S_ANDN2_B64:
8108 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8109 Inst.eraseFromParent();
8110 return;
8111
8112 case AMDGPU::S_ORN2_B64:
8113 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8114 Inst.eraseFromParent();
8115 return;
8116
8117 case AMDGPU::S_BREV_B64:
8118 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8119 Inst.eraseFromParent();
8120 return;
8121
8122 case AMDGPU::S_NOT_B64:
8123 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8124 Inst.eraseFromParent();
8125 return;
8126
8127 case AMDGPU::S_BCNT1_I32_B64:
8128 splitScalar64BitBCNT(Worklist, Inst);
8129 Inst.eraseFromParent();
8130 return;
8131
8132 case AMDGPU::S_BFE_I64:
8133 splitScalar64BitBFE(Worklist, Inst);
8134 Inst.eraseFromParent();
8135 return;
8136
8137 case AMDGPU::S_FLBIT_I32_B64:
8138 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8139 Inst.eraseFromParent();
8140 return;
8141 case AMDGPU::S_FF1_I32_B64:
8142 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8143 Inst.eraseFromParent();
8144 return;
8145
8146 case AMDGPU::S_LSHL_B32:
8147 if (ST.hasOnlyRevVALUShifts()) {
8148 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8149 swapOperands(Inst);
8150 }
8151 break;
8152 case AMDGPU::S_ASHR_I32:
8153 if (ST.hasOnlyRevVALUShifts()) {
8154 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8155 swapOperands(Inst);
8156 }
8157 break;
8158 case AMDGPU::S_LSHR_B32:
8159 if (ST.hasOnlyRevVALUShifts()) {
8160 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8161 swapOperands(Inst);
8162 }
8163 break;
8164 case AMDGPU::S_LSHL_B64:
8165 if (ST.hasOnlyRevVALUShifts()) {
8166 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8167 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8168 : AMDGPU::V_LSHLREV_B64_e64;
8169 swapOperands(Inst);
8170 }
8171 break;
8172 case AMDGPU::S_ASHR_I64:
8173 if (ST.hasOnlyRevVALUShifts()) {
8174 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8175 swapOperands(Inst);
8176 }
8177 break;
8178 case AMDGPU::S_LSHR_B64:
8179 if (ST.hasOnlyRevVALUShifts()) {
8180 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8181 swapOperands(Inst);
8182 }
8183 break;
8184
8185 case AMDGPU::S_ABS_I32:
8186 lowerScalarAbs(Worklist, Inst);
8187 Inst.eraseFromParent();
8188 return;
8189
8190 case AMDGPU::S_ABSDIFF_I32:
8191 lowerScalarAbsDiff(Worklist, Inst);
8192 Inst.eraseFromParent();
8193 return;
8194
8195 case AMDGPU::S_CBRANCH_SCC0:
8196 case AMDGPU::S_CBRANCH_SCC1: {
8197 // Clear unused bits of vcc
8198 Register CondReg = Inst.getOperand(1).getReg();
8199 bool IsSCC = CondReg == AMDGPU::SCC;
8201 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8202 .addReg(LMC.ExecReg)
8203 .addReg(IsSCC ? LMC.VccReg : CondReg);
8204 Inst.removeOperand(1);
8205 } break;
8206
8207 case AMDGPU::S_BFE_U64:
8208 case AMDGPU::S_BFM_B64:
8209 llvm_unreachable("Moving this op to VALU not implemented");
8210
8211 case AMDGPU::S_PACK_LL_B32_B16:
8212 case AMDGPU::S_PACK_LH_B32_B16:
8213 case AMDGPU::S_PACK_HL_B32_B16:
8214 case AMDGPU::S_PACK_HH_B32_B16:
8215 movePackToVALU(Worklist, MRI, Inst);
8216 Inst.eraseFromParent();
8217 return;
8218
8219 case AMDGPU::S_XNOR_B32:
8220 lowerScalarXnor(Worklist, Inst);
8221 Inst.eraseFromParent();
8222 return;
8223
8224 case AMDGPU::S_NAND_B32:
8225 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8226 Inst.eraseFromParent();
8227 return;
8228
8229 case AMDGPU::S_NOR_B32:
8230 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8231 Inst.eraseFromParent();
8232 return;
8233
8234 case AMDGPU::S_ANDN2_B32:
8235 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8236 Inst.eraseFromParent();
8237 return;
8238
8239 case AMDGPU::S_ORN2_B32:
8240 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8241 Inst.eraseFromParent();
8242 return;
8243
8244 // TODO: remove as soon as everything is ready
8245 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8246 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8247 // can only be selected from the uniform SDNode.
8248 case AMDGPU::S_ADD_CO_PSEUDO:
8249 case AMDGPU::S_SUB_CO_PSEUDO: {
8250 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8251 ? AMDGPU::V_ADDC_U32_e64
8252 : AMDGPU::V_SUBB_U32_e64;
8253 const auto *CarryRC = RI.getWaveMaskRegClass();
8254
8255 Register CarryInReg = Inst.getOperand(4).getReg();
8256 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8257 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8258 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8259 .addReg(CarryInReg);
8260 }
8261
8262 Register CarryOutReg = Inst.getOperand(1).getReg();
8263
8264 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8265 MRI.getRegClass(Inst.getOperand(0).getReg())));
8266 MachineInstr *CarryOp =
8267 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8268 .addReg(CarryOutReg, RegState::Define)
8269 .add(Inst.getOperand(2))
8270 .add(Inst.getOperand(3))
8271 .addReg(CarryInReg)
8272 .addImm(0);
8273 legalizeOperands(*CarryOp);
8274 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8275 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8276 Inst.eraseFromParent();
8277 }
8278 return;
8279 case AMDGPU::S_UADDO_PSEUDO:
8280 case AMDGPU::S_USUBO_PSEUDO: {
8281 MachineOperand &Dest0 = Inst.getOperand(0);
8282 MachineOperand &Dest1 = Inst.getOperand(1);
8283 MachineOperand &Src0 = Inst.getOperand(2);
8284 MachineOperand &Src1 = Inst.getOperand(3);
8285
8286 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8287 ? AMDGPU::V_ADD_CO_U32_e64
8288 : AMDGPU::V_SUB_CO_U32_e64;
8289 const TargetRegisterClass *NewRC =
8290 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8291 Register DestReg = MRI.createVirtualRegister(NewRC);
8292 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8293 .addReg(Dest1.getReg(), RegState::Define)
8294 .add(Src0)
8295 .add(Src1)
8296 .addImm(0); // clamp bit
8297
8298 legalizeOperands(*NewInstr, MDT);
8299 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8300 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8301 Inst.eraseFromParent();
8302 }
8303 return;
8304 case AMDGPU::S_LSHL1_ADD_U32:
8305 case AMDGPU::S_LSHL2_ADD_U32:
8306 case AMDGPU::S_LSHL3_ADD_U32:
8307 case AMDGPU::S_LSHL4_ADD_U32: {
8308 MachineOperand &Dest = Inst.getOperand(0);
8309 MachineOperand &Src0 = Inst.getOperand(1);
8310 MachineOperand &Src1 = Inst.getOperand(2);
8311 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8312 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8313 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8314 : 4);
8315
8316 const TargetRegisterClass *NewRC =
8317 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8318 Register DestReg = MRI.createVirtualRegister(NewRC);
8319 MachineInstr *NewInstr =
8320 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8321 .add(Src0)
8322 .addImm(ShiftAmt)
8323 .add(Src1);
8324
8325 legalizeOperands(*NewInstr, MDT);
8326 MRI.replaceRegWith(Dest.getReg(), DestReg);
8327 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8328 Inst.eraseFromParent();
8329 }
8330 return;
8331 case AMDGPU::S_CSELECT_B32:
8332 case AMDGPU::S_CSELECT_B64:
8333 lowerSelect(Worklist, Inst, MDT);
8334 Inst.eraseFromParent();
8335 return;
8336 case AMDGPU::S_CMP_EQ_I32:
8337 case AMDGPU::S_CMP_LG_I32:
8338 case AMDGPU::S_CMP_GT_I32:
8339 case AMDGPU::S_CMP_GE_I32:
8340 case AMDGPU::S_CMP_LT_I32:
8341 case AMDGPU::S_CMP_LE_I32:
8342 case AMDGPU::S_CMP_EQ_U32:
8343 case AMDGPU::S_CMP_LG_U32:
8344 case AMDGPU::S_CMP_GT_U32:
8345 case AMDGPU::S_CMP_GE_U32:
8346 case AMDGPU::S_CMP_LT_U32:
8347 case AMDGPU::S_CMP_LE_U32:
8348 case AMDGPU::S_CMP_EQ_U64:
8349 case AMDGPU::S_CMP_LG_U64:
8350 case AMDGPU::S_CMP_LT_F32:
8351 case AMDGPU::S_CMP_EQ_F32:
8352 case AMDGPU::S_CMP_LE_F32:
8353 case AMDGPU::S_CMP_GT_F32:
8354 case AMDGPU::S_CMP_LG_F32:
8355 case AMDGPU::S_CMP_GE_F32:
8356 case AMDGPU::S_CMP_O_F32:
8357 case AMDGPU::S_CMP_U_F32:
8358 case AMDGPU::S_CMP_NGE_F32:
8359 case AMDGPU::S_CMP_NLG_F32:
8360 case AMDGPU::S_CMP_NGT_F32:
8361 case AMDGPU::S_CMP_NLE_F32:
8362 case AMDGPU::S_CMP_NEQ_F32:
8363 case AMDGPU::S_CMP_NLT_F32: {
8364 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8365 auto NewInstr =
8366 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8367 .setMIFlags(Inst.getFlags());
8368 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8369 0) {
8370 NewInstr
8371 .addImm(0) // src0_modifiers
8372 .add(Inst.getOperand(0)) // src0
8373 .addImm(0) // src1_modifiers
8374 .add(Inst.getOperand(1)) // src1
8375 .addImm(0); // clamp
8376 } else {
8377 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8378 }
8379 legalizeOperands(*NewInstr, MDT);
8380 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8381 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8382 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8383 Inst.eraseFromParent();
8384 return;
8385 }
8386 case AMDGPU::S_CMP_LT_F16:
8387 case AMDGPU::S_CMP_EQ_F16:
8388 case AMDGPU::S_CMP_LE_F16:
8389 case AMDGPU::S_CMP_GT_F16:
8390 case AMDGPU::S_CMP_LG_F16:
8391 case AMDGPU::S_CMP_GE_F16:
8392 case AMDGPU::S_CMP_O_F16:
8393 case AMDGPU::S_CMP_U_F16:
8394 case AMDGPU::S_CMP_NGE_F16:
8395 case AMDGPU::S_CMP_NLG_F16:
8396 case AMDGPU::S_CMP_NGT_F16:
8397 case AMDGPU::S_CMP_NLE_F16:
8398 case AMDGPU::S_CMP_NEQ_F16:
8399 case AMDGPU::S_CMP_NLT_F16: {
8400 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8401 auto NewInstr =
8402 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8403 .setMIFlags(Inst.getFlags());
8404 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8405 NewInstr
8406 .addImm(0) // src0_modifiers
8407 .add(Inst.getOperand(0)) // src0
8408 .addImm(0) // src1_modifiers
8409 .add(Inst.getOperand(1)) // src1
8410 .addImm(0); // clamp
8411 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8412 NewInstr.addImm(0); // op_sel0
8413 } else {
8414 NewInstr
8415 .add(Inst.getOperand(0))
8416 .add(Inst.getOperand(1));
8417 }
8418 legalizeOperandsVALUt16(*NewInstr, MRI);
8419 legalizeOperands(*NewInstr, MDT);
8420 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8421 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8422 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8423 Inst.eraseFromParent();
8424 return;
8425 }
8426 case AMDGPU::S_CVT_HI_F32_F16: {
8427 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8428 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8429 if (ST.useRealTrue16Insts()) {
8430 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8431 .add(Inst.getOperand(1));
8432 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8433 .addImm(0) // src0_modifiers
8434 .addReg(TmpReg, {}, AMDGPU::hi16)
8435 .addImm(0) // clamp
8436 .addImm(0) // omod
8437 .addImm(0); // op_sel0
8438 } else {
8439 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8440 .addImm(16)
8441 .add(Inst.getOperand(1));
8442 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8443 .addImm(0) // src0_modifiers
8444 .addReg(TmpReg)
8445 .addImm(0) // clamp
8446 .addImm(0); // omod
8447 }
8448
8449 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8450 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8451 Inst.eraseFromParent();
8452 return;
8453 }
8454 case AMDGPU::S_MINIMUM_F32:
8455 case AMDGPU::S_MAXIMUM_F32: {
8456 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8457 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8458 .addImm(0) // src0_modifiers
8459 .add(Inst.getOperand(1))
8460 .addImm(0) // src1_modifiers
8461 .add(Inst.getOperand(2))
8462 .addImm(0) // clamp
8463 .addImm(0); // omod
8464 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8465
8466 legalizeOperands(*NewInstr, MDT);
8467 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8468 Inst.eraseFromParent();
8469 return;
8470 }
8471 case AMDGPU::S_MINIMUM_F16:
8472 case AMDGPU::S_MAXIMUM_F16: {
8473 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8474 ? &AMDGPU::VGPR_16RegClass
8475 : &AMDGPU::VGPR_32RegClass);
8476 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8477 .addImm(0) // src0_modifiers
8478 .add(Inst.getOperand(1))
8479 .addImm(0) // src1_modifiers
8480 .add(Inst.getOperand(2))
8481 .addImm(0) // clamp
8482 .addImm(0) // omod
8483 .addImm(0); // opsel0
8484 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8485 legalizeOperandsVALUt16(*NewInstr, MRI);
8486 legalizeOperands(*NewInstr, MDT);
8487 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8488 Inst.eraseFromParent();
8489 return;
8490 }
8491 case AMDGPU::V_S_EXP_F16_e64:
8492 case AMDGPU::V_S_LOG_F16_e64:
8493 case AMDGPU::V_S_RCP_F16_e64:
8494 case AMDGPU::V_S_RSQ_F16_e64:
8495 case AMDGPU::V_S_SQRT_F16_e64: {
8496 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8497 ? &AMDGPU::VGPR_16RegClass
8498 : &AMDGPU::VGPR_32RegClass);
8499 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8500 .add(Inst.getOperand(1)) // src0_modifiers
8501 .add(Inst.getOperand(2))
8502 .add(Inst.getOperand(3)) // clamp
8503 .add(Inst.getOperand(4)) // omod
8504 .setMIFlags(Inst.getFlags());
8505 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8506 NewInstr.addImm(0); // opsel0
8507 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8508 legalizeOperandsVALUt16(*NewInstr, MRI);
8509 legalizeOperands(*NewInstr, MDT);
8510 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8511 Inst.eraseFromParent();
8512 return;
8513 }
8514 }
8515
8516 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8517 // We cannot move this instruction to the VALU, so we should try to
8518 // legalize its operands instead.
8519 legalizeOperands(Inst, MDT);
8520 return;
8521 }
8522 // Handle converting generic instructions like COPY-to-SGPR into
8523 // COPY-to-VGPR.
8524 if (NewOpcode == Opcode) {
8525 Register DstReg = Inst.getOperand(0).getReg();
8526 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8527
8528 if (Inst.isCopy() && DstReg.isPhysical() &&
8529 Inst.getOperand(1).getReg().isVirtual()) {
8530 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8531 V2SPhyCopiesToErase);
8532 return;
8533 }
8534
8535 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8536 Register NewDstReg = Inst.getOperand(1).getReg();
8537 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8538 if (const TargetRegisterClass *CommonRC =
8539 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8540 // Instead of creating a copy where src and dst are the same register
8541 // class, we just replace all uses of dst with src. These kinds of
8542 // copies interfere with the heuristics MachineSink uses to decide
8543 // whether or not to split a critical edge. Since the pass assumes
8544 // that copies will end up as machine instructions and not be
8545 // eliminated.
8546 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8547 MRI.replaceRegWith(DstReg, NewDstReg);
8548 MRI.clearKillFlags(NewDstReg);
8549 Inst.getOperand(0).setReg(DstReg);
8550
8551 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8552 llvm_unreachable("failed to constrain register");
8553
8554 Inst.eraseFromParent();
8555
8556 for (MachineOperand &UseMO :
8557 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8558 MachineInstr &UseMI = *UseMO.getParent();
8559
8560 // Legalize t16 operands since replaceReg is called after
8561 // addUsersToVALU.
8563
8564 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8565 if (const TargetRegisterClass *OpRC =
8566 getRegClass(UseMI.getDesc(), OpIdx))
8567 MRI.constrainRegClass(NewDstReg, OpRC);
8568 }
8569
8570 return;
8571 }
8572 }
8573
8574 // If this is a v2s copy between 16bit and 32bit reg,
8575 // replace vgpr copy to reg_sequence/extract_subreg
8576 // This can be remove after we have sgpr16 in place
8577 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8578 Inst.getOperand(1).getReg().isVirtual() &&
8579 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8580 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8581 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8582 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8583 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8584 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8585 get(AMDGPU::IMPLICIT_DEF), Undef);
8586 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8587 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8588 .addReg(Inst.getOperand(1).getReg())
8589 .addImm(AMDGPU::lo16)
8590 .addReg(Undef)
8591 .addImm(AMDGPU::hi16);
8592 Inst.eraseFromParent();
8593 MRI.replaceRegWith(DstReg, NewDstReg);
8594 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8595 return;
8596 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8597 AMDGPU::lo16)) {
8598 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8599 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8600 MRI.replaceRegWith(DstReg, NewDstReg);
8601 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8602 return;
8603 }
8604 }
8605
8606 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8607 MRI.replaceRegWith(DstReg, NewDstReg);
8608 legalizeOperands(Inst, MDT);
8609 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8610 return;
8611 }
8612
8613 // Use the new VALU Opcode.
8614 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8615 .setMIFlags(Inst.getFlags());
8616 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8617 // Intersperse VOP3 modifiers among the SALU operands.
8618 NewInstr->addOperand(Inst.getOperand(0));
8619 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8620 AMDGPU::OpName::src0_modifiers) >= 0)
8621 NewInstr.addImm(0);
8622 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8623 const MachineOperand &Src = Inst.getOperand(1);
8624 NewInstr->addOperand(Src);
8625 }
8626
8627 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8628 // We are converting these to a BFE, so we need to add the missing
8629 // operands for the size and offset.
8630 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8631 NewInstr.addImm(0);
8632 NewInstr.addImm(Size);
8633 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8634 // The VALU version adds the second operand to the result, so insert an
8635 // extra 0 operand.
8636 NewInstr.addImm(0);
8637 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8638 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8639 // If we need to move this to VGPRs, we need to unpack the second
8640 // operand back into the 2 separate ones for bit offset and width.
8641 assert(OffsetWidthOp.isImm() &&
8642 "Scalar BFE is only implemented for constant width and offset");
8643 uint32_t Imm = OffsetWidthOp.getImm();
8644
8645 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8646 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8647 NewInstr.addImm(Offset);
8648 NewInstr.addImm(BitWidth);
8649 } else {
8650 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8651 AMDGPU::OpName::src1_modifiers) >= 0)
8652 NewInstr.addImm(0);
8653 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8654 NewInstr->addOperand(Inst.getOperand(2));
8655 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8656 AMDGPU::OpName::src2_modifiers) >= 0)
8657 NewInstr.addImm(0);
8658 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8659 NewInstr->addOperand(Inst.getOperand(3));
8660 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8661 NewInstr.addImm(0);
8662 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8663 NewInstr.addImm(0);
8664 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8665 NewInstr.addImm(0);
8666 }
8667 } else {
8668 // Just copy the SALU operands.
8669 for (const MachineOperand &Op : Inst.explicit_operands())
8670 NewInstr->addOperand(Op);
8671 }
8672
8673 // Remove any references to SCC. Vector instructions can't read from it, and
8674 // We're just about to add the implicit use / defs of VCC, and we don't want
8675 // both.
8676 for (MachineOperand &Op : Inst.implicit_operands()) {
8677 if (Op.getReg() == AMDGPU::SCC) {
8678 // Only propagate through live-def of SCC.
8679 if (Op.isDef() && !Op.isDead())
8680 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8681 if (Op.isUse())
8682 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8683 }
8684 }
8685 Inst.eraseFromParent();
8686 Register NewDstReg;
8687 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8688 Register DstReg = NewInstr->getOperand(0).getReg();
8689 assert(DstReg.isVirtual());
8690 // Update the destination register class.
8691 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8692 assert(NewDstRC);
8693 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8694 MRI.replaceRegWith(DstReg, NewDstReg);
8695 }
8696 fixImplicitOperands(*NewInstr);
8697
8698 legalizeOperandsVALUt16(*NewInstr, MRI);
8699
8700 // Legalize the operands
8701 legalizeOperands(*NewInstr, MDT);
8702 if (NewDstReg)
8703 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8704}
8705
8706// Add/sub require special handling to deal with carry outs.
8707std::pair<bool, MachineBasicBlock *>
8708SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8709 MachineDominatorTree *MDT) const {
8710 if (ST.hasAddNoCarryInsts()) {
8711 // Assume there is no user of scc since we don't select this in that case.
8712 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8713 // is used.
8714
8715 MachineBasicBlock &MBB = *Inst.getParent();
8716 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8717
8718 Register OldDstReg = Inst.getOperand(0).getReg();
8719 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8720
8721 unsigned Opc = Inst.getOpcode();
8722 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8723
8724 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8725 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8726
8727 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8728 Inst.removeOperand(3);
8729
8730 Inst.setDesc(get(NewOpc));
8731 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8732 Inst.addImplicitDefUseOperands(*MBB.getParent());
8733 MRI.replaceRegWith(OldDstReg, ResultReg);
8734 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8735
8736 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8737 return std::pair(true, NewBB);
8738 }
8739
8740 return std::pair(false, nullptr);
8741}
8742
8743void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8744 MachineDominatorTree *MDT) const {
8745
8746 MachineBasicBlock &MBB = *Inst.getParent();
8747 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8748 MachineBasicBlock::iterator MII = Inst;
8749 const DebugLoc &DL = Inst.getDebugLoc();
8750
8751 MachineOperand &Dest = Inst.getOperand(0);
8752 MachineOperand &Src0 = Inst.getOperand(1);
8753 MachineOperand &Src1 = Inst.getOperand(2);
8754 MachineOperand &Cond = Inst.getOperand(3);
8755
8756 Register CondReg = Cond.getReg();
8757 bool IsSCC = (CondReg == AMDGPU::SCC);
8758
8759 // If this is a trivial select where the condition is effectively not SCC
8760 // (CondReg is a source of copy to SCC), then the select is semantically
8761 // equivalent to copying CondReg. Hence, there is no need to create
8762 // V_CNDMASK, we can just use that and bail out.
8763 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8764 (Src1.getImm() == 0)) {
8765 MRI.replaceRegWith(Dest.getReg(), CondReg);
8766 return;
8767 }
8768
8769 Register NewCondReg = CondReg;
8770 if (IsSCC) {
8771 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8772 NewCondReg = MRI.createVirtualRegister(TC);
8773
8774 // Now look for the closest SCC def if it is a copy
8775 // replacing the CondReg with the COPY source register
8776 bool CopyFound = false;
8777 for (MachineInstr &CandI :
8779 Inst.getParent()->rend())) {
8780 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8781 -1) {
8782 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8783 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8784 .addReg(CandI.getOperand(1).getReg());
8785 CopyFound = true;
8786 }
8787 break;
8788 }
8789 }
8790 if (!CopyFound) {
8791 // SCC def is not a copy
8792 // Insert a trivial select instead of creating a copy, because a copy from
8793 // SCC would semantically mean just copying a single bit, but we may need
8794 // the result to be a vector condition mask that needs preserving.
8795 unsigned Opcode =
8796 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8797 auto NewSelect =
8798 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8799 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8800 }
8801 }
8802
8803 Register NewDestReg = MRI.createVirtualRegister(
8804 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8805 MachineInstr *NewInst;
8806 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8807 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8808 .addImm(0)
8809 .add(Src1) // False
8810 .addImm(0)
8811 .add(Src0) // True
8812 .addReg(NewCondReg);
8813 } else {
8814 NewInst =
8815 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8816 .add(Src1) // False
8817 .add(Src0) // True
8818 .addReg(NewCondReg);
8819 }
8820 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8821 legalizeOperands(*NewInst, MDT);
8822 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8823}
8824
8825void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8826 MachineInstr &Inst) const {
8827 MachineBasicBlock &MBB = *Inst.getParent();
8828 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8829 MachineBasicBlock::iterator MII = Inst;
8830 const DebugLoc &DL = Inst.getDebugLoc();
8831
8832 MachineOperand &Dest = Inst.getOperand(0);
8833 MachineOperand &Src = Inst.getOperand(1);
8834 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8835 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8836
8837 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8838 : AMDGPU::V_SUB_CO_U32_e32;
8839
8840 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8841 .addImm(0)
8842 .addReg(Src.getReg());
8843
8844 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8845 .addReg(Src.getReg())
8846 .addReg(TmpReg);
8847
8848 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8849 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8850}
8851
8852void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8853 MachineInstr &Inst) const {
8854 MachineBasicBlock &MBB = *Inst.getParent();
8855 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8856 MachineBasicBlock::iterator MII = Inst;
8857 const DebugLoc &DL = Inst.getDebugLoc();
8858
8859 MachineOperand &Dest = Inst.getOperand(0);
8860 MachineOperand &Src1 = Inst.getOperand(1);
8861 MachineOperand &Src2 = Inst.getOperand(2);
8862 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8863 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8864 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8865
8866 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8867 : AMDGPU::V_SUB_CO_U32_e32;
8868
8869 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8870 .addReg(Src1.getReg())
8871 .addReg(Src2.getReg());
8872
8873 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8874
8875 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8876 .addReg(SubResultReg)
8877 .addReg(TmpReg);
8878
8879 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8880 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8881}
8882
8883void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8884 MachineInstr &Inst) const {
8885 MachineBasicBlock &MBB = *Inst.getParent();
8886 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8887 MachineBasicBlock::iterator MII = Inst;
8888 const DebugLoc &DL = Inst.getDebugLoc();
8889
8890 MachineOperand &Dest = Inst.getOperand(0);
8891 MachineOperand &Src0 = Inst.getOperand(1);
8892 MachineOperand &Src1 = Inst.getOperand(2);
8893
8894 if (ST.hasDLInsts()) {
8895 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8896 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8897 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8898
8899 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8900 .add(Src0)
8901 .add(Src1);
8902
8903 MRI.replaceRegWith(Dest.getReg(), NewDest);
8904 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8905 } else {
8906 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8907 // invert either source and then perform the XOR. If either source is a
8908 // scalar register, then we can leave the inversion on the scalar unit to
8909 // achieve a better distribution of scalar and vector instructions.
8910 bool Src0IsSGPR = Src0.isReg() &&
8911 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8912 bool Src1IsSGPR = Src1.isReg() &&
8913 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8914 MachineInstr *Xor;
8915 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8916 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8917
8918 // Build a pair of scalar instructions and add them to the work list.
8919 // The next iteration over the work list will lower these to the vector
8920 // unit as necessary.
8921 if (Src0IsSGPR) {
8922 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8923 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8924 .addReg(Temp)
8925 .add(Src1);
8926 } else if (Src1IsSGPR) {
8927 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8928 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8929 .add(Src0)
8930 .addReg(Temp);
8931 } else {
8932 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8933 .add(Src0)
8934 .add(Src1);
8935 MachineInstr *Not =
8936 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8937 Worklist.insert(Not);
8938 }
8939
8940 MRI.replaceRegWith(Dest.getReg(), NewDest);
8941
8942 Worklist.insert(Xor);
8943
8944 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8945 }
8946}
8947
8948void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8949 MachineInstr &Inst,
8950 unsigned Opcode) const {
8951 MachineBasicBlock &MBB = *Inst.getParent();
8952 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8953 MachineBasicBlock::iterator MII = Inst;
8954 const DebugLoc &DL = Inst.getDebugLoc();
8955
8956 MachineOperand &Dest = Inst.getOperand(0);
8957 MachineOperand &Src0 = Inst.getOperand(1);
8958 MachineOperand &Src1 = Inst.getOperand(2);
8959
8960 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8961 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8962
8963 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8964 .add(Src0)
8965 .add(Src1);
8966
8967 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8968 .addReg(Interm);
8969
8970 Worklist.insert(&Op);
8971 Worklist.insert(&Not);
8972
8973 MRI.replaceRegWith(Dest.getReg(), NewDest);
8974 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8975}
8976
8977void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8978 MachineInstr &Inst,
8979 unsigned Opcode) const {
8980 MachineBasicBlock &MBB = *Inst.getParent();
8981 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8982 MachineBasicBlock::iterator MII = Inst;
8983 const DebugLoc &DL = Inst.getDebugLoc();
8984
8985 MachineOperand &Dest = Inst.getOperand(0);
8986 MachineOperand &Src0 = Inst.getOperand(1);
8987 MachineOperand &Src1 = Inst.getOperand(2);
8988
8989 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8990 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8991
8992 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8993 .add(Src1);
8994
8995 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8996 .add(Src0)
8997 .addReg(Interm);
8998
8999 Worklist.insert(&Not);
9000 Worklist.insert(&Op);
9001
9002 MRI.replaceRegWith(Dest.getReg(), NewDest);
9003 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
9004}
9005
9006void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
9007 MachineInstr &Inst, unsigned Opcode,
9008 bool Swap) const {
9009 MachineBasicBlock &MBB = *Inst.getParent();
9010 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9011
9012 MachineOperand &Dest = Inst.getOperand(0);
9013 MachineOperand &Src0 = Inst.getOperand(1);
9014 const DebugLoc &DL = Inst.getDebugLoc();
9015
9016 MachineBasicBlock::iterator MII = Inst;
9017
9018 const MCInstrDesc &InstDesc = get(Opcode);
9019 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9020 MRI.getRegClass(Src0.getReg()) :
9021 &AMDGPU::SGPR_32RegClass;
9022
9023 const TargetRegisterClass *Src0SubRC =
9024 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9025
9026 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9027 AMDGPU::sub0, Src0SubRC);
9028
9029 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9030 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9031 const TargetRegisterClass *NewDestSubRC =
9032 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9033
9034 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9035 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
9036
9037 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9038 AMDGPU::sub1, Src0SubRC);
9039
9040 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9041 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
9042
9043 if (Swap)
9044 std::swap(DestSub0, DestSub1);
9045
9046 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9047 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9048 .addReg(DestSub0)
9049 .addImm(AMDGPU::sub0)
9050 .addReg(DestSub1)
9051 .addImm(AMDGPU::sub1);
9052
9053 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9054
9055 Worklist.insert(&LoHalf);
9056 Worklist.insert(&HiHalf);
9057
9058 // We don't need to legalizeOperands here because for a single operand, src0
9059 // will support any kind of input.
9060
9061 // Move all users of this moved value.
9062 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9063}
9064
9065// There is not a vector equivalent of s_mul_u64. For this reason, we need to
9066// split the s_mul_u64 in 32-bit vector multiplications.
9067void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
9068 MachineInstr &Inst,
9069 MachineDominatorTree *MDT) const {
9070 MachineBasicBlock &MBB = *Inst.getParent();
9071 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9072
9073 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9074 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9075 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9076
9077 MachineOperand &Dest = Inst.getOperand(0);
9078 MachineOperand &Src0 = Inst.getOperand(1);
9079 MachineOperand &Src1 = Inst.getOperand(2);
9080 const DebugLoc &DL = Inst.getDebugLoc();
9081 MachineBasicBlock::iterator MII = Inst;
9082
9083 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9084 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9085 const TargetRegisterClass *Src0SubRC =
9086 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9087 if (RI.isSGPRClass(Src0SubRC))
9088 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9089 const TargetRegisterClass *Src1SubRC =
9090 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9091 if (RI.isSGPRClass(Src1SubRC))
9092 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9093
9094 // First, we extract the low 32-bit and high 32-bit values from each of the
9095 // operands.
9096 MachineOperand Op0L =
9097 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9098 MachineOperand Op1L =
9099 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9100 MachineOperand Op0H =
9101 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
9102 MachineOperand Op1H =
9103 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
9104
9105 // The multilication is done as follows:
9106 //
9107 // Op1H Op1L
9108 // * Op0H Op0L
9109 // --------------------
9110 // Op1H*Op0L Op1L*Op0L
9111 // + Op1H*Op0H Op1L*Op0H
9112 // -----------------------------------------
9113 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9114 //
9115 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
9116 // value and that would overflow.
9117 // The low 32-bit value is Op1L*Op0L.
9118 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
9119
9120 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9121 MachineInstr *Op1L_Op0H =
9122 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
9123 .add(Op1L)
9124 .add(Op0H);
9125
9126 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9127 MachineInstr *Op1H_Op0L =
9128 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
9129 .add(Op1H)
9130 .add(Op0L);
9131
9132 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9133 MachineInstr *Carry =
9134 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
9135 .add(Op1L)
9136 .add(Op0L);
9137
9138 MachineInstr *LoHalf =
9139 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9140 .add(Op1L)
9141 .add(Op0L);
9142
9143 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9144 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
9145 .addReg(Op1L_Op0H_Reg)
9146 .addReg(Op1H_Op0L_Reg);
9147
9148 MachineInstr *HiHalf =
9149 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
9150 .addReg(AddReg)
9151 .addReg(CarryReg);
9152
9153 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9154 .addReg(DestSub0)
9155 .addImm(AMDGPU::sub0)
9156 .addReg(DestSub1)
9157 .addImm(AMDGPU::sub1);
9158
9159 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9160
9161 // Try to legalize the operands in case we need to swap the order to keep it
9162 // valid.
9163 legalizeOperands(*Op1L_Op0H, MDT);
9164 legalizeOperands(*Op1H_Op0L, MDT);
9165 legalizeOperands(*Carry, MDT);
9166 legalizeOperands(*LoHalf, MDT);
9167 legalizeOperands(*Add, MDT);
9168 legalizeOperands(*HiHalf, MDT);
9169
9170 // Move all users of this moved value.
9171 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9172}
9173
9174// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
9175// multiplications.
9176void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9177 MachineInstr &Inst,
9178 MachineDominatorTree *MDT) const {
9179 MachineBasicBlock &MBB = *Inst.getParent();
9180 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9181
9182 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9183 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9184 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9185
9186 MachineOperand &Dest = Inst.getOperand(0);
9187 MachineOperand &Src0 = Inst.getOperand(1);
9188 MachineOperand &Src1 = Inst.getOperand(2);
9189 const DebugLoc &DL = Inst.getDebugLoc();
9190 MachineBasicBlock::iterator MII = Inst;
9191
9192 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9193 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9194 const TargetRegisterClass *Src0SubRC =
9195 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9196 if (RI.isSGPRClass(Src0SubRC))
9197 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9198 const TargetRegisterClass *Src1SubRC =
9199 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9200 if (RI.isSGPRClass(Src1SubRC))
9201 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9202
9203 // First, we extract the low 32-bit and high 32-bit values from each of the
9204 // operands.
9205 MachineOperand Op0L =
9206 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9207 MachineOperand Op1L =
9208 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9209
9210 unsigned Opc = Inst.getOpcode();
9211 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9212 ? AMDGPU::V_MUL_HI_U32_e64
9213 : AMDGPU::V_MUL_HI_I32_e64;
9214 MachineInstr *HiHalf =
9215 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9216
9217 MachineInstr *LoHalf =
9218 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9219 .add(Op1L)
9220 .add(Op0L);
9221
9222 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9223 .addReg(DestSub0)
9224 .addImm(AMDGPU::sub0)
9225 .addReg(DestSub1)
9226 .addImm(AMDGPU::sub1);
9227
9228 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9229
9230 // Try to legalize the operands in case we need to swap the order to keep it
9231 // valid.
9232 legalizeOperands(*HiHalf, MDT);
9233 legalizeOperands(*LoHalf, MDT);
9234
9235 // Move all users of this moved value.
9236 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9237}
9238
9239void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9240 MachineInstr &Inst, unsigned Opcode,
9241 MachineDominatorTree *MDT) const {
9242 MachineBasicBlock &MBB = *Inst.getParent();
9243 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9244
9245 MachineOperand &Dest = Inst.getOperand(0);
9246 MachineOperand &Src0 = Inst.getOperand(1);
9247 MachineOperand &Src1 = Inst.getOperand(2);
9248 const DebugLoc &DL = Inst.getDebugLoc();
9249
9250 MachineBasicBlock::iterator MII = Inst;
9251
9252 const MCInstrDesc &InstDesc = get(Opcode);
9253 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9254 MRI.getRegClass(Src0.getReg()) :
9255 &AMDGPU::SGPR_32RegClass;
9256
9257 const TargetRegisterClass *Src0SubRC =
9258 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9259 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9260 MRI.getRegClass(Src1.getReg()) :
9261 &AMDGPU::SGPR_32RegClass;
9262
9263 const TargetRegisterClass *Src1SubRC =
9264 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9265
9266 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9267 AMDGPU::sub0, Src0SubRC);
9268 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9269 AMDGPU::sub0, Src1SubRC);
9270 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9271 AMDGPU::sub1, Src0SubRC);
9272 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9273 AMDGPU::sub1, Src1SubRC);
9274
9275 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9276 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9277 const TargetRegisterClass *NewDestSubRC =
9278 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9279
9280 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9281 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9282 .add(SrcReg0Sub0)
9283 .add(SrcReg1Sub0);
9284
9285 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9286 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9287 .add(SrcReg0Sub1)
9288 .add(SrcReg1Sub1);
9289
9290 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9291 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9292 .addReg(DestSub0)
9293 .addImm(AMDGPU::sub0)
9294 .addReg(DestSub1)
9295 .addImm(AMDGPU::sub1);
9296
9297 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9298
9299 Worklist.insert(&LoHalf);
9300 Worklist.insert(&HiHalf);
9301
9302 // Move all users of this moved value.
9303 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9304}
9305
9306void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9307 MachineInstr &Inst,
9308 MachineDominatorTree *MDT) const {
9309 MachineBasicBlock &MBB = *Inst.getParent();
9310 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9311
9312 MachineOperand &Dest = Inst.getOperand(0);
9313 MachineOperand &Src0 = Inst.getOperand(1);
9314 MachineOperand &Src1 = Inst.getOperand(2);
9315 const DebugLoc &DL = Inst.getDebugLoc();
9316
9317 MachineBasicBlock::iterator MII = Inst;
9318
9319 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9320
9321 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9322
9323 MachineOperand* Op0;
9324 MachineOperand* Op1;
9325
9326 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9327 Op0 = &Src0;
9328 Op1 = &Src1;
9329 } else {
9330 Op0 = &Src1;
9331 Op1 = &Src0;
9332 }
9333
9334 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9335 .add(*Op0);
9336
9337 Register NewDest = MRI.createVirtualRegister(DestRC);
9338
9339 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9340 .addReg(Interm)
9341 .add(*Op1);
9342
9343 MRI.replaceRegWith(Dest.getReg(), NewDest);
9344
9345 Worklist.insert(&Xor);
9346}
9347
9348void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9349 MachineInstr &Inst) const {
9350 MachineBasicBlock &MBB = *Inst.getParent();
9351 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9352
9353 MachineBasicBlock::iterator MII = Inst;
9354 const DebugLoc &DL = Inst.getDebugLoc();
9355
9356 MachineOperand &Dest = Inst.getOperand(0);
9357 MachineOperand &Src = Inst.getOperand(1);
9358
9359 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9360 const TargetRegisterClass *SrcRC = Src.isReg() ?
9361 MRI.getRegClass(Src.getReg()) :
9362 &AMDGPU::SGPR_32RegClass;
9363
9364 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9365 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9366
9367 const TargetRegisterClass *SrcSubRC =
9368 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9369
9370 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9371 AMDGPU::sub0, SrcSubRC);
9372 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9373 AMDGPU::sub1, SrcSubRC);
9374
9375 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9376
9377 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9378
9379 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9380
9381 // We don't need to legalize operands here. src0 for either instruction can be
9382 // an SGPR, and the second input is unused or determined here.
9383 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9384}
9385
9386void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9387 MachineInstr &Inst) const {
9388 MachineBasicBlock &MBB = *Inst.getParent();
9389 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9390 MachineBasicBlock::iterator MII = Inst;
9391 const DebugLoc &DL = Inst.getDebugLoc();
9392
9393 MachineOperand &Dest = Inst.getOperand(0);
9394 uint32_t Imm = Inst.getOperand(2).getImm();
9395 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9396 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9397
9398 (void) Offset;
9399
9400 // Only sext_inreg cases handled.
9401 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9402 Offset == 0 && "Not implemented");
9403
9404 if (BitWidth < 32) {
9405 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9406 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9407 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9408
9409 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9410 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9411 .addImm(0)
9412 .addImm(BitWidth);
9413
9414 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9415 .addImm(31)
9416 .addReg(MidRegLo);
9417
9418 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9419 .addReg(MidRegLo)
9420 .addImm(AMDGPU::sub0)
9421 .addReg(MidRegHi)
9422 .addImm(AMDGPU::sub1);
9423
9424 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9425 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9426 return;
9427 }
9428
9429 MachineOperand &Src = Inst.getOperand(1);
9430 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9431 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9432
9433 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9434 .addImm(31)
9435 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9436
9437 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9438 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9439 .addImm(AMDGPU::sub0)
9440 .addReg(TmpReg)
9441 .addImm(AMDGPU::sub1);
9442
9443 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9444 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9445}
9446
9447void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9448 MachineInstr &Inst, unsigned Opcode,
9449 MachineDominatorTree *MDT) const {
9450 // (S_FLBIT_I32_B64 hi:lo) ->
9451 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9452 // (S_FF1_I32_B64 hi:lo) ->
9453 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9454
9455 MachineBasicBlock &MBB = *Inst.getParent();
9456 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9457 MachineBasicBlock::iterator MII = Inst;
9458 const DebugLoc &DL = Inst.getDebugLoc();
9459
9460 MachineOperand &Dest = Inst.getOperand(0);
9461 MachineOperand &Src = Inst.getOperand(1);
9462
9463 const MCInstrDesc &InstDesc = get(Opcode);
9464
9465 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9466 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9467 : AMDGPU::V_ADD_CO_U32_e32;
9468
9469 const TargetRegisterClass *SrcRC =
9470 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9471 const TargetRegisterClass *SrcSubRC =
9472 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9473
9474 MachineOperand SrcRegSub0 =
9475 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9476 MachineOperand SrcRegSub1 =
9477 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9478
9479 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9480 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9481 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9482 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9483
9484 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9485
9486 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9487
9488 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9489 .addReg(IsCtlz ? MidReg1 : MidReg2)
9490 .addImm(32)
9491 .addImm(1); // enable clamp
9492
9493 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9494 .addReg(MidReg3)
9495 .addReg(IsCtlz ? MidReg2 : MidReg1);
9496
9497 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9498
9499 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9500}
9501
9502void SIInstrInfo::addUsersToMoveToVALUWorklist(
9503 Register DstReg, MachineRegisterInfo &MRI,
9504 SIInstrWorklist &Worklist) const {
9505 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9506 MachineInstr &UseMI = *MO.getParent();
9507
9508 unsigned OpNo = 0;
9509
9510 switch (UseMI.getOpcode()) {
9511 case AMDGPU::COPY:
9512 case AMDGPU::WQM:
9513 case AMDGPU::SOFT_WQM:
9514 case AMDGPU::STRICT_WWM:
9515 case AMDGPU::STRICT_WQM:
9516 case AMDGPU::REG_SEQUENCE:
9517 case AMDGPU::PHI:
9518 case AMDGPU::INSERT_SUBREG:
9519 break;
9520 default:
9521 OpNo = MO.getOperandNo();
9522 break;
9523 }
9524
9525 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9526 MRI.constrainRegClass(DstReg, OpRC);
9527
9528 if (!RI.hasVectorRegisters(OpRC))
9529 Worklist.insert(&UseMI);
9530 else
9531 // Legalization could change user list.
9532 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9533 }
9534}
9535
9536void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9538 MachineInstr &Inst) const {
9539 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9540 MachineBasicBlock *MBB = Inst.getParent();
9541 MachineOperand &Src0 = Inst.getOperand(1);
9542 MachineOperand &Src1 = Inst.getOperand(2);
9543 const DebugLoc &DL = Inst.getDebugLoc();
9544
9545 if (ST.useRealTrue16Insts()) {
9546 Register SrcReg0, SrcReg1;
9547 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9548 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9549 BuildMI(*MBB, Inst, DL,
9550 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9551 .add(Src0);
9552 } else {
9553 SrcReg0 = Src0.getReg();
9554 }
9555
9556 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9557 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9558 BuildMI(*MBB, Inst, DL,
9559 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9560 .add(Src1);
9561 } else {
9562 SrcReg1 = Src1.getReg();
9563 }
9564
9565 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9566 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9567
9568 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9569 switch (Inst.getOpcode()) {
9570 case AMDGPU::S_PACK_LL_B32_B16:
9571 NewMI
9572 .addReg(SrcReg0, {},
9573 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9574 .addImm(AMDGPU::lo16)
9575 .addReg(SrcReg1, {},
9576 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9577 .addImm(AMDGPU::hi16);
9578 break;
9579 case AMDGPU::S_PACK_LH_B32_B16:
9580 NewMI
9581 .addReg(SrcReg0, {},
9582 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9583 .addImm(AMDGPU::lo16)
9584 .addReg(SrcReg1, {}, AMDGPU::hi16)
9585 .addImm(AMDGPU::hi16);
9586 break;
9587 case AMDGPU::S_PACK_HL_B32_B16:
9588 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9589 .addImm(AMDGPU::lo16)
9590 .addReg(SrcReg1, {},
9591 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9592 .addImm(AMDGPU::hi16);
9593 break;
9594 case AMDGPU::S_PACK_HH_B32_B16:
9595 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9596 .addImm(AMDGPU::lo16)
9597 .addReg(SrcReg1, {}, AMDGPU::hi16)
9598 .addImm(AMDGPU::hi16);
9599 break;
9600 default:
9601 llvm_unreachable("unhandled s_pack_* instruction");
9602 }
9603
9604 MachineOperand &Dest = Inst.getOperand(0);
9605 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9606 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9607 return;
9608 }
9609
9610 switch (Inst.getOpcode()) {
9611 case AMDGPU::S_PACK_LL_B32_B16: {
9612 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9613 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9614
9615 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9616 // 0.
9617 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9618 .addImm(0xffff);
9619
9620 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9621 .addReg(ImmReg, RegState::Kill)
9622 .add(Src0);
9623
9624 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9625 .add(Src1)
9626 .addImm(16)
9627 .addReg(TmpReg, RegState::Kill);
9628 break;
9629 }
9630 case AMDGPU::S_PACK_LH_B32_B16: {
9631 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9632 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9633 .addImm(0xffff);
9634 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9635 .addReg(ImmReg, RegState::Kill)
9636 .add(Src0)
9637 .add(Src1);
9638 break;
9639 }
9640 case AMDGPU::S_PACK_HL_B32_B16: {
9641 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9642 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9643 .addImm(16)
9644 .add(Src0);
9645 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9646 .add(Src1)
9647 .addImm(16)
9648 .addReg(TmpReg, RegState::Kill);
9649 break;
9650 }
9651 case AMDGPU::S_PACK_HH_B32_B16: {
9652 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9653 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9654 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9655 .addImm(16)
9656 .add(Src0);
9657 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9658 .addImm(0xffff0000);
9659 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9660 .add(Src1)
9661 .addReg(ImmReg, RegState::Kill)
9662 .addReg(TmpReg, RegState::Kill);
9663 break;
9664 }
9665 default:
9666 llvm_unreachable("unhandled s_pack_* instruction");
9667 }
9668
9669 MachineOperand &Dest = Inst.getOperand(0);
9670 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9671 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9672}
9673
9674void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9675 MachineInstr &SCCDefInst,
9676 SIInstrWorklist &Worklist,
9677 Register NewCond) const {
9678
9679 // Ensure that def inst defines SCC, which is still live.
9680 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9681 !Op.isDead() && Op.getParent() == &SCCDefInst);
9682 SmallVector<MachineInstr *, 4> CopyToDelete;
9683 // This assumes that all the users of SCC are in the same block
9684 // as the SCC def.
9685 for (MachineInstr &MI : // Skip the def inst itself.
9686 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9687 SCCDefInst.getParent()->end())) {
9688 // Check if SCC is used first.
9689 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9690 if (SCCIdx != -1) {
9691 if (MI.isCopy()) {
9692 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9693 Register DestReg = MI.getOperand(0).getReg();
9694
9695 MRI.replaceRegWith(DestReg, NewCond);
9696 CopyToDelete.push_back(&MI);
9697 } else {
9698
9699 if (NewCond.isValid())
9700 MI.getOperand(SCCIdx).setReg(NewCond);
9701
9702 Worklist.insert(&MI);
9703 }
9704 }
9705 // Exit if we find another SCC def.
9706 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9707 break;
9708 }
9709 for (auto &Copy : CopyToDelete)
9710 Copy->eraseFromParent();
9711}
9712
9713// Instructions that use SCC may be converted to VALU instructions. When that
9714// happens, the SCC register is changed to VCC_LO. The instruction that defines
9715// SCC must be changed to an instruction that defines VCC. This function makes
9716// sure that the instruction that defines SCC is added to the moveToVALU
9717// worklist.
9718void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9719 SIInstrWorklist &Worklist) const {
9720 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9721 // then there is nothing to do because the defining instruction has been
9722 // converted to a VALU already. If SCC then that instruction needs to be
9723 // converted to a VALU.
9724 for (MachineInstr &MI :
9725 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9726 SCCUseInst->getParent()->rend())) {
9727 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9728 break;
9729 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9730 Worklist.insert(&MI);
9731 break;
9732 }
9733 }
9734}
9735
9736const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9737 const MachineInstr &Inst) const {
9738 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9739
9740 switch (Inst.getOpcode()) {
9741 // For target instructions, getOpRegClass just returns the virtual register
9742 // class associated with the operand, so we need to find an equivalent VGPR
9743 // register class in order to move the instruction to the VALU.
9744 case AMDGPU::COPY:
9745 case AMDGPU::PHI:
9746 case AMDGPU::REG_SEQUENCE:
9747 case AMDGPU::INSERT_SUBREG:
9748 case AMDGPU::WQM:
9749 case AMDGPU::SOFT_WQM:
9750 case AMDGPU::STRICT_WWM:
9751 case AMDGPU::STRICT_WQM: {
9752 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9753 if (RI.isAGPRClass(SrcRC)) {
9754 if (RI.isAGPRClass(NewDstRC))
9755 return nullptr;
9756
9757 switch (Inst.getOpcode()) {
9758 case AMDGPU::PHI:
9759 case AMDGPU::REG_SEQUENCE:
9760 case AMDGPU::INSERT_SUBREG:
9761 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9762 break;
9763 default:
9764 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9765 }
9766
9767 if (!NewDstRC)
9768 return nullptr;
9769 } else {
9770 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9771 return nullptr;
9772
9773 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9774 if (!NewDstRC)
9775 return nullptr;
9776 }
9777
9778 return NewDstRC;
9779 }
9780 default:
9781 return NewDstRC;
9782 }
9783}
9784
9785// Find the one SGPR operand we are allowed to use.
9786Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9787 int OpIndices[3]) const {
9788 const MCInstrDesc &Desc = MI.getDesc();
9789
9790 // Find the one SGPR operand we are allowed to use.
9791 //
9792 // First we need to consider the instruction's operand requirements before
9793 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9794 // of VCC, but we are still bound by the constant bus requirement to only use
9795 // one.
9796 //
9797 // If the operand's class is an SGPR, we can never move it.
9798
9799 Register SGPRReg = findImplicitSGPRRead(MI);
9800 if (SGPRReg)
9801 return SGPRReg;
9802
9803 Register UsedSGPRs[3] = {Register()};
9804 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9805
9806 for (unsigned i = 0; i < 3; ++i) {
9807 int Idx = OpIndices[i];
9808 if (Idx == -1)
9809 break;
9810
9811 const MachineOperand &MO = MI.getOperand(Idx);
9812 if (!MO.isReg())
9813 continue;
9814
9815 // Is this operand statically required to be an SGPR based on the operand
9816 // constraints?
9817 const TargetRegisterClass *OpRC =
9818 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9819 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9820 if (IsRequiredSGPR)
9821 return MO.getReg();
9822
9823 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9824 Register Reg = MO.getReg();
9825 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9826 if (RI.isSGPRClass(RegRC))
9827 UsedSGPRs[i] = Reg;
9828 }
9829
9830 // We don't have a required SGPR operand, so we have a bit more freedom in
9831 // selecting operands to move.
9832
9833 // Try to select the most used SGPR. If an SGPR is equal to one of the
9834 // others, we choose that.
9835 //
9836 // e.g.
9837 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9838 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9839
9840 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9841 // prefer those.
9842
9843 if (UsedSGPRs[0]) {
9844 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9845 SGPRReg = UsedSGPRs[0];
9846 }
9847
9848 if (!SGPRReg && UsedSGPRs[1]) {
9849 if (UsedSGPRs[1] == UsedSGPRs[2])
9850 SGPRReg = UsedSGPRs[1];
9851 }
9852
9853 return SGPRReg;
9854}
9855
9857 AMDGPU::OpName OperandName) const {
9858 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9859 return nullptr;
9860
9861 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9862 if (Idx == -1)
9863 return nullptr;
9864
9865 return &MI.getOperand(Idx);
9866}
9867
9869 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9870 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9873 return (Format << 44) |
9874 (1ULL << 56) | // RESOURCE_LEVEL = 1
9875 (3ULL << 60); // OOB_SELECT = 3
9876 }
9877
9878 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9879 if (ST.isAmdHsaOS()) {
9880 // Set ATC = 1. GFX9 doesn't have this bit.
9881 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9882 RsrcDataFormat |= (1ULL << 56);
9883
9884 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9885 // BTW, it disables TC L2 and therefore decreases performance.
9886 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9887 RsrcDataFormat |= (2ULL << 59);
9888 }
9889
9890 return RsrcDataFormat;
9891}
9892
9896 0xffffffff; // Size;
9897
9898 // GFX9 doesn't have ELEMENT_SIZE.
9899 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9900 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9901 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9902 }
9903
9904 // IndexStride = 64 / 32.
9905 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9906 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9907
9908 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9909 // Clear them unless we want a huge stride.
9910 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9911 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9912 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9913
9914 return Rsrc23;
9915}
9916
9918 unsigned Opc = MI.getOpcode();
9919
9920 return isSMRD(Opc);
9921}
9922
9924 return get(Opc).mayLoad() &&
9925 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9926}
9927
9929 TypeSize &MemBytes) const {
9930 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9931 if (!Addr || !Addr->isFI())
9932 return Register();
9933
9934 assert(!MI.memoperands_empty() &&
9935 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9936
9937 FrameIndex = Addr->getIndex();
9938
9939 int VDataIdx =
9940 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9941 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9942 return MI.getOperand(VDataIdx).getReg();
9943}
9944
9946 TypeSize &MemBytes) const {
9947 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9948 assert(Addr && Addr->isFI());
9949 FrameIndex = Addr->getIndex();
9950
9951 int DataIdx =
9952 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9953 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9954 return MI.getOperand(DataIdx).getReg();
9955}
9956
9958 int &FrameIndex,
9959 TypeSize &MemBytes) const {
9960 if (!MI.mayLoad())
9961 return Register();
9962
9963 if (isMUBUF(MI) || isVGPRSpill(MI))
9964 return isStackAccess(MI, FrameIndex, MemBytes);
9965
9966 if (isSGPRSpill(MI))
9967 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9968
9969 return Register();
9970}
9971
9973 int &FrameIndex,
9974 TypeSize &MemBytes) const {
9975 if (!MI.mayStore())
9976 return Register();
9977
9978 if (isMUBUF(MI) || isVGPRSpill(MI))
9979 return isStackAccess(MI, FrameIndex, MemBytes);
9980
9981 if (isSGPRSpill(MI))
9982 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9983
9984 return Register();
9985}
9986
9988 unsigned Opc = MI.getOpcode();
9990 unsigned DescSize = Desc.getSize();
9991
9992 // If we have a definitive size, we can use it. Otherwise we need to inspect
9993 // the operands to know the size.
9994 if (isFixedSize(MI)) {
9995 unsigned Size = DescSize;
9996
9997 // If we hit the buggy offset, an extra nop will be inserted in MC so
9998 // estimate the worst case.
9999 if (MI.isBranch() && ST.hasOffset3fBug())
10000 Size += 4;
10001
10002 return Size;
10003 }
10004
10005 // Instructions may have a 32-bit literal encoded after them. Check
10006 // operands that could ever be literals.
10007 if (isVALU(MI) || isSALU(MI)) {
10008 if (isDPP(MI))
10009 return DescSize;
10010 bool HasLiteral = false;
10011 unsigned LiteralSize = 4;
10012 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
10013 const MachineOperand &Op = MI.getOperand(I);
10014 const MCOperandInfo &OpInfo = Desc.operands()[I];
10015 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
10016 HasLiteral = true;
10017 if (ST.has64BitLiterals()) {
10018 switch (OpInfo.OperandType) {
10019 default:
10020 break;
10022 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
10023 LiteralSize = 8;
10024 break;
10026 // A 32-bit literal is only valid when the value fits in BOTH signed
10027 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
10028 // emitter's getLit64Encoding logic. This is because of the lack of
10029 // ability to tell signedness of the literal, therefore we need to
10030 // be conservative and assume values outside this range require a
10031 // 64-bit literal encoding (8 bytes).
10032 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
10033 !isUInt<32>(Op.getImm()))
10034 LiteralSize = 8;
10035 break;
10036 }
10037 }
10038 break;
10039 }
10040 }
10041 return HasLiteral ? DescSize + LiteralSize : DescSize;
10042 }
10043
10044 // Check whether we have extra NSA words.
10045 if (isMIMG(MI)) {
10046 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
10047 if (VAddr0Idx < 0)
10048 return 8;
10049
10050 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
10051 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
10052 }
10053
10054 switch (Opc) {
10055 case TargetOpcode::BUNDLE:
10056 return getInstBundleSize(MI);
10057 case TargetOpcode::INLINEASM:
10058 case TargetOpcode::INLINEASM_BR: {
10059 const MachineFunction *MF = MI.getMF();
10060 const char *AsmStr = MI.getOperand(0).getSymbolName();
10061 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
10062 }
10063 default:
10064 if (MI.isMetaInstruction())
10065 return 0;
10066
10067 // If D16 Pseudo inst, get correct MC code size
10068 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
10069 if (D16Info) {
10070 // Assume d16_lo/hi inst are always in same size
10071 unsigned LoInstOpcode = D16Info->LoOp;
10072 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
10073 DescSize = Desc.getSize();
10074 }
10075
10076 // If FMA Pseudo inst, get correct MC code size
10077 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
10078 // All potential lowerings are the same size; arbitrarily pick one.
10079 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
10080 DescSize = Desc.getSize();
10081 }
10082
10083 return DescSize;
10084 }
10085}
10086
10089 if (MI.isBranch() && ST.hasOffset3fBug())
10090 return InstSizeVerifyMode::NoVerify;
10091 return InstSizeVerifyMode::ExactSize;
10092}
10093
10095 if (!isFLAT(MI))
10096 return false;
10097
10098 if (MI.memoperands_empty())
10099 return true;
10100
10101 for (const MachineMemOperand *MMO : MI.memoperands()) {
10103 return true;
10104 }
10105 return false;
10106}
10107
10110 static const std::pair<int, const char *> TargetIndices[] = {
10111 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10112 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10113 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10114 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10115 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10116 return ArrayRef(TargetIndices);
10117}
10118
10119/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10120/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10123 const ScheduleDAG *DAG) const {
10124 return new GCNHazardRecognizer(DAG->MF);
10125}
10126
10127/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10128/// pass.
10131 MachineLoopInfo *MLI) const {
10132 return new GCNHazardRecognizer(MF, MLI);
10133}
10134
10135// Called during:
10136// - pre-RA scheduling and post-RA scheduling
10139 const ScheduleDAGMI *DAG) const {
10140 // Borrowed from Arm Target
10141 // We would like to restrict this hazard recognizer to only
10142 // post-RA scheduling; we can tell that we're post-RA because we don't
10143 // track VRegLiveness.
10144 if (!DAG->hasVRegLiveness())
10145 return new GCNHazardRecognizer(DAG->MF);
10147}
10148
10149std::pair<unsigned, unsigned>
10151 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10152}
10153
10156 static const std::pair<unsigned, const char *> TargetFlags[] = {
10157 {MO_GOTPCREL, "amdgpu-gotprel"},
10158 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10159 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10160 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10161 {MO_REL32_LO, "amdgpu-rel32-lo"},
10162 {MO_REL32_HI, "amdgpu-rel32-hi"},
10163 {MO_REL64, "amdgpu-rel64"},
10164 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10165 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10166 {MO_ABS64, "amdgpu-abs64"},
10167 };
10168
10169 return ArrayRef(TargetFlags);
10170}
10171
10174 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10175 {
10176 {MONoClobber, "amdgpu-noclobber"},
10177 {MOLastUse, "amdgpu-last-use"},
10178 {MOCooperative, "amdgpu-cooperative"},
10179 {MOThreadPrivate, "amdgpu-thread-private"},
10180 };
10181
10182 return ArrayRef(TargetFlags);
10183}
10184
10186 const MachineFunction &MF) const {
10188 assert(SrcReg.isVirtual());
10189 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10190 return AMDGPU::WWM_COPY;
10191
10192 return AMDGPU::COPY;
10193}
10194
10196 uint32_t Opcode = MI.getOpcode();
10197 // Check if it is SGPR spill or wwm-register spill Opcode.
10198 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10199 return true;
10200
10201 const MachineFunction *MF = MI.getMF();
10202 const MachineRegisterInfo &MRI = MF->getRegInfo();
10204
10205 // See if this is a live-range split instruction inserted for an SGPR or
10206 // wwm-register. The implicit def inserted for wwm-registers should also be
10207 // included as they can appear at the bb begin.
10208 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10209 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10210 return false;
10211
10212 Register Reg = MI.getOperand(0).getReg();
10213 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10214 return IsLRSplitInst;
10215
10216 return MFI->isWWMReg(Reg);
10217}
10218
10220 Register Reg) const {
10221 // We need to handle instructions which may be inserted during register
10222 // allocation to handle the prolog. The initial prolog instruction may have
10223 // been separated from the start of the block by spills and copies inserted
10224 // needed by the prolog. However, the insertions for scalar registers can
10225 // always be placed at the BB top as they are independent of the exec mask
10226 // value.
10227 bool IsNullOrVectorRegister = true;
10228 if (Reg) {
10229 const MachineFunction *MF = MI.getMF();
10230 const MachineRegisterInfo &MRI = MF->getRegInfo();
10231 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10232 }
10233
10234 return IsNullOrVectorRegister &&
10235 (canAddToBBProlog(MI) ||
10236 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10237 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10238}
10239
10243 const DebugLoc &DL,
10244 Register DestReg) const {
10245 if (ST.hasAddNoCarryInsts())
10246 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10247
10248 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10249 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10250 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10251
10252 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10253 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10254}
10255
10258 const DebugLoc &DL,
10259 Register DestReg,
10260 RegScavenger &RS) const {
10261 if (ST.hasAddNoCarryInsts())
10262 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10263
10264 // If available, prefer to use vcc.
10265 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10266 ? Register(RI.getVCC())
10267 : RS.scavengeRegisterBackwards(
10268 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10269 0, /* AllowSpill */ false);
10270
10271 // TODO: Users need to deal with this.
10272 if (!UnusedCarry.isValid())
10273 return MachineInstrBuilder();
10274
10275 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10276 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10277}
10278
10279bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10280 switch (Opcode) {
10281 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10282 case AMDGPU::SI_KILL_I1_TERMINATOR:
10283 return true;
10284 default:
10285 return false;
10286 }
10287}
10288
10290 switch (Opcode) {
10291 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10292 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10293 case AMDGPU::SI_KILL_I1_PSEUDO:
10294 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10295 default:
10296 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10297 }
10298}
10299
10300bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10301 return Imm <= getMaxMUBUFImmOffset(ST);
10302}
10303
10305 // GFX12 field is non-negative 24-bit signed byte offset.
10306 const unsigned OffsetBits =
10307 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10308 return (1 << OffsetBits) - 1;
10309}
10310
// On wave32 subtargets, rewrites every implicit VCC register operand of MI to
// VCC_LO. Inline asm is left untouched.
// NOTE(review): the signature line is missing from this extract.
10312 if (!ST.isWave32())
10313 return;
10314
10315 if (MI.isInlineAsm())
10316 return;
10317
// Bail out if MI carries fewer operands than its explicit operand count.
10318 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10319 return;
10320
10321 for (auto &Op : MI.implicit_operands()) {
10322 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10323 Op.setReg(AMDGPU::VCC_LO);
10324 }
10325}
10326
// Returns true if MI is an SMRD instruction whose sbase operand has a register
// class that is SGPR_128 or has SGPR_128 as a subclass. Returns false for
// non-SMRD instructions and for opcodes without an sbase operand.
// NOTE(review): the signature line is missing from this extract.
10328 if (!isSMRD(MI))
10329 return false;
10330
10331 // Check that it is using a buffer resource.
10332 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10333 if (Idx == -1) // e.g. s_memtime
10334 return false;
10335
10336 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10337 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10338}
10339
10340// Given Imm, split it into the values to put into the SOffset and ImmOffset
10341// fields in an MUBUF instruction. Return false if it is not possible (due to a
10342// hardware bug needing a workaround).
10343//
10344// The required alignment ensures that individual address components remain
10345// aligned if they are aligned to begin with. It also ensures that additional
10346// offsets within the given alignment can be added to the resulting ImmOffset.
// NOTE(review): the first line of this signature is missing from this extract.
// The body reads and rewrites Imm and writes the SOffset and ImmOffset outputs;
// the outputs are only written when the function returns true.
10348 uint32_t &ImmOffset, Align Alignment) const {
10349 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
// Largest immediate that is still a multiple of the requested alignment.
10350 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
// Part of the offset that does not fit the immediate and goes into SOffset.
10351 uint32_t Overflow = 0;
10352
10353 if (Imm > MaxImm) {
10354 if (Imm <= MaxImm + 64) {
10355 // Use an SOffset inline constant for 4..64
10356 Overflow = Imm - MaxImm;
10357 Imm = MaxImm;
10358 } else {
10359 // Try to keep the same value in SOffset for adjacent loads, so that
10360 // the corresponding register contents can be re-used.
10361 //
10362 // Load values with all low-bits (except for alignment bits) set into
10363 // SOffset, so that a larger range of values can be covered using
10364 // s_movk_i32.
10365 //
10366 // Atomic operations fail to work correctly when individual address
10367 // components are unaligned, even if their sum is aligned.
// Bias Imm by the alignment before splitting at the immediate field width,
// then remove the bias from the high part again.
10368 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10369 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10370 Imm = Low;
10371 Overflow = High - Alignment.value();
10372 }
10373 }
10374
10375 if (Overflow > 0) {
10376 // There is a hardware bug in SI and CI which prevents address clamping in
10377 // MUBUF instructions from working correctly with SOffsets. The immediate
10378 // offset is unaffected.
10379 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10380 return false;
10381
10382 // It is not possible to set immediate in SOffset field on some targets.
10383 if (ST.hasRestrictedSOffset())
10384 return false;
10385 }
10386
10387 ImmOffset = Imm;
10388 SOffset = Overflow;
10389 return true;
10390}
10391
10392// Depending on the used address space and instructions, some immediate offsets
10393// are allowed and some are not.
10394// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10395// scratch instruction offsets can also be negative. On GFX12, offsets can be
10396// negative for all variants.
10397//
10398// There are several bugs related to these offsets:
10399// On gfx10.1, flat instructions that go into the global address space cannot
10400// use an offset.
10401//
10402// For scratch instructions, the address can be either an SGPR or a VGPR.
10403// The following offsets can be used, depending on the architecture (x means
10404// cannot be used):
10405// +----------------------------+------+------+
10406// | Address-Mode | SGPR | VGPR |
10407// +----------------------------+------+------+
10408// | gfx9 | | |
10409// | negative, 4-aligned offset | x | ok |
10410// | negative, unaligned offset | x | ok |
10411// +----------------------------+------+------+
10412// | gfx10 | | |
10413// | negative, 4-aligned offset | ok | ok |
10414// | negative, unaligned offset | ok | x |
10415// +----------------------------+------+------+
10416// | gfx10.3 | | |
10417// | negative, 4-aligned offset | ok | ok |
10418// | negative, unaligned offset | ok | ok |
10419// +----------------------------+------+------+
10420//
10421// This function ignores the addressing mode, so if an offset cannot be used in
10422// one addressing mode, it is considered illegal.
10423bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10424 uint64_t FlatVariant) const {
10425 // TODO: Should 0 be special cased?
10426 if (!ST.hasFlatInstOffsets())
10427 return false;
10428
10429 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10430 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10431 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10432 return false;
10433
10434 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10435 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10436 (Offset % 4) != 0) {
10437 return false;
10438 }
10439
10440 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10441 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10442 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10443}
10444
10445// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10446std::pair<int64_t, int64_t>
10447SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10448 uint64_t FlatVariant) const {
10449 int64_t RemainderOffset = COffsetVal;
10450 int64_t ImmField = 0;
10451
10452 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10453 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10454
10455 if (AllowNegative) {
10456 // Use signed division by a power of two to truncate towards 0.
10457 int64_t D = 1LL << NumBits;
10458 RemainderOffset = (COffsetVal / D) * D;
10459 ImmField = COffsetVal - RemainderOffset;
10460
10461 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10462 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10463 (ImmField % 4) != 0) {
10464 // Make ImmField a multiple of 4
10465 RemainderOffset += ImmField % 4;
10466 ImmField -= ImmField % 4;
10467 }
10468 } else if (COffsetVal >= 0) {
10469 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10470 RemainderOffset = COffsetVal - ImmField;
10471 }
10472
10473 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10474 assert(RemainderOffset + ImmField == COffsetVal);
10475 return {ImmField, RemainderOffset};
10476}
10477
// Returns whether a negative immediate offset may be used with FlatVariant:
// never for FlatScratch on subtargets with the negative scratch offset bug;
// otherwise for every variant other than plain FLAT, and for plain FLAT as well
// on GFX12+.
// NOTE(review): the signature line is missing from this extract.
10479 if (ST.hasNegativeScratchOffsetBug() &&
10480 FlatVariant == SIInstrFlags::FlatScratch)
10481 return false;
10482
10483 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10484}
10485
// Selects the SIEncodingFamily value matching the generation of ST. An
// unhandled generation falls out of the switch into llvm_unreachable.
// NOTE(review): the `case` labels and several return lines of this switch are
// missing from this extract.
10486static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10487 switch (ST.getGeneration()) {
10488 default:
10489 break;
10492 return SIEncodingFamily::SI;
10495 return SIEncodingFamily::VI;
10499 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10502 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10506 }
10507 llvm_unreachable("Unknown subtarget generation!");
10508}
10509
10510bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10511 switch(MCOp) {
10512 // These opcodes use indirect register addressing so
10513 // they need special handling by codegen (currently missing).
10514 // Therefore it is too risky to allow these opcodes
10515 // to be selected by dpp combiner or sdwa peepholer.
10516 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10517 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10518 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10519 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10520 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10521 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10522 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10523 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10524 return true;
10525 default:
10526 return false;
10527 }
10528}
10529
// Expands to the `case` labels for the _dpp, _e32, _e64, _e64_dpp and _sdwa
// variants of OPCODE; intended to be used inside a switch statement.
10530#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10531 case OPCODE##_dpp: \
10532 case OPCODE##_e32: \
10533 case OPCODE##_e64: \
10534 case OPCODE##_e64_dpp: \
10535 case OPCODE##_sdwa:
10536
10537static bool isRenamedInGFX9(int Opcode) {
10538 switch (Opcode) {
10539 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10540 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10541 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10542 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10543 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10544 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10545 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10546 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10547 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10548 //
10549 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10550 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10551 case AMDGPU::V_FMA_F16_gfx9_e64:
10552 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10553 case AMDGPU::V_INTERP_P2_F16:
10554 case AMDGPU::V_MAD_F16_e64:
10555 case AMDGPU::V_MAD_U16_e64:
10556 case AMDGPU::V_MAD_I16_e64:
10557 return true;
10558 default:
10559 return false;
10560 }
10561}
10562
// Translates a pseudo opcode into the MC opcode to use on the current
// subtarget. Returns Opcode itself when the lookup yields -1 (already a native
// instruction), and -1 when the pseudo has no encoding for this subtarget or
// the resulting opcode is asm-only (see isAsmOnlyOpcode).
// NOTE(review): several statement lines are missing from this extract (the
// statements guarded by some of the `if`s below and the `case` labels of the
// SDWA switch), so the encoding-family adjustments cannot be read from here.
10563int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10564 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10565 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10566
// Start from the encoding family implied by the subtarget generation.
10567 unsigned Gen = subtargetEncodingFamily(ST);
10568
10569 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10571
10572 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10573 // subtarget has UnpackedD16VMem feature.
10574 // TODO: remove this when we discard GFX80 encoding.
10575 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10577
10578 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10579 switch (ST.getGeneration()) {
10580 default:
10582 break;
10585 break;
10588 break;
10589 }
10590 }
10591
// MAI opcodes are first replaced by their early-clobber variant when one
// exists.
10592 if (isMAI(Opcode)) {
10593 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10594 if (MFMAOp != -1)
10595 Opcode = MFMAOp;
10596 }
10597
10598 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10599
10600 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10602
10603 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10605
10606 // -1 means that Opcode is already a native instruction.
10607 if (MCOp == -1)
10608 return Opcode;
10609
// On GFX90A-capable subtargets a more specific opcode, when found, overrides
// the generic lookup result.
10610 if (ST.hasGFX90AInsts()) {
10611 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10612 if (ST.hasGFX940Insts())
10614 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10616 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10618 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10619 MCOp = NMCOp;
10620 }
10621
10622 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10623 // encoding in the given subtarget generation.
10624 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10625 return -1;
10626
10627 if (isAsmOnlyOpcode(MCOp))
10628 return -1;
10629
10630 return MCOp;
10631}
10632
// Returns the register/subregister pair of RegOpnd, or a default-constructed
// pair when the operand is undef. RegOpnd must be a register operand.
// NOTE(review): the return type and parameter line of this signature are
// missing from this extract.
10633static
10635 assert(RegOpnd.isReg());
10636 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10637 getRegSubRegPair(RegOpnd);
10638}
10639
// Searches the (register, subreg-index) operand pairs of a REG_SEQUENCE for the
// pair whose subreg index equals SubReg and returns its register via
// getRegOrUndef.
// NOTE(review): the signature and the fall-through return statement are
// missing from this extract.
10642 assert(MI.isRegSequence());
// Operand 0 is skipped; the remaining operands are visited in pairs.
10643 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10644 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10645 auto &RegOp = MI.getOperand(1 + 2 * I);
10646 return getRegOrUndef(RegOp);
10647 }
10649}
10650
10651// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10652// Following a subreg of reg:subreg isn't supported
// On success RSR is updated in place and true is returned; RSR may then hold an
// empty pair if the followed operand was undef.
// NOTE(review): the signature line is missing from this extract.
10655 if (!RSR.SubReg)
10656 return false;
10657 switch (MI.getOpcode()) {
10658 default: break;
10659 case AMDGPU::REG_SEQUENCE:
10660 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10661 return true;
10662 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10663 case AMDGPU::INSERT_SUBREG:
10664 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10665 // inserted the subreg we're looking for
10666 RSR = getRegOrUndef(MI.getOperand(2));
10667 else { // the subreg in the rest of the reg
10668 auto R1 = getRegOrUndef(MI.getOperand(1));
10669 if (R1.SubReg) // subreg of subreg isn't supported
10670 return false;
10671 RSR.Reg = R1.Reg;
10672 }
10673 return true;
10674 }
10675 return false;
10676}
10677
// Follows the SSA definition chain of P.Reg through COPY / V_MOV_B32_e32 and
// through the subreg pseudos handled by followSubRegDef, and returns the last
// defining instruction that could not be followed any further. Returns nullptr
// if P.Reg is not virtual, has no definition, or an undef source is reached.
// NOTE(review): the first line of this signature is missing from this extract.
10679 const MachineRegisterInfo &MRI) {
10680 assert(MRI.isSSA());
10681 if (!P.Reg.isVirtual())
10682 return nullptr;
10683
10684 auto RSR = P;
10685 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10686 while (auto *MI = DefInst) {
// DefInst is only set again below when the chain can be followed further.
10687 DefInst = nullptr;
10688 switch (MI->getOpcode()) {
10689 case AMDGPU::COPY:
10690 case AMDGPU::V_MOV_B32_e32: {
10691 auto &Op1 = MI->getOperand(1);
10692 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10693 if (Op1.isUndef())
10694 return nullptr;
10695 RSR = getRegSubRegPair(Op1);
10696 DefInst = MRI.getVRegDef(RSR.Reg);
10697 }
10698 break;
10699 }
10700 default:
10701 if (followSubRegDef(*MI, RSR)) {
10702 if (!RSR.Reg)
10703 return nullptr;
10704 DefInst = MRI.getVRegDef(RSR.Reg);
10705 }
10706 }
// Nothing further to follow: MI is the definition we were looking for.
10707 if (!DefInst)
10708 return MI;
10709 }
10710 return nullptr;
10711}
10712
// Conservatively reports whether EXEC may be modified between DefMI and UseMI.
// Returns true when the two instructions are in different blocks, when more
// than MaxInstScan non-debug instructions lie between them, or when one of the
// scanned instructions modifies EXEC; returns false otherwise.
// NOTE(review): the first line of this signature is missing from this extract.
10714 Register VReg,
10715 const MachineInstr &DefMI,
10716 const MachineInstr &UseMI) {
10717 assert(MRI.isSSA() && "Must be run on SSA");
10718
10719 auto *TRI = MRI.getTargetRegisterInfo();
10720 auto *DefBB = DefMI.getParent();
10721
10722 // Don't bother searching between blocks, although it is possible this block
10723 // doesn't modify exec.
10724 if (UseMI.getParent() != DefBB)
10725 return true;
10726
10727 const int MaxInstScan = 20;
10728 int NumInst = 0;
10729
10730 // Stop scan at the use.
10731 auto E = UseMI.getIterator();
10732 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
// Debug instructions do not count towards the scan limit.
10733 if (I->isDebugInstr())
10734 continue;
10735
10736 if (++NumInst > MaxInstScan)
10737 return true;
10738
10739 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10740 return true;
10741 }
10742
10743 return false;
10744}
10745
// Conservatively reports whether EXEC may be modified between DefMI and any
// non-debug use of VReg. Returns true when a use is a PHI or lives in another
// block, when there are more than MaxUseScan uses, when more than MaxInstScan
// non-debug instructions are scanned, or when a scanned instruction has a
// non-use register operand overlapping EXEC. Returns false when there are no
// uses or when all uses are reached first.
// NOTE(review): the first line of this signature is missing from this extract.
10747 Register VReg,
10748 const MachineInstr &DefMI) {
10749 assert(MRI.isSSA() && "Must be run on SSA");
10750
10751 auto *TRI = MRI.getTargetRegisterInfo();
10752 auto *DefBB = DefMI.getParent();
10753
10754 const int MaxUseScan = 10;
10755 int NumUse = 0;
10756
// First pass: count the uses and reject anything outside DefMI's block.
10757 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10758 auto &UseInst = *Use.getParent();
10759 // Don't bother searching between blocks, although it is possible this block
10760 // doesn't modify exec.
10761 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10762 return true;
10763
10764 if (++NumUse > MaxUseScan)
10765 return true;
10766 }
10767
10768 if (NumUse == 0)
10769 return false;
10770
10771 const int MaxInstScan = 20;
10772 int NumInst = 0;
10773
10774 // Stop scan when we have seen all the uses.
10775 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
// All counted uses are in this block, so the scan is expected to reach them
// before running off the end of the block.
10776 assert(I != DefBB->end());
10777
10778 if (I->isDebugInstr())
10779 continue;
10780
10781 if (++NumInst > MaxInstScan)
10782 return true;
10783
10784 for (const MachineOperand &Op : I->operands()) {
10785 // We don't check reg masks here as they're used only on calls:
10786 // 1. EXEC is only considered const within one BB
10787 // 2. Call should be a terminator instruction if present in a BB
10788
10789 if (!Op.isReg())
10790 continue;
10791
10792 Register Reg = Op.getReg();
10793 if (Op.isUse()) {
10794 if (Reg == VReg && --NumUse == 0)
10795 return false;
10796 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10797 return true;
10798 }
10799 }
10800}
10801
// Scans MBB from its beginning up to LastPHIIt (or the block end). If a
// non-PHI instruction in that range reads Dst, the COPY from Src to Dst is
// inserted right before it; otherwise the generic TargetInstrInfo
// implementation decides the insertion point.
// NOTE(review): the leading lines of this signature are missing from this
// extract.
10804 const DebugLoc &DL, Register Src, Register Dst) const {
10805 auto Cur = MBB.begin();
10806 if (Cur != MBB.end())
10807 do {
10808 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10809 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10810 ++Cur;
10811 } while (Cur != MBB.end() && Cur != LastPHIIt);
10812
10813 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10814 Dst);
10815}
10816
// When the insertion point is an SI_IF / SI_ELSE / SI_IF_BREAK that defines
// Src, the copy is emitted after that instruction using the lane-mask
// terminator move opcode with an implicit EXEC operand. In every other case the
// generic TargetInstrInfo implementation is used.
// NOTE(review): the leading lines of this signature are missing from this
// extract.
10819 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10820 if (InsPt != MBB.end() &&
10821 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10822 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10823 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10824 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
// Step past the defining instruction so the copy reads the new value.
10825 InsPt++;
10826 return BuildMI(MBB, InsPt, DL,
10827 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10828 .addReg(Src, {}, SrcSubReg)
10829 .addReg(AMDGPU::EXEC, RegState::Implicit);
10830 }
10831 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10832 Dst);
10833}
10834
10835bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10836
// Returns true if FirstMI modifies any register that SecondMI uses.
// NOTE(review): the first line of this signature (including the function name)
// is missing from this extract.
10838 const MachineInstr &SecondMI) const {
10839 for (const auto &Use : SecondMI.all_uses()) {
10840 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10841 return true;
10842 }
10843 return false;
10844}
10845
10846/// If OpX is multicycle, anti-dependencies are not allowed.
10847/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10848/// purpose.
// NOTE(review): the first signature line (including the function name) and the
// single body statement are missing from this extract; going by the comment
// above, the body presumably forwards to isDPMACCInstruction - confirm against
// the original source.
10850 const MachineInstr &OpX) const {
10852}
10853
// Never folds a memory operand (always returns nullptr). Its only effect is to
// constrain the register class of the virtual register in a full copy between
// a virtual and a physical register, as explained in the comment below.
// NOTE(review): the leading lines of this signature are missing from this
// extract.
10856 ArrayRef<unsigned> Ops, int FrameIndex,
10857 MachineInstr *&CopyMI, LiveIntervals *LIS,
10858 VirtRegMap *VRM) const {
10859 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10860 //
10861 // %0:sreg_32 = COPY $m0
10862 //
10863 // We explicitly chose SReg_32 for the virtual register so such a copy might
10864 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10865 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10866 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10867 // TargetInstrInfo::foldMemoryOperand() is going to try.
10868 // A similar issue also exists with spilling and reloading $exec registers.
10869 //
10870 // To prevent that, constrain the %0 register class here.
10871 if (isFullCopyInstr(MI)) {
10872 Register DstReg = MI.getOperand(0).getReg();
10873 Register SrcReg = MI.getOperand(1).getReg();
// Only act when exactly one side of the copy is a virtual register.
10874 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10875 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10876 MachineRegisterInfo &MRI = MF.getRegInfo();
10877 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10878 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10879 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10880 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10881 return nullptr;
10882 }
10883 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10884 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10885 return nullptr;
10886 }
10887 }
10888 }
10889
10890 return nullptr;
10891}
10892
// Returns the scheduling-model latency of MI. For a bundle the result is the
// maximum latency over the bundled instructions plus the number of bundled
// instructions minus one.
// NOTE(review): the first signature line and the declaration of the iterator I
// are missing from this extract.
10894 const MachineInstr &MI,
10895 unsigned *PredCost) const {
10896 if (MI.isBundle()) {
10898 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10899 unsigned Lat = 0, Count = 0;
// Walk the instructions bundled with their predecessor, starting after I.
10900 for (++I; I != E && I->isBundledWithPred(); ++I) {
10901 ++Count;
10902 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10903 }
10904 return Lat + Count - 1;
10905 }
10906
10907 return SchedModel.computeInstrLatency(&MI);
10908}
10909
// Returns the named src0 operand of MI when it has one.
// NOTE(review): the signature line (including the function name) and the
// fallback return are missing from this extract.
10910const MachineOperand &
10912 if (const MachineOperand *CallAddrOp =
10913 getNamedOperand(MI, AMDGPU::OpName::src0))
10914 return *CallAddrOp;
10916}
10917
// Classifies the uniformity of a generic (pre-isel) machine instruction:
// address space casts, intrinsics, generic loads and generic atomics are
// examined in turn.
// NOTE(review): the signature and most of the return statements are missing
// from this extract, so the exact ValueUniformity result of several branches
// cannot be read from here.
10920 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10921 unsigned Opcode = MI.getOpcode();
10922
// Inspects a cast from the private to the flat address space on subtargets
// with globally addressable scratch. For a GIntrinsic the source register is
// operand 2, otherwise operand 1.
10923 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10924 Register Dst = MI.getOperand(0).getReg();
10925 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10926 : MI.getOperand(1).getReg();
10927 LLT DstTy = MRI.getType(Dst);
10928 LLT SrcTy = MRI.getType(Src);
10929 unsigned DstAS = DstTy.getAddressSpace();
10930 unsigned SrcAS = SrcTy.getAddressSpace();
10931 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10932 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10933 ST.hasGloballyAddressableScratch()
10936 };
10937
10938 // If the target supports globally addressable scratch, the mapping from
10939 // scratch memory to the flat aperture changes therefore an address space cast
10940 // is no longer uniform.
10941 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10942 return HandleAddrSpaceCast(MI);
10943
10944 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10945 auto IID = GI->getIntrinsicID();
10950
10951 switch (IID) {
10952 case Intrinsic::amdgcn_addrspacecast_nonnull:
10953 return HandleAddrSpaceCast(MI);
10954 case Intrinsic::amdgcn_if:
10955 case Intrinsic::amdgcn_else:
10956 // FIXME: Uniform if second result
10957 break;
10958 }
10959
10961 }
10962
10963 // Loads from the private and flat address spaces are divergent, because
10964 // threads can execute the load instruction with the same inputs and get
10965 // different results.
10966 //
10967 // All other loads are not divergent, because if threads issue loads with the
10968 // same arguments, they will always get the same result.
10969 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10970 Opcode == AMDGPU::G_SEXTLOAD) {
10971 if (MI.memoperands_empty())
10972 return ValueUniformity::NeverUniform; // conservative assumption
10973
10974 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10975 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10976 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10977 })) {
10978 // At least one MMO in a non-global address space.
10980 }
10982 }
10983
10984 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10985 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10986 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10987 AMDGPU::isGenericAtomic(Opcode)) {
10989 }
10991}
10992
// Returns the AMDGPUMIRFormatter for this subtarget, creating it on first use
// and caching it in the Formatter member.
// NOTE(review): the signature line is missing from this extract.
10994 if (!Formatter)
10995 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10996 return Formatter.get();
10997}
10998
// Classifies the uniformity of the value produced by MI. The checks run in
// order: isNeverUniform, the readlane / readfirstlane / restore-from-VGPR
// opcodes, copies from physical registers, generic (pre-isel) opcodes, atomics,
// FLAT loads, and finally the register banks of the operands MI reads.
// NOTE(review): the signature and most of the return statements are missing
// from this extract, so the exact ValueUniformity result of several branches
// cannot be read from here.
11000
11001 if (isNeverUniform(MI))
11003
11004 unsigned opcode = MI.getOpcode();
11005 if (opcode == AMDGPU::V_READLANE_B32 ||
11006 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
11007 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
11009
11010 if (isCopyInstr(MI)) {
11011 const MachineOperand &srcOp = MI.getOperand(1);
// A copy from a physical register is classified by that register's base
// class: SGPR classes are always uniform.
11012 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
11013 const TargetRegisterClass *regClass =
11014 RI.getPhysRegBaseClass(srcOp.getReg());
11015 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
11017 }
11019 }
11020
11021 // GMIR handling
11022 if (MI.isPreISelOpcode())
11024
11025 // Atomics are divergent because they are executed sequentially: when an
11026 // atomic operation refers to the same address in each thread, then each
11027 // thread after the first sees the value written by the previous thread as
11028 // original value.
11029
11030 if (isAtomic(MI))
11032
11033 // Loads from the private and flat address spaces are divergent, because
11034 // threads can execute the load instruction with the same inputs and get
11035 // different results.
11036 if (isFLAT(MI) && MI.mayLoad()) {
11037 if (MI.memoperands_empty())
11038 return ValueUniformity::NeverUniform; // conservative assumption
11039
11040 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
11041 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
11042 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
11043 })) {
11044 // At least one MMO in a non-global address space.
11046 }
11047
11049 }
11050
11051 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
11052 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
11053
11054 // FIXME: It's conceptually broken to report this for an instruction, and not
11055 // a specific def operand. For inline asm in particular, there could be mixed
11056 // uniform and divergent results.
11057 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
11058 const MachineOperand &SrcOp = MI.getOperand(I);
11059 if (!SrcOp.isReg())
11060 continue;
11061
11062 Register Reg = SrcOp.getReg();
// Only register operands that are actually read are of interest.
11063 if (!Reg || !SrcOp.readsReg())
11064 continue;
11065
11066 // If RegBank is null, this is unassigned or an unallocatable special
11067 // register, which are all scalars.
11068 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
11069 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
11071 }
11072
11073 // TODO: Uniformity check conditions above can be rearranged for more
11074 // readability
11075
11076 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
11077 // currently turned into no-op COPYs by SelectionDAG ISel and are
11078 // therefore no longer recognizable.
11079
11081}
11082
// Maps the calling convention of MF's function to a small integer (0-3). Some
// calling conventions emit a "ds_ordered_count unsupported" diagnostic and then
// fall through to the default value 0.
// NOTE(review): the signature and most `case` labels are missing from this
// extract, so which calling convention maps to which value cannot be read from
// here.
11084 switch (MF.getFunction().getCallingConv()) {
11086 return 1;
11088 return 2;
11090 return 3;
11094 const Function &F = MF.getFunction();
11095 F.getContext().diagnose(DiagnosticInfoUnsupported(
11096 F, "ds_ordered_count unsupported for this calling conv"));
11097 [[fallthrough]];
11098 }
11101 case CallingConv::C:
11102 case CallingConv::Fast:
11103 default:
11104 // Assume other calling conventions are various compute callable functions
11105 return 0;
11106 }
11107}
11108
// Decodes a scalar compare instruction. On success returns true and sets:
//   SrcReg   - the register of operand 0,
//   SrcReg2  - the register of operand 1, or an empty Register when operand 1
//              is an immediate,
//   CmpValue - the immediate of operand 1, or 0 for the register form,
//   CmpMask  - ~0.
// Returns false for any other opcode, when operand 0 is not a register, or when
// a register operand carries a subregister index.
// NOTE(review): the first line of this signature is missing from this extract.
11110 Register &SrcReg2, int64_t &CmpMask,
11111 int64_t &CmpValue) const {
11112 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11113 return false;
11114
11115 switch (MI.getOpcode()) {
11116 default:
11117 break;
// S_CMP_*: the second operand may be a register or an immediate.
11118 case AMDGPU::S_CMP_EQ_U32:
11119 case AMDGPU::S_CMP_EQ_I32:
11120 case AMDGPU::S_CMP_LG_U32:
11121 case AMDGPU::S_CMP_LG_I32:
11122 case AMDGPU::S_CMP_LT_U32:
11123 case AMDGPU::S_CMP_LT_I32:
11124 case AMDGPU::S_CMP_GT_U32:
11125 case AMDGPU::S_CMP_GT_I32:
11126 case AMDGPU::S_CMP_LE_U32:
11127 case AMDGPU::S_CMP_LE_I32:
11128 case AMDGPU::S_CMP_GE_U32:
11129 case AMDGPU::S_CMP_GE_I32:
11130 case AMDGPU::S_CMP_EQ_U64:
11131 case AMDGPU::S_CMP_LG_U64:
11132 SrcReg = MI.getOperand(0).getReg();
11133 if (MI.getOperand(1).isReg()) {
11134 if (MI.getOperand(1).getSubReg())
11135 return false;
11136 SrcReg2 = MI.getOperand(1).getReg();
11137 CmpValue = 0;
11138 } else if (MI.getOperand(1).isImm()) {
11139 SrcReg2 = Register();
11140 CmpValue = MI.getOperand(1).getImm();
11141 } else {
11142 return false;
11143 }
11144 CmpMask = ~0;
11145 return true;
// S_CMPK_*: the second operand is read as an immediate.
11146 case AMDGPU::S_CMPK_EQ_U32:
11147 case AMDGPU::S_CMPK_EQ_I32:
11148 case AMDGPU::S_CMPK_LG_U32:
11149 case AMDGPU::S_CMPK_LG_I32:
11150 case AMDGPU::S_CMPK_LT_U32:
11151 case AMDGPU::S_CMPK_LT_I32:
11152 case AMDGPU::S_CMPK_GT_U32:
11153 case AMDGPU::S_CMPK_GT_I32:
11154 case AMDGPU::S_CMPK_LE_U32:
11155 case AMDGPU::S_CMPK_LE_I32:
11156 case AMDGPU::S_CMPK_GE_U32:
11157 case AMDGPU::S_CMPK_GE_I32:
11158 SrcReg = MI.getOperand(0).getReg();
11159 SrcReg2 = Register();
11160 CmpValue = MI.getOperand(1).getImm();
11161 CmpMask = ~0;
11162 return true;
11163 }
11164
11165 return false;
11166}
11167
// Returns true if none of MBB's successors has SCC as a live-in register.
// NOTE(review): the signature line is missing from this extract.
11169 for (MachineBasicBlock *S : MBB->successors()) {
11170 if (S->isLiveIn(AMDGPU::SCC))
11171 return false;
11172 }
11173 return true;
11174}
11175
11176// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11177// (incoming SCC) = !(SCC defined by SCCDef).
11178// Return true if all uses can be re-written, false otherwise.
11179bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11180 MachineBasicBlock *MBB = SCCDef->getParent();
11181 SmallVector<MachineInstr *> InvertInstr;
11182 bool SCCIsDead = false;
11183
11184 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11185 constexpr unsigned ScanLimit = 12;
11186 unsigned Count = 0;
11187 for (MachineInstr &MI :
11188 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11189 if (++Count > ScanLimit)
11190 return false;
11191 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11192 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11193 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11194 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11195 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11196 InvertInstr.push_back(&MI);
11197 else
11198 return false;
11199 }
11200 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11201 SCCIsDead = true;
11202 break;
11203 }
11204 }
11205 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11206 SCCIsDead = true;
11207
11208 // SCC may have more uses. Can't invert all of them.
11209 if (!SCCIsDead)
11210 return false;
11211
11212 // Invert uses
11213 for (MachineInstr *MI : InvertInstr) {
11214 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11215 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11216 swapOperands(*MI);
11217 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11218 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11219 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11220 ? AMDGPU::S_CBRANCH_SCC1
11221 : AMDGPU::S_CBRANCH_SCC0));
11222 } else {
11223 llvm_unreachable("SCC used but no inversion handling");
11224 }
11225 }
11226 return true;
11227}
11228
11229// SCC is already valid after SCCValid.
11230// SCCRedefine will redefine SCC to the same value already available after
11231// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11232// update kill/dead flags if necessary.
11233bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11234 bool NeedInversion) const {
11235 MachineInstr *KillsSCC = nullptr;
11236 if (SCCValid->getParent() != SCCRedefine->getParent())
11237 return false;
11238 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11239 SCCRedefine->getIterator())) {
11240 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11241 return false;
11242 if (MI.killsRegister(AMDGPU::SCC, &RI))
11243 KillsSCC = &MI;
11244 }
11245 if (NeedInversion && !invertSCCUse(SCCRedefine))
11246 return false;
11247 if (MachineOperand *SccDef =
11248 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11249 SccDef->setIsDead(false);
11250 if (KillsSCC)
11251 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11252 SCCRedefine->eraseFromParent();
11253 return true;
11254}
11255
11256static bool foldableSelect(const MachineInstr &Def) {
11257 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11258 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11259 return false;
11260 bool Op1IsNonZeroImm =
11261 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11262 bool Op2IsZeroImm =
11263 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11264 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11265 return false;
11266 return true;
11267}
11268
11269static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11270 unsigned &NewDefOpc) {
11271 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11272 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11273 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11274 Def.getOpcode() != AMDGPU::S_ADD_U32)
11275 return false;
11276 const MachineOperand &AddSrc1 = Def.getOperand(1);
11277 const MachineOperand &AddSrc2 = Def.getOperand(2);
11278 int64_t addend;
11279
11280 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11281 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11282 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11283 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11284 return false;
11285
11286 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11287 const MachineOperand *SccDef =
11288 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11289 if (!SccDef->isDead())
11290 return false;
11291 NewDefOpc = AMDGPU::S_ADD_U32;
11292 }
11293 NeedInversion = !NeedInversion;
11294 return true;
11295}
11296
// NOTE(review): the embedded listing numbers jump from 11296 to 11298, so the
// line that opens this definition (return type, name and leading parameters)
// is missing from this copy. CmpInstr and SrcReg, used below, are presumably
// declared on that line -- confirm against the upstream file.
//
// Tries to fold a scalar compare (CmpInstr) into the instruction that defines
// SrcReg, following the patterns listed in the comments of the two lambdas
// below. Returns true when one of the lambdas reports success and false
// otherwise. CmpMask is not referenced anywhere in the visible body.
11298 Register SrcReg2, int64_t CmpMask,
11299 int64_t CmpValue,
11300 const MachineRegisterInfo *MRI) const {
// Only a valid, non-physical source register is handled.
11301 if (!SrcReg || SrcReg.isPhysical())
11302 return false;
11303
// A second register operand is accepted only if getFoldableImm succeeds on it.
// CmpValue is passed as the immediate out-argument, so it is presumably
// overwritten with the folded constant here -- confirm against getFoldableImm.
// Both lambdas below capture CmpValue by copy after this point.
11304 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11305 return false;
11306
// optimizeCmpSelect: handles compares against zero whose source is defined by
// an instruction accepted by setsSCCIfResultIsNonZero, setsSCCIfResultIsZero
// or foldableSelect. NeedInversion is a by-value parameter that
// setsSCCIfResultIsZero may modify through its reference argument before it is
// forwarded to optimizeSCC.
11307 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11308 this](bool NeedInversion) -> bool {
11309 if (CmpValue != 0)
11310 return false;
11311
11312 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11313 if (!Def)
11314 return false;
11315
11316 // For S_OP that set SCC = DST!=0, do the transformation
11317 //
11318 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11319 //
11320 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11321 // do the transformation:
11322 //
11323 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11324 //
11325 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11326 // for S_CSELECT* already has the same value that will be calculated by
11327 // s_cmp_lg_*
11328 //
11329 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11330 // (non-zero imm), 0)
11331
// NewDefOpc starts out as the current opcode; setsSCCIfResultIsZero receives
// it by reference and may request a different opcode. The three predicates are
// tried in order and short-circuit, so later ones only run if earlier ones
// returned false.
11332 unsigned NewDefOpc = Def->getOpcode();
11333 if (!setsSCCIfResultIsNonZero(*Def) &&
11334 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11335 !foldableSelect(*Def))
11336 return false;
11337
11338 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11339 return false;
11340
// The opcode of Def is rewritten only after optimizeSCC has succeeded.
11341 if (NewDefOpc != Def->getOpcode())
11342 Def->setDesc(get(NewDefOpc));
11343
11344 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11345 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11346 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11347 // sX = s_cselect_b64 (non-zero imm), 0
11348 // sLo = copy sX.sub0
11349 // sHi = copy sX.sub1
11350 // sY = s_or_b32 sLo, sHi
11351 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11352 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11353 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11354 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11355 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11356 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11357 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
// Both OR inputs must be COPYs of the same register, the first reading sub0
// and the second reading sub1 (this exact operand order is required).
11358 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11359 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11360 Def2->getOperand(1).isReg() &&
11361 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11362 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11363 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11364 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
// The result of this second optimizeSCC call is deliberately not checked; the
// lambda returns true regardless because the compare was already handled.
11365 if (Select && foldableSelect(*Select))
11366 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11367 }
11368 }
11369 }
11370 return true;
11371 };
11372
// optimizeCmpAnd: handles compares whose source is an S_AND_B32/S_AND_B64 with
// a single-bit mask.
//   ExpectedValue - 0 or 1 at the call sites below; shifted left by the mask's
//                   bit number before being compared with CmpValue.
//   SrcSize       - operand width in bits (32 or 64 at the call sites below).
//   IsReversible  - whether the opposite compare value may also be matched.
//   IsSigned      - when set, a mask on the top bit (SrcSize - 1) is rejected.
11373 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11374 this](int64_t ExpectedValue, unsigned SrcSize,
11375 bool IsReversible, bool IsSigned) -> bool {
11376 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11377 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11378 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11379 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11380 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11381 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11382 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11383 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11384 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11385 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11386 //
11387 // Signed ge/gt are not used for the sign bit.
11388 //
11389 // If result of the AND is unused except in the compare:
11390 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11391 //
11392 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11393 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11394 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11395 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11396 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11397 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11398
11399 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11400 if (!Def)
11401 return false;
11402
11403 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11404 Def->getOpcode() != AMDGPU::S_AND_B64)
11405 return false;
11406
// isMask: reads the operand as an immediate (directly, or through
// getFoldableImm), truncates it to SrcSize bits and stores it in Mask; returns
// true only if exactly one bit remains set. Mask is written as a side effect
// even when the power-of-two test fails.
11407 int64_t Mask;
11408 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11409 if (MO->isImm())
11410 Mask = MO->getImm();
11411 else if (!getFoldableImm(MO, Mask))
11412 return false;
11413 Mask &= maxUIntN(SrcSize);
11414 return isPowerOf2_64(Mask);
11415 };
11416
// Whichever AND operand is the mask, SrcOp ends up pointing at the other one.
// Operand 1 is tested first.
11417 MachineOperand *SrcOp = &Def->getOperand(1);
11418 if (isMask(SrcOp))
11419 SrcOp = &Def->getOperand(2);
11420 else if (isMask(&Def->getOperand(2)))
11421 SrcOp = &Def->getOperand(1);
11422 else
11423 return false;
11424
11425 // A valid Mask is required to have a single bit set, hence a non-zero and
11426 // power-of-two value. This verifies that we will not do 64-bit shift below.
11427 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11428 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11429 if (IsSigned && BitNo == SrcSize - 1)
11430 return false;
11431
// Move the expected 0/1 into the mask's bit position so it can be compared
// with CmpValue directly.
11432 ExpectedValue <<= BitNo;
11433
// If the compare value does not match, the only other accepted value is the
// expected value with the mask bit toggled, and only for reversible compares.
11434 bool IsReversedCC = false;
11435 if (CmpValue != ExpectedValue) {
11436 if (!IsReversible)
11437 return false;
11438 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11439 if (!IsReversedCC)
11440 return false;
11441 }
11442
// The reversed form requires the AND result to have exactly one non-debug use
// (presumably the compare itself -- confirm), because the AND is then replaced
// by S_BITCMP0 below.
11443 Register DefReg = Def->getOperand(0).getReg();
11444 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11445 return false;
11446
11447 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11448 return false;
11449
// If the AND result still has non-debug users, the AND is kept as is. The
// reversed case is asserted not to reach this branch.
11450 if (!MRI->use_nodbg_empty(DefReg)) {
11451 assert(!IsReversedCC);
11452 return true;
11453 }
11454
11455 // Replace AND with unused result with a S_BITCMP.
11456 MachineBasicBlock *MBB = Def->getParent();
11457
// Opcode choice: width selects B32 vs B64, IsReversedCC selects BITCMP0 vs
// BITCMP1.
11458 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11459 : AMDGPU::S_BITCMP1_B32
11460 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11461 : AMDGPU::S_BITCMP1_B64;
11462
// The new instruction is inserted before Def using Def's debug location, with
// the non-mask operand and the bit number as operands; Def is then erased.
11463 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11464 .add(*SrcOp)
11465 .addImm(BitNo);
11466 Def->eraseFromParent();
11467
11468 return true;
11469 };
11470
// Dispatch on the compare opcode. optimizeCmpAnd arguments are
// (ExpectedValue, SrcSize, IsReversible, IsSigned). For EQ/LG on 32 bits and
// LG on 64 bits, optimizeCmpSelect is tried only if optimizeCmpAnd fails;
// S_CMP_EQ_U64 has no optimizeCmpSelect fallback.
11471 switch (CmpInstr.getOpcode()) {
11472 default:
11473 break;
11474 case AMDGPU::S_CMP_EQ_U32:
11475 case AMDGPU::S_CMP_EQ_I32:
11476 case AMDGPU::S_CMPK_EQ_U32:
11477 case AMDGPU::S_CMPK_EQ_I32:
11478 return optimizeCmpAnd(1, 32, true, false) ||
11479 optimizeCmpSelect(/*NeedInversion=*/true);
11480 case AMDGPU::S_CMP_GE_U32:
11481 case AMDGPU::S_CMPK_GE_U32:
11482 return optimizeCmpAnd(1, 32, false, false);
11483 case AMDGPU::S_CMP_GE_I32:
11484 case AMDGPU::S_CMPK_GE_I32:
11485 return optimizeCmpAnd(1, 32, false, true);
11486 case AMDGPU::S_CMP_EQ_U64:
11487 return optimizeCmpAnd(1, 64, true, false);
11488 case AMDGPU::S_CMP_LG_U32:
11489 case AMDGPU::S_CMP_LG_I32:
11490 case AMDGPU::S_CMPK_LG_U32:
11491 case AMDGPU::S_CMPK_LG_I32:
11492 return optimizeCmpAnd(0, 32, true, false) ||
11493 optimizeCmpSelect(/*NeedInversion=*/false);
11494 case AMDGPU::S_CMP_GT_U32:
11495 case AMDGPU::S_CMPK_GT_U32:
11496 return optimizeCmpAnd(0, 32, false, false);
11497 case AMDGPU::S_CMP_GT_I32:
11498 case AMDGPU::S_CMPK_GT_I32:
11499 return optimizeCmpAnd(0, 32, false, true);
11500 case AMDGPU::S_CMP_LG_U64:
11501 return optimizeCmpAnd(0, 64, true, false) ||
11502 optimizeCmpSelect(/*NeedInversion=*/false);
11503 }
11504
// Unhandled opcodes fall through to here.
11505 return false;
11506}
11507
// NOTE(review): the embedded listing numbers skip 11508, 11523 and 11526, so
// three lines are missing from this copy: the line that opens this definition
// (MI is presumably declared there), and the lines that presumably declare MRI
// and begin the initializer of Undef. Restore them from the upstream file
// before compiling.
//
// On subtargets that need aligned VGPRs, wraps the operand named OpName in a
// 64-bit Align2 register: the original data register becomes sub0, an
// IMPLICIT_DEF value becomes sub1, the operand is rewritten to use sub0 of the
// new register, and the new register is added to MI as an extra implicit use.
11509 AMDGPU::OpName OpName) const {
11510 if (!ST.needsAlignedVGPRs())
11511 return;
11512
// Nothing to do if this opcode has no operand with the given name.
11513 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11514 if (OpNo < 0)
11515 return;
11516 MachineOperand &Op = MI.getOperand(OpNo);
// Operands for which getOpSize reports more than 4 are left untouched.
11517 if (getOpSize(MI, OpNo) > 4)
11518 return;
11519
11520 // Add implicit aligned super-reg to force alignment on the data operand.
11521 const DebugLoc &DL = MI.getDebugLoc();
11522 MachineBasicBlock *BB = MI.getParent();
11524 Register DataReg = Op.getReg();
// AGPR and VGPR data registers get matching 32-bit and 64-bit Align2 classes.
11525 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11527 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11528 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11529 Register NewVR =
11530 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11531 : &AMDGPU::VReg_64_Align2RegClass);
// The original sub-register index of the operand is carried over onto the
// REG_SEQUENCE input.
11532 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11533 .addReg(DataReg, {}, Op.getSubReg())
11534 .addImm(AMDGPU::sub0)
11535 .addReg(Undef)
11536 .addImm(AMDGPU::sub1);
11537 Op.setReg(NewVR);
11538 Op.setSubReg(AMDGPU::sub0);
// CreateReg(NewVR, /*isDef=*/false, /*isImp=*/true): an implicit use of the
// whole new register.
11539 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11540}
11541
// NOTE(review): the embedded listing numbers skip 11542 and 11546, so both the
// line that opens this definition and the statement that follows the IGLP
// check (presumably the fall-through return) are missing from this copy --
// restore them from the upstream file.
//
// The only behavior visible here: instructions for which isIGLP is true yield
// false.
11543 if (isIGLP(*MI))
11544 return false;
11545
11547}
11548
// NOTE(review): the embedded listing numbers skip 11549, so the line that
// opens this definition is missing from this copy; MI is presumably declared
// there.
//
// Returns false unless MI is a WMMA or SWMMAC instruction. For those, a
// subtarget with GFX1250 instructions defers to AMDGPU::getWMMAIsXDL on the
// opcode; every other subtarget yields true.
11550 if (!isWMMA(MI) && !isSWMMAC(MI))
11551 return false;
11552
11553 if (ST.hasGFX1250Insts())
11554 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11555
11556 return true;
11557}
11558
// NOTE(review): the embedded listing numbers skip 11559, so the line that
// opens this definition is missing from this copy; MI is presumably declared
// there.
//
// Decision order visible in the body:
//   1. GFX12+: true iff MI is a DOT instruction or isXDLWMMA(MI) holds.
//   2. Otherwise false for anything that is not MAI, for DGEMM opcodes, and
//      for V_ACCVGPR_WRITE_B32_e64 / V_ACCVGPR_READ_B32_e64.
//   3. Remaining MAI instructions: true without GFX940 instructions, else the
//      result of AMDGPU::getMAIIsGFX940XDL on the opcode.
11560 unsigned Opcode = MI.getOpcode();
11561
11562 if (AMDGPU::isGFX12Plus(ST))
11563 return isDOT(MI) || isXDLWMMA(MI);
11564
11565 if (!isMAI(MI) || isDGEMM(Opcode) ||
11566 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11567 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11568 return false;
11569
11570 if (!ST.hasGFX940Insts())
11571 return true;
11572
11573 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11574}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static constexpr AMDGPU::OpName ModifierOpNames[]
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:158
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
void storeRegToStackSlotCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given Opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating waterfall for instruction MI. This function takes into consideration of...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI, bool NeedsCFI) const
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
bool spillSGPRToVGPR() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:301
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:616
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:613
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:615
@ TI_CONSTDATA_START
Definition AMDGPU.h:612
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:558
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.