1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85 AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
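// [Editorial sketch - not part of the original source.] The decrement above is
// the usual MachineInstr -> MachineSDNode index shift: getNamedOperandIdx
// counts the MachineInstr's result as operand 0, while a MachineSDNode's
// operand list starts at the first source. The helper name below is made up
// for illustration only.
namespace {
constexpr int sdNodeOperandIndex(int NamedOperandIdx) {
  return NamedOperandIdx < 0 ? -1 : NamedOperandIdx - 1;
}
static_assert(sdNodeOperandIndex(2) == 1 && sdNodeOperandIndex(-1) == -1, "");
} // namespace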
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is fine here since every VALU instruction has one.
133 // Apart from that, we want all of the generic logic for this.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // Unlike the generic method, which does not allow rematerialization when
140 // there are virtual register uses, we do allow such uses; this is also why
141 // this method handles SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
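// [Editorial example - not part of the original source.] Roughly, a compare
// whose only users mask it with exec, e.g.
//   %c:sreg_64 = V_CMP_EQ_U32_e64 %a, %b, implicit $exec
//   %m:sreg_64 = S_AND_B64 %c, $exec, implicit-def $scc
// is treated as not depending on exec and may be hoisted or sunk, while any
// other kind of use makes resultDependsOnExec() conservatively return true.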
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193 // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199 MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206 // Check if sinking of MI would create temporal divergent use.
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has a divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222 FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has a divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239 int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266 // TODO: We should report true if the used offsets are adjacent (excluding
267 // the st64 versions).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303 const ConstantSDNode *Load0Offset =
304 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
305 const ConstantSDNode *Load1Offset =
306 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element sized units, so we need to convert
411 // to bytes of the individual reads.
412
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
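// [Editorial sketch - not part of the original source.] Worked example for the
// two-offset DS path above: offsets are in element-sized units and must be
// consecutive, so a ds_read2_b32 with offset0=4 and offset1=5 is reported as a
// single 8-byte access at byte offset EltSize * offset0 = 4 * 4 = 16 (with
// EltSize additionally scaled by 64 for the st64 variants). The helper name is
// made up.
namespace {
constexpr bool isConsecutiveRead2Pair(unsigned Off0, unsigned Off1) {
  return (Off0 & 0xff) + 1 == (Off1 & 0xff);
}
static_assert(isConsecutiveRead2Pair(4, 5) && !isConsecutiveRead2Pair(4, 6),
              "");
} // namespace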
529
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 ArrayRef<const MachineOperand *> BaseOps1,
532 const MachineInstr &MI2,
533 ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562 int64_t Offset1, bool OffsetIsScalable1,
563 ArrayRef<const MachineOperand *> BaseOps2,
564 int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584 // To avoid register pressure, the number of DWORDs loaded together by all
585 // clustered mem ops should, on average, not exceed MaxMemoryClusterDWords.
586 // This is an empirical value based on observations and performance-related
587 // experiments.
588 // The good thing about this heuristic is - it avoids clustering of too many
589 // sub-word loads, and also avoids clustering of wide loads. Below is the
590 // brief summary of how the heuristic behaves for various `LoadSize` when
591 // MaxMemoryClusterDWords is 8.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
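// [Editorial sketch - not part of the original source.] A compile-time model
// of the DWORD heuristic above, assuming the default limit of 8: four 8-byte
// loads occupy 8 DWORDs and are still clustered, while four 12-byte loads
// occupy 12 DWORDs and are rejected. The helper name is made up.
namespace {
constexpr unsigned clusteredDWords(unsigned NumBytes, unsigned ClusterSize) {
  return ((NumBytes / ClusterSize + 3) / 4) * ClusterSize;
}
static_assert(clusteredDWords(/*NumBytes=*/32, /*ClusterSize=*/4) <= 8, "");
static_assert(clusteredDWords(/*NumBytes=*/48, /*ClusterSize=*/4) > 8, "");
} // namespace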
602
603// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really be split into two batches of 16.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614 int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have less than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
620
621 // A cacheline is 64 bytes (for global memory).
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
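// [Editorial sketch - not part of the original source.] A minimal model of the
// check above: loads keep being scheduled together only while the batch stays
// at or below 16 loads and within one 64-byte cacheline. The helper name is
// made up.
namespace {
constexpr bool scheduleLoadsNearModel(long Offset0, long Offset1,
                                      unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}
static_assert(scheduleLoadsNearModel(0, 48, 8), "");
static_assert(!scheduleLoadsNearModel(0, 96, 8), "");
} // namespace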
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
632 LLVMContext &C = MF->getFunction().getContext();
633 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643 MachineBasicBlock &MBB,
644 MachineBasicBlock::iterator MI,
645 const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703 Builder.addReg(ImpUseSuperReg,
704 getKillRegState(KillSrc) | RegState::Implicit);
705 }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use register number to pick one of three round-robin temps.
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750 UseBuilder.addReg(ImpUseSuperReg,
751 getKillRegState(KillSrc) | RegState::Implicit);
752 }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
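// [Editorial example - not part of the original source.] On GFX908 an SGPR
// cannot be written directly into an AGPR, so a copy like
//   $agpr0 = COPY $sgpr4
// is expanded roughly as
//   $vgprN = V_MOV_B32_e32 $sgpr4              ; reserved/scavenged VGPR temp
//   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgprN
// unless the loop above finds a defining V_ACCVGPR_WRITE whose operand can be
// propagated directly.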
761
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763 MachineBasicBlock::iterator I, const DebugLoc &DL,
764 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
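// [Editorial example - not part of the original source.] For an aligned
// 128-bit SGPR copy, the 64-bit combining above turns
//   $sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr8_sgpr9_sgpr10_sgpr11
// into two moves,
//   $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9
//   $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11
// with an implicit def of the full destination on the first instruction and,
// when requested, an implicit kill of the full source on the last one.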
813
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815 MachineBasicBlock::iterator MI,
816 const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 if (DestReg == AMDGPU::VCC_LO) {
870 // FIXME: Hack until VReg_1 removed.
871 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
872 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
873 .addImm(0)
874 .addReg(SrcReg, getKillRegState(KillSrc));
875 return;
876 }
877
878 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
879 return;
880 }
881
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
883 .addReg(SrcReg, getKillRegState(KillSrc));
884 return;
885 }
886
887 if (RC == &AMDGPU::SReg_64RegClass) {
888 if (SrcReg == AMDGPU::SCC) {
889 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
890 .addImm(1)
891 .addImm(0);
892 return;
893 }
894
895 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
896 if (DestReg == AMDGPU::VCC) {
897 // FIXME: Hack until VReg_1 removed.
898 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
899 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
900 .addImm(0)
901 .addReg(SrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
906 return;
907 }
908
909 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 return;
912 }
913
914 if (DestReg == AMDGPU::SCC) {
915 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
916 // but SelectionDAG emits such copies for i1 sources.
917 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
918 // This copy can only be produced by patterns
919 // with explicit SCC, which are known to be enabled
920 // only for subtargets with S_CMP_LG_U64 present.
921 assert(ST.hasScalarCompareEq64());
922 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
923 .addReg(SrcReg, getKillRegState(KillSrc))
924 .addImm(0);
925 } else {
926 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
927 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
928 .addReg(SrcReg, getKillRegState(KillSrc))
929 .addImm(0);
930 }
931
932 return;
933 }
934
935 if (RC == &AMDGPU::AGPR_32RegClass) {
936 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
937 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
938 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
939 .addReg(SrcReg, getKillRegState(KillSrc));
940 return;
941 }
942
943 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
944 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
945 .addReg(SrcReg, getKillRegState(KillSrc));
946 return;
947 }
948
949 // FIXME: Pass should maintain scavenger to avoid scan through the block on
950 // every AGPR spill.
951 RegScavenger RS;
952 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
953 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
954 return;
955 }
956
957 if (Size == 16) {
958 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
959 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
960 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
961
962 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
963 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
964 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
965 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
966 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
967 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
968 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
969 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
970
971 if (IsSGPRDst) {
972 if (!IsSGPRSrc) {
973 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
974 return;
975 }
976
977 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
978 .addReg(NewSrcReg, getKillRegState(KillSrc));
979 return;
980 }
981
982 if (IsAGPRDst || IsAGPRSrc) {
983 if (!DstLow || !SrcLow) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
985 "Cannot use hi16 subreg with an AGPR!");
986 }
987
988 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
989 return;
990 }
991
992 if (ST.useRealTrue16Insts()) {
993 if (IsSGPRSrc) {
994 assert(SrcLow);
995 SrcReg = NewSrcReg;
996 }
997 // Use the smaller instruction encoding if possible.
998 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
999 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1001 .addReg(SrcReg);
1002 } else {
1003 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1004 .addImm(0) // src0_modifiers
1005 .addReg(SrcReg)
1006 .addImm(0); // op_sel
1007 }
1008 return;
1009 }
1010
1011 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1012 if (!DstLow || !SrcLow) {
1013 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1014 "Cannot use hi16 subreg on VI!");
1015 }
1016
1017 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1018 .addReg(NewSrcReg, getKillRegState(KillSrc));
1019 return;
1020 }
1021
1022 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1023 .addImm(0) // src0_modifiers
1024 .addReg(NewSrcReg)
1025 .addImm(0) // clamp
1026 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1027 : AMDGPU::SDWA::SdwaSel::WORD_1)
1028 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1029 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1030 : AMDGPU::SDWA::SdwaSel::WORD_1)
1031 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1032 // First implicit operand is $exec.
1033 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1034 return;
1035 }
1036
1037 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1038 if (ST.hasMovB64()) {
1039 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1040 .addReg(SrcReg, getKillRegState(KillSrc));
1041 return;
1042 }
1043 if (ST.hasPkMovB32()) {
1044 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1045 .addImm(SISrcMods::OP_SEL_1)
1046 .addReg(SrcReg)
1047 .addImm(SISrcMods::OP_SEL_1)
1048 .addReg(SrcReg)
1049 .addImm(0) // op_sel_lo
1050 .addImm(0) // op_sel_hi
1051 .addImm(0) // neg_lo
1052 .addImm(0) // neg_hi
1053 .addImm(0) // clamp
1054 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1055 return;
1056 }
1057 }
1058
1059 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1060 if (RI.isSGPRClass(RC)) {
1061 if (!RI.isSGPRClass(SrcRC)) {
1062 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1063 return;
1064 }
1065 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1066 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1067 Forward);
1068 return;
1069 }
1070
1071 unsigned EltSize = 4;
1072 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1073 if (RI.isAGPRClass(RC)) {
1074 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1075 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1076 else if (RI.hasVGPRs(SrcRC) ||
1077 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1078 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1079 else
1080 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1081 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1082 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1083 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1084 (RI.isProperlyAlignedRC(*RC) &&
1085 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1086 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1087 if (ST.hasMovB64()) {
1088 Opcode = AMDGPU::V_MOV_B64_e32;
1089 EltSize = 8;
1090 } else if (ST.hasPkMovB32()) {
1091 Opcode = AMDGPU::V_PK_MOV_B32;
1092 EltSize = 8;
1093 }
1094 }
1095
1096 // For the cases where we need an intermediate instruction/temporary register
1097 // (destination is an AGPR), we need a scavenger.
1098 //
1099 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1100 // whole block for every handled copy.
1101 std::unique_ptr<RegScavenger> RS;
1102 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1103 RS = std::make_unique<RegScavenger>();
1104
1105 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1106
1107 // If there is an overlap, we can't kill the super-register on the last
1108 // instruction, since it will also kill the components made live by this def.
1109 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1110 const bool CanKillSuperReg = KillSrc && !Overlap;
1111
1112 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1113 unsigned SubIdx;
1114 if (Forward)
1115 SubIdx = SubIndices[Idx];
1116 else
1117 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1118 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1119 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1120 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1121
1122 bool IsFirstSubreg = Idx == 0;
1123 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1124
1125 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1126 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1127 Register ImpUseSuper = SrcReg;
1128 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1129 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1130 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1131 MachineInstrBuilder MIB =
1132 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1133 .addImm(SISrcMods::OP_SEL_1)
1134 .addReg(SrcSubReg)
1135 .addImm(SISrcMods::OP_SEL_1)
1136 .addReg(SrcSubReg)
1137 .addImm(0) // op_sel_lo
1138 .addImm(0) // op_sel_hi
1139 .addImm(0) // neg_lo
1140 .addImm(0) // neg_hi
1141 .addImm(0) // clamp
1142 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1143 if (IsFirstSubreg)
1144 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1145 } else {
1146 MachineInstrBuilder Builder =
1147 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1148 if (IsFirstSubreg)
1149 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1150
1151 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1152 }
1153 }
1154}
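// [Editorial example - not part of the original source.] As a concrete case of
// the sub-register loop above, a 64-bit VGPR copy on a target without
// v_mov_b64 or v_pk_mov_b32 becomes roughly
//   $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit-def $vgpr2_vgpr3, implicit $vgpr4_vgpr5
//   $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit killed $vgpr4_vgpr5
// walking the sub-registers in the direction chosen by the Forward flag so
// that overlapping copies do not clobber lanes that have not been copied yet.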
1155
1156int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1157 int NewOpc;
1158
1159 // Try to map original to commuted opcode
1160 NewOpc = AMDGPU::getCommuteRev(Opcode);
1161 if (NewOpc != -1)
1162 // Check if the commuted (REV) opcode exists on the target.
1163 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1164
1165 // Try to map commuted to original opcode
1166 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1167 if (NewOpc != -1)
1168 // Check if the original (non-REV) opcode exists on the target.
1169 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1170
1171 return Opcode;
1172}
1173
1174const TargetRegisterClass *
1176 return &AMDGPU::VGPR_32RegClass;
1177}
1178
1179void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1180 MachineBasicBlock::iterator I,
1181 const DebugLoc &DL, Register DstReg,
1182 ArrayRef<MachineOperand> Cond,
1183 Register TrueReg,
1184 Register FalseReg) const {
1185 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1186 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1188 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1189 "Not a VGPR32 reg");
1190
1191 if (Cond.size() == 1) {
1192 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1193 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1194 .add(Cond[0]);
1195 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1196 .addImm(0)
1197 .addReg(FalseReg)
1198 .addImm(0)
1199 .addReg(TrueReg)
1200 .addReg(SReg);
1201 } else if (Cond.size() == 2) {
1202 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1203 switch (Cond[0].getImm()) {
1204 case SIInstrInfo::SCC_TRUE: {
1205 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1206 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 break;
1214 }
1215 case SIInstrInfo::SCC_FALSE: {
1216 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1217 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1218 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1219 .addImm(0)
1220 .addReg(FalseReg)
1221 .addImm(0)
1222 .addReg(TrueReg)
1223 .addReg(SReg);
1224 break;
1225 }
1226 case SIInstrInfo::VCCNZ: {
1227 MachineOperand RegOp = Cond[1];
1228 RegOp.setImplicit(false);
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1231 .add(RegOp);
1232 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1233 .addImm(0)
1234 .addReg(FalseReg)
1235 .addImm(0)
1236 .addReg(TrueReg)
1237 .addReg(SReg);
1238 break;
1239 }
1240 case SIInstrInfo::VCCZ: {
1241 MachineOperand RegOp = Cond[1];
1242 RegOp.setImplicit(false);
1243 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1244 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1245 .add(RegOp);
1246 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addReg(SReg);
1252 break;
1253 }
1254 case SIInstrInfo::EXECNZ: {
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1257 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1258 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1259 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1260 .addImm(0)
1261 .addReg(FalseReg)
1262 .addImm(0)
1263 .addReg(TrueReg)
1264 .addReg(SReg);
1265 break;
1266 }
1267 case SIInstrInfo::EXECZ: {
1268 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1269 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1270 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1271 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 llvm_unreachable("Unhandled branch predicate EXECZ");
1279 break;
1280 }
1281 default:
1282 llvm_unreachable("invalid branch predicate");
1283 }
1284 } else {
1285 llvm_unreachable("Can only handle Cond size 1 or 2");
1286 }
1287}
1288
1291 const DebugLoc &DL,
1292 Register SrcReg, int Value) const {
1293 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1294 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1295 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1296 .addImm(Value)
1297 .addReg(SrcReg);
1298
1299 return Reg;
1300}
1301
1304 const DebugLoc &DL,
1305 Register SrcReg, int Value) const {
1306 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1307 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1308 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1309 .addImm(Value)
1310 .addReg(SrcReg);
1311
1312 return Reg;
1313}
1314
1315bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1316 const Register Reg,
1317 int64_t &ImmVal) const {
1318 switch (MI.getOpcode()) {
1319 case AMDGPU::V_MOV_B32_e32:
1320 case AMDGPU::S_MOV_B32:
1321 case AMDGPU::S_MOVK_I32:
1322 case AMDGPU::S_MOV_B64:
1323 case AMDGPU::V_MOV_B64_e32:
1324 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1325 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1326 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1327 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::V_MOV_B64_PSEUDO: {
1329 const MachineOperand &Src0 = MI.getOperand(1);
1330 if (Src0.isImm()) {
1331 ImmVal = Src0.getImm();
1332 return MI.getOperand(0).getReg() == Reg;
1333 }
1334
1335 return false;
1336 }
1337 case AMDGPU::S_BREV_B32:
1338 case AMDGPU::V_BFREV_B32_e32:
1339 case AMDGPU::V_BFREV_B32_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(1);
1341 if (Src0.isImm()) {
1342 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_NOT_B32:
1349 case AMDGPU::V_NOT_B32_e32:
1350 case AMDGPU::V_NOT_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 default:
1360 return false;
1361 }
1362}
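// [Editorial sketch - not part of the original source.] The NOT and BREV cases
// above fold the operation into the reported constant. A minimal model of the
// 32-bit NOT fold (the helper name is made up):
namespace {
constexpr long long notFoldedImm(int Imm) {
  return static_cast<long long>(~Imm);
}
static_assert(notFoldedImm(0) == -1 && notFoldedImm(-1) == 0, "");
} // namespace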
1363
1364unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1365
1366 if (RI.isAGPRClass(DstRC))
1367 return AMDGPU::COPY;
1368 if (RI.getRegSizeInBits(*DstRC) == 16) {
1369 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1370 // before RA.
1371 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1372 }
1373 if (RI.getRegSizeInBits(*DstRC) == 32)
1374 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1375 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1376 return AMDGPU::S_MOV_B64;
1377 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1378 return AMDGPU::V_MOV_B64_PSEUDO;
1379 return AMDGPU::COPY;
1380}
1381
1382const MCInstrDesc &
1383SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1384 bool IsIndirectSrc) const {
1385 if (IsIndirectSrc) {
1386 if (VecSize <= 32) // 4 bytes
1387 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1388 if (VecSize <= 64) // 8 bytes
1389 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1390 if (VecSize <= 96) // 12 bytes
1391 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1392 if (VecSize <= 128) // 16 bytes
1393 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1394 if (VecSize <= 160) // 20 bytes
1395 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1396 if (VecSize <= 256) // 32 bytes
1397 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1398 if (VecSize <= 288) // 36 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1400 if (VecSize <= 320) // 40 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1402 if (VecSize <= 352) // 44 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1404 if (VecSize <= 384) // 48 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1406 if (VecSize <= 512) // 64 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1408 if (VecSize <= 1024) // 128 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1410
1411 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1412 }
1413
1414 if (VecSize <= 32) // 4 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1416 if (VecSize <= 64) // 8 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1418 if (VecSize <= 96) // 12 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1420 if (VecSize <= 128) // 16 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1422 if (VecSize <= 160) // 20 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1424 if (VecSize <= 256) // 32 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1426 if (VecSize <= 288) // 36 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1428 if (VecSize <= 320) // 40 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1430 if (VecSize <= 352) // 44 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1432 if (VecSize <= 384) // 48 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1434 if (VecSize <= 512) // 64 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1436 if (VecSize <= 1024) // 128 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1438
1439 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1440}
1441
1442static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1443 if (VecSize <= 32) // 4 bytes
1444 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1445 if (VecSize <= 64) // 8 bytes
1446 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1447 if (VecSize <= 96) // 12 bytes
1448 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1449 if (VecSize <= 128) // 16 bytes
1450 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1451 if (VecSize <= 160) // 20 bytes
1452 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1453 if (VecSize <= 256) // 32 bytes
1454 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1455 if (VecSize <= 288) // 36 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1457 if (VecSize <= 320) // 40 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1459 if (VecSize <= 352) // 44 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1461 if (VecSize <= 384) // 48 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1463 if (VecSize <= 512) // 64 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1465 if (VecSize <= 1024) // 128 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1467
1468 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1469}
1470
1471static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1472 if (VecSize <= 32) // 4 bytes
1473 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1474 if (VecSize <= 64) // 8 bytes
1475 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1476 if (VecSize <= 96) // 12 bytes
1477 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1478 if (VecSize <= 128) // 16 bytes
1479 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1480 if (VecSize <= 160) // 20 bytes
1481 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1482 if (VecSize <= 256) // 32 bytes
1483 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1484 if (VecSize <= 288) // 36 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1486 if (VecSize <= 320) // 40 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1488 if (VecSize <= 352) // 44 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1490 if (VecSize <= 384) // 48 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1492 if (VecSize <= 512) // 64 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1494 if (VecSize <= 1024) // 128 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1496
1497 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1498}
1499
1500static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1501 if (VecSize <= 64) // 8 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1503 if (VecSize <= 128) // 16 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1505 if (VecSize <= 256) // 32 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515const MCInstrDesc &
1516SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1517 bool IsSGPR) const {
1518 if (IsSGPR) {
1519 switch (EltSize) {
1520 case 32:
1521 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1522 case 64:
1523 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1524 default:
1525 llvm_unreachable("invalid reg indexing elt size");
1526 }
1527 }
1528
1529 assert(EltSize == 32 && "invalid reg indexing elt size");
1530 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1531}
1532
1533static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1534 switch (Size) {
1535 case 4:
1536 return AMDGPU::SI_SPILL_S32_SAVE;
1537 case 8:
1538 return AMDGPU::SI_SPILL_S64_SAVE;
1539 case 12:
1540 return AMDGPU::SI_SPILL_S96_SAVE;
1541 case 16:
1542 return AMDGPU::SI_SPILL_S128_SAVE;
1543 case 20:
1544 return AMDGPU::SI_SPILL_S160_SAVE;
1545 case 24:
1546 return AMDGPU::SI_SPILL_S192_SAVE;
1547 case 28:
1548 return AMDGPU::SI_SPILL_S224_SAVE;
1549 case 32:
1550 return AMDGPU::SI_SPILL_S256_SAVE;
1551 case 36:
1552 return AMDGPU::SI_SPILL_S288_SAVE;
1553 case 40:
1554 return AMDGPU::SI_SPILL_S320_SAVE;
1555 case 44:
1556 return AMDGPU::SI_SPILL_S352_SAVE;
1557 case 48:
1558 return AMDGPU::SI_SPILL_S384_SAVE;
1559 case 64:
1560 return AMDGPU::SI_SPILL_S512_SAVE;
1561 case 128:
1562 return AMDGPU::SI_SPILL_S1024_SAVE;
1563 default:
1564 llvm_unreachable("unknown register size");
1565 }
1566}
1567
1568static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1569 switch (Size) {
1570 case 2:
1571 return AMDGPU::SI_SPILL_V16_SAVE;
1572 case 4:
1573 return AMDGPU::SI_SPILL_V32_SAVE;
1574 case 8:
1575 return AMDGPU::SI_SPILL_V64_SAVE;
1576 case 12:
1577 return AMDGPU::SI_SPILL_V96_SAVE;
1578 case 16:
1579 return AMDGPU::SI_SPILL_V128_SAVE;
1580 case 20:
1581 return AMDGPU::SI_SPILL_V160_SAVE;
1582 case 24:
1583 return AMDGPU::SI_SPILL_V192_SAVE;
1584 case 28:
1585 return AMDGPU::SI_SPILL_V224_SAVE;
1586 case 32:
1587 return AMDGPU::SI_SPILL_V256_SAVE;
1588 case 36:
1589 return AMDGPU::SI_SPILL_V288_SAVE;
1590 case 40:
1591 return AMDGPU::SI_SPILL_V320_SAVE;
1592 case 44:
1593 return AMDGPU::SI_SPILL_V352_SAVE;
1594 case 48:
1595 return AMDGPU::SI_SPILL_V384_SAVE;
1596 case 64:
1597 return AMDGPU::SI_SPILL_V512_SAVE;
1598 case 128:
1599 return AMDGPU::SI_SPILL_V1024_SAVE;
1600 default:
1601 llvm_unreachable("unknown register size");
1602 }
1603}
1604
1605static unsigned getAVSpillSaveOpcode(unsigned Size) {
1606 switch (Size) {
1607 case 4:
1608 return AMDGPU::SI_SPILL_AV32_SAVE;
1609 case 8:
1610 return AMDGPU::SI_SPILL_AV64_SAVE;
1611 case 12:
1612 return AMDGPU::SI_SPILL_AV96_SAVE;
1613 case 16:
1614 return AMDGPU::SI_SPILL_AV128_SAVE;
1615 case 20:
1616 return AMDGPU::SI_SPILL_AV160_SAVE;
1617 case 24:
1618 return AMDGPU::SI_SPILL_AV192_SAVE;
1619 case 28:
1620 return AMDGPU::SI_SPILL_AV224_SAVE;
1621 case 32:
1622 return AMDGPU::SI_SPILL_AV256_SAVE;
1623 case 36:
1624 return AMDGPU::SI_SPILL_AV288_SAVE;
1625 case 40:
1626 return AMDGPU::SI_SPILL_AV320_SAVE;
1627 case 44:
1628 return AMDGPU::SI_SPILL_AV352_SAVE;
1629 case 48:
1630 return AMDGPU::SI_SPILL_AV384_SAVE;
1631 case 64:
1632 return AMDGPU::SI_SPILL_AV512_SAVE;
1633 case 128:
1634 return AMDGPU::SI_SPILL_AV1024_SAVE;
1635 default:
1636 llvm_unreachable("unknown register size");
1637 }
1638}
1639
1640static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1641 bool IsVectorSuperClass) {
1642 // Currently, only 32-bit WWM register spills are needed.
1643 if (Size != 4)
1644 llvm_unreachable("unknown wwm register spill size");
1645
1646 if (IsVectorSuperClass)
1647 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1648
1649 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1650}
1651
1652unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1653 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1654 const SIMachineFunctionInfo &MFI) const {
1655 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1656
1657 // Choose the right opcode if spilling a WWM register.
1658 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1659 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1660
1661 // TODO: Check if AGPRs are available
1662 if (ST.hasMAIInsts())
1663 return getAVSpillSaveOpcode(Size);
1664
1665 return getVGPRSpillSaveOpcode(Size);
1666}
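// [Editorial sketch - not part of the original source.] The selection above is
// a three-way priority: WWM-flagged registers use the WWM spill pseudos, then
// subtargets with MAI instructions use the AV pseudos (which can hold either
// AGPRs or VGPRs), and the plain VGPR pseudos are the fallback. A minimal
// model with made-up names:
namespace {
enum class SpillKindModel { WWM, AV, VGPR };
constexpr SpillKindModel pickSpillKind(bool IsWWMReg, bool HasMAIInsts) {
  return IsWWMReg ? SpillKindModel::WWM
                  : (HasMAIInsts ? SpillKindModel::AV : SpillKindModel::VGPR);
}
static_assert(pickSpillKind(true, true) == SpillKindModel::WWM, "");
static_assert(pickSpillKind(false, true) == SpillKindModel::AV, "");
static_assert(pickSpillKind(false, false) == SpillKindModel::VGPR, "");
} // namespace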
1667
1668void SIInstrInfo::storeRegToStackSlot(
1669 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1670 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1671 const TargetRegisterInfo *TRI, Register VReg,
1672 MachineInstr::MIFlag Flags) const {
1673 MachineFunction *MF = MBB.getParent();
1674 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1675 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1676 const DebugLoc &DL = MBB.findDebugLoc(MI);
1677
1678 MachinePointerInfo PtrInfo
1679 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1680 MachineMemOperand *MMO = MF->getMachineMemOperand(
1681 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683 unsigned SpillSize = TRI->getSpillSize(*RC);
1684
1685 MachineRegisterInfo &MRI = MF->getRegInfo();
1686 if (RI.isSGPRClass(RC)) {
1687 MFI->setHasSpilledSGPRs();
1688 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1689 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1690 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1691
1692 // We are only allowed to create one new instruction when spilling
1693 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1694 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1695
1696 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1697 // need to make sure we are using the correct register class.
1698 if (SrcReg.isVirtual() && SpillSize == 4) {
1699 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1700 }
1701
1702 BuildMI(MBB, MI, DL, OpDesc)
1703 .addReg(SrcReg, getKillRegState(isKill)) // data
1704 .addFrameIndex(FrameIndex) // addr
1705 .addMemOperand(MMO)
1707
1708 if (RI.spillSGPRToVGPR())
1709 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1710 return;
1711 }
1712
1713 unsigned Opcode =
1714 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1715 MFI->setHasSpilledVGPRs();
1716
1717 BuildMI(MBB, MI, DL, get(Opcode))
1718 .addReg(SrcReg, getKillRegState(isKill)) // data
1719 .addFrameIndex(FrameIndex) // addr
1720 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1721 .addImm(0) // offset
1722 .addMemOperand(MMO);
1723}
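// For illustration (not verbatim MIR from this file): spilling a killed 64-bit
// VGPR pair to frame index 0 produces a pseudo roughly of the form
//   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0
// with a store memoperand, which is later lowered to real scratch/buffer
// stores when frame indices are eliminated.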
1724
1725static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1726 switch (Size) {
1727 case 4:
1728 return AMDGPU::SI_SPILL_S32_RESTORE;
1729 case 8:
1730 return AMDGPU::SI_SPILL_S64_RESTORE;
1731 case 12:
1732 return AMDGPU::SI_SPILL_S96_RESTORE;
1733 case 16:
1734 return AMDGPU::SI_SPILL_S128_RESTORE;
1735 case 20:
1736 return AMDGPU::SI_SPILL_S160_RESTORE;
1737 case 24:
1738 return AMDGPU::SI_SPILL_S192_RESTORE;
1739 case 28:
1740 return AMDGPU::SI_SPILL_S224_RESTORE;
1741 case 32:
1742 return AMDGPU::SI_SPILL_S256_RESTORE;
1743 case 36:
1744 return AMDGPU::SI_SPILL_S288_RESTORE;
1745 case 40:
1746 return AMDGPU::SI_SPILL_S320_RESTORE;
1747 case 44:
1748 return AMDGPU::SI_SPILL_S352_RESTORE;
1749 case 48:
1750 return AMDGPU::SI_SPILL_S384_RESTORE;
1751 case 64:
1752 return AMDGPU::SI_SPILL_S512_RESTORE;
1753 case 128:
1754 return AMDGPU::SI_SPILL_S1024_RESTORE;
1755 default:
1756 llvm_unreachable("unknown register size");
1757 }
1758}
1759
1760static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1761 switch (Size) {
1762 case 2:
1763 return AMDGPU::SI_SPILL_V16_RESTORE;
1764 case 4:
1765 return AMDGPU::SI_SPILL_V32_RESTORE;
1766 case 8:
1767 return AMDGPU::SI_SPILL_V64_RESTORE;
1768 case 12:
1769 return AMDGPU::SI_SPILL_V96_RESTORE;
1770 case 16:
1771 return AMDGPU::SI_SPILL_V128_RESTORE;
1772 case 20:
1773 return AMDGPU::SI_SPILL_V160_RESTORE;
1774 case 24:
1775 return AMDGPU::SI_SPILL_V192_RESTORE;
1776 case 28:
1777 return AMDGPU::SI_SPILL_V224_RESTORE;
1778 case 32:
1779 return AMDGPU::SI_SPILL_V256_RESTORE;
1780 case 36:
1781 return AMDGPU::SI_SPILL_V288_RESTORE;
1782 case 40:
1783 return AMDGPU::SI_SPILL_V320_RESTORE;
1784 case 44:
1785 return AMDGPU::SI_SPILL_V352_RESTORE;
1786 case 48:
1787 return AMDGPU::SI_SPILL_V384_RESTORE;
1788 case 64:
1789 return AMDGPU::SI_SPILL_V512_RESTORE;
1790 case 128:
1791 return AMDGPU::SI_SPILL_V1024_RESTORE;
1792 default:
1793 llvm_unreachable("unknown register size");
1794 }
1795}
1796
1797static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1798 switch (Size) {
1799 case 4:
1800 return AMDGPU::SI_SPILL_AV32_RESTORE;
1801 case 8:
1802 return AMDGPU::SI_SPILL_AV64_RESTORE;
1803 case 12:
1804 return AMDGPU::SI_SPILL_AV96_RESTORE;
1805 case 16:
1806 return AMDGPU::SI_SPILL_AV128_RESTORE;
1807 case 20:
1808 return AMDGPU::SI_SPILL_AV160_RESTORE;
1809 case 24:
1810 return AMDGPU::SI_SPILL_AV192_RESTORE;
1811 case 28:
1812 return AMDGPU::SI_SPILL_AV224_RESTORE;
1813 case 32:
1814 return AMDGPU::SI_SPILL_AV256_RESTORE;
1815 case 36:
1816 return AMDGPU::SI_SPILL_AV288_RESTORE;
1817 case 40:
1818 return AMDGPU::SI_SPILL_AV320_RESTORE;
1819 case 44:
1820 return AMDGPU::SI_SPILL_AV352_RESTORE;
1821 case 48:
1822 return AMDGPU::SI_SPILL_AV384_RESTORE;
1823 case 64:
1824 return AMDGPU::SI_SPILL_AV512_RESTORE;
1825 case 128:
1826 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1827 default:
1828 llvm_unreachable("unknown register size");
1829 }
1830}
1831
1832static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1833 bool IsVectorSuperClass) {
 1834 // Currently, only 32-bit WWM register spills are needed.
1835 if (Size != 4)
1836 llvm_unreachable("unknown wwm register spill size");
1837
1838 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1839 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1840
1841 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1842}
 1843
 1844 unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
 1845 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1846 const SIMachineFunctionInfo &MFI) const {
1847 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1848
 1849 // Choose the right opcode if restoring a WWM register.
 1850 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
 1851 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1852
1853 // TODO: Check if AGPRs are available
 1854 if (ST.hasMAIInsts())
 1855 return getAVSpillRestoreOpcode(Size);
 1856
 1857 assert(!RI.isAGPRClass(RC));
 1858 return getVGPRSpillRestoreOpcode(Size);
 1859}
1860
1863 Register DestReg, int FrameIndex,
1864 const TargetRegisterClass *RC,
1865 const TargetRegisterInfo *TRI,
1866 Register VReg,
1867 MachineInstr::MIFlag Flags) const {
1868 MachineFunction *MF = MBB.getParent();
1870 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1871 const DebugLoc &DL = MBB.findDebugLoc(MI);
1872 unsigned SpillSize = TRI->getSpillSize(*RC);
1873
1874 MachinePointerInfo PtrInfo
1875 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1876
1878 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1879 FrameInfo.getObjectAlign(FrameIndex));
1880
1881 if (RI.isSGPRClass(RC)) {
1882 MFI->setHasSpilledSGPRs();
1883 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1884 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1885 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1886
1887 // FIXME: Maybe this should not include a memoperand because it will be
1888 // lowered to non-memory instructions.
1889 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1890 if (DestReg.isVirtual() && SpillSize == 4) {
1892 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1893 }
1894
1895 if (RI.spillSGPRToVGPR())
1896 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1897 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1898 .addFrameIndex(FrameIndex) // addr
 1899 .addMemOperand(MMO)
 1900 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
 1901
1902 return;
1903 }
1904
1905 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1906 SpillSize, *MFI);
1907 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1908 .addFrameIndex(FrameIndex) // vaddr
1909 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1910 .addImm(0) // offset
1911 .addMemOperand(MMO);
1912}
1913
1918
1921 unsigned Quantity) const {
1922 DebugLoc DL = MBB.findDebugLoc(MI);
1923 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1924 while (Quantity > 0) {
1925 unsigned Arg = std::min(Quantity, MaxSNopCount);
1926 Quantity -= Arg;
1927 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1928 }
1929}
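// Worked example: with ST.getSNopBits() == 3, MaxSNopCount is 8, so a request
// for 10 wait states emits "s_nop 7" followed by "s_nop 1" (the S_NOP
// immediate encodes the wait-state count minus one).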
1930
1932 auto *MF = MBB.getParent();
1933 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1934
1935 assert(Info->isEntryFunction());
1936
1937 if (MBB.succ_empty()) {
1938 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1939 if (HasNoTerminator) {
1940 if (Info->returnsVoid()) {
1941 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1942 } else {
1943 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1944 }
1945 }
1946 }
1947}
1948
1952 const DebugLoc &DL) const {
1953 MachineFunction *MF = MBB.getParent();
1954 constexpr unsigned DoorbellIDMask = 0x3ff;
1955 constexpr unsigned ECQueueWaveAbort = 0x400;
1956
1957 MachineBasicBlock *TrapBB = &MBB;
1958 MachineBasicBlock *ContBB = &MBB;
1959 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1960
1961 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1962 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1963 TrapBB = MF->CreateMachineBasicBlock();
1964 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1965 MF->push_back(TrapBB);
1966 MBB.addSuccessor(TrapBB);
1967 }
1968
 1969 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
 1970 // this will be a nop.
1971 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1972 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1973 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1974 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1975 DoorbellReg)
1977 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1978 .addUse(AMDGPU::M0);
1979 Register DoorbellRegMasked =
1980 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1981 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1982 .addUse(DoorbellReg)
1983 .addImm(DoorbellIDMask);
1984 Register SetWaveAbortBit =
1985 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1986 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1987 .addUse(DoorbellRegMasked)
1988 .addImm(ECQueueWaveAbort);
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1990 .addUse(SetWaveAbortBit);
1991 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1994 .addUse(AMDGPU::TTMP2);
1995 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1996 TrapBB->addSuccessor(HaltLoopBB);
1997
1998 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1999 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2000 .addMBB(HaltLoopBB);
2001 MF->push_back(HaltLoopBB);
2002 HaltLoopBB->addSuccessor(HaltLoopBB);
2003
2004 return ContBB;
2005}
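// Resulting control flow: the original block conditionally branches to TrapBB
// while any lane is still active; TrapBB raises the trap, notifies the trap
// handler through the doorbell (with the queue-wave-abort bit set) and then
// branches to HaltLoopBB, which halts and loops on itself. Normal execution
// resumes in the returned ContBB.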
2006
2008 switch (MI.getOpcode()) {
2009 default:
2010 if (MI.isMetaInstruction())
2011 return 0;
2012 return 1; // FIXME: Do wait states equal cycles?
2013
2014 case AMDGPU::S_NOP:
2015 return MI.getOperand(0).getImm() + 1;
2016 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
 2017 // hazard, even if one exists, won't really be visible. Should we handle it?
2018 }
2019}
 2020
 2021 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 2022 MachineBasicBlock &MBB = *MI.getParent();
2023 DebugLoc DL = MBB.findDebugLoc(MI);
2025 switch (MI.getOpcode()) {
2026 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2027 case AMDGPU::S_MOV_B64_term:
2028 // This is only a terminator to get the correct spill code placement during
2029 // register allocation.
2030 MI.setDesc(get(AMDGPU::S_MOV_B64));
2031 break;
2032
2033 case AMDGPU::S_MOV_B32_term:
2034 // This is only a terminator to get the correct spill code placement during
2035 // register allocation.
2036 MI.setDesc(get(AMDGPU::S_MOV_B32));
2037 break;
2038
2039 case AMDGPU::S_XOR_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_XOR_B64));
2043 break;
2044
2045 case AMDGPU::S_XOR_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_XOR_B32));
2049 break;
2050 case AMDGPU::S_OR_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_OR_B64));
2054 break;
2055 case AMDGPU::S_OR_B32_term:
2056 // This is only a terminator to get the correct spill code placement during
2057 // register allocation.
2058 MI.setDesc(get(AMDGPU::S_OR_B32));
2059 break;
2060
2061 case AMDGPU::S_ANDN2_B64_term:
2062 // This is only a terminator to get the correct spill code placement during
2063 // register allocation.
2064 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2065 break;
2066
2067 case AMDGPU::S_ANDN2_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2071 break;
2072
2073 case AMDGPU::S_AND_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_AND_B64));
2077 break;
2078
2079 case AMDGPU::S_AND_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_AND_B32));
2083 break;
2084
2085 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2089 break;
2090
2091 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2095 break;
2096
2097 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2098 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2099 break;
2100
2101 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2102 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2103 break;
2104 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2105 Register Dst = MI.getOperand(0).getReg();
2106 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2107 MI.setDesc(
2108 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2109 break;
2110 }
2111 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2112 Register Dst = MI.getOperand(0).getReg();
2113 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2114 int64_t Imm = MI.getOperand(1).getImm();
2115
2116 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2117 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2118 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2121 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2122 .addImm(SignExtend64<32>(Imm >> 32))
2124 MI.eraseFromParent();
2125 break;
2126 }
2127
2128 [[fallthrough]];
2129 }
2130 case AMDGPU::V_MOV_B64_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2133 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2134
2135 const MachineOperand &SrcOp = MI.getOperand(1);
2136 // FIXME: Will this work for 64-bit floating point immediates?
2137 assert(!SrcOp.isFPImm());
2138 if (ST.hasMovB64()) {
2139 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2140 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2141 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2142 break;
2143 }
2144 if (SrcOp.isImm()) {
2145 APInt Imm(64, SrcOp.getImm());
2146 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2147 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2148 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2149 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2151 .addImm(Lo.getSExtValue())
2153 .addImm(Lo.getSExtValue())
2154 .addImm(0) // op_sel_lo
2155 .addImm(0) // op_sel_hi
2156 .addImm(0) // neg_lo
2157 .addImm(0) // neg_hi
2158 .addImm(0); // clamp
2159 } else {
2160 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2161 .addImm(Lo.getSExtValue())
2163 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2164 .addImm(Hi.getSExtValue())
2166 }
2167 } else {
2168 assert(SrcOp.isReg());
2169 if (ST.hasPkMovB32() &&
2170 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2171 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2172 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2173 .addReg(SrcOp.getReg())
2175 .addReg(SrcOp.getReg())
2176 .addImm(0) // op_sel_lo
2177 .addImm(0) // op_sel_hi
2178 .addImm(0) // neg_lo
2179 .addImm(0) // neg_hi
2180 .addImm(0); // clamp
2181 } else {
2182 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2183 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2185 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2186 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2188 }
2189 }
2190 MI.eraseFromParent();
2191 break;
2192 }
2193 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2195 break;
2196 }
2197 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2198 const MachineOperand &SrcOp = MI.getOperand(1);
2199 assert(!SrcOp.isFPImm());
2200
2201 if (ST.has64BitLiterals()) {
2202 MI.setDesc(get(AMDGPU::S_MOV_B64));
2203 break;
2204 }
2205
2206 APInt Imm(64, SrcOp.getImm());
2207 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2208 MI.setDesc(get(AMDGPU::S_MOV_B64));
2209 break;
2210 }
2211
2212 Register Dst = MI.getOperand(0).getReg();
2213 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2214 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2215
2216 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2217 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2218 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2219 .addImm(Lo.getSExtValue())
2221 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2222 .addImm(Hi.getSExtValue())
2224 MI.eraseFromParent();
2225 break;
2226 }
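// Example for the S_MOV_B64_IMM_PSEUDO case above: 0x0000000100000001 is
// neither a 32-bit value nor an inline constant, so without 64-bit literal
// support it is split into "s_mov_b32 dst.sub0, 1" and "s_mov_b32 dst.sub1, 1".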
2227 case AMDGPU::V_SET_INACTIVE_B32: {
2228 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2229 Register DstReg = MI.getOperand(0).getReg();
2230 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2231 .add(MI.getOperand(3))
2232 .add(MI.getOperand(4))
2233 .add(MI.getOperand(1))
2234 .add(MI.getOperand(2))
2235 .add(MI.getOperand(5));
2236 MI.eraseFromParent();
2237 break;
2238 }
2239 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2240 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2241 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2242 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2247 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2248 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2249 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2250 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2251 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2252 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2253 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2254 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2255 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2256 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2257 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2258 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2259 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2260 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2261 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2262 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2268 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2269
2270 unsigned Opc;
2271 if (RI.hasVGPRs(EltRC)) {
2272 Opc = AMDGPU::V_MOVRELD_B32_e32;
2273 } else {
2274 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2275 : AMDGPU::S_MOVRELD_B32;
2276 }
2277
2278 const MCInstrDesc &OpDesc = get(Opc);
2279 Register VecReg = MI.getOperand(0).getReg();
2280 bool IsUndef = MI.getOperand(1).isUndef();
2281 unsigned SubReg = MI.getOperand(3).getImm();
2282 assert(VecReg == MI.getOperand(1).getReg());
2283
2285 BuildMI(MBB, MI, DL, OpDesc)
2286 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2287 .add(MI.getOperand(2))
2289 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2290
2291 const int ImpDefIdx =
2292 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2293 const int ImpUseIdx = ImpDefIdx + 1;
2294 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2295 MI.eraseFromParent();
2296 break;
2297 }
2298 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2310 assert(ST.useVGPRIndexMode());
2311 Register VecReg = MI.getOperand(0).getReg();
2312 bool IsUndef = MI.getOperand(1).isUndef();
2313 MachineOperand &Idx = MI.getOperand(3);
2314 Register SubReg = MI.getOperand(4).getImm();
2315
2316 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2317 .add(Idx)
2319 SetOn->getOperand(3).setIsUndef();
2320
2321 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2323 BuildMI(MBB, MI, DL, OpDesc)
2324 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2325 .add(MI.getOperand(2))
2327 .addReg(VecReg,
2328 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2329
2330 const int ImpDefIdx =
2331 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2332 const int ImpUseIdx = ImpDefIdx + 1;
2333 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2334
2335 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2336
2337 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2338
2339 MI.eraseFromParent();
2340 break;
2341 }
2342 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2343 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2344 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2345 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2346 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2347 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2348 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2349 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2350 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2351 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2352 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2353 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2354 assert(ST.useVGPRIndexMode());
2355 Register Dst = MI.getOperand(0).getReg();
2356 Register VecReg = MI.getOperand(1).getReg();
2357 bool IsUndef = MI.getOperand(1).isUndef();
2358 Register Idx = MI.getOperand(2).getReg();
2359 Register SubReg = MI.getOperand(3).getImm();
2360
2361 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2362 .addReg(Idx)
2364 SetOn->getOperand(3).setIsUndef();
2365
2366 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2367 .addDef(Dst)
2368 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2369 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2370
2371 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2372
2373 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2374
2375 MI.eraseFromParent();
2376 break;
2377 }
2378 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2379 MachineFunction &MF = *MBB.getParent();
2380 Register Reg = MI.getOperand(0).getReg();
2381 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2382 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2383 MachineOperand OpLo = MI.getOperand(1);
2384 MachineOperand OpHi = MI.getOperand(2);
2385
2386 // Create a bundle so these instructions won't be re-ordered by the
2387 // post-RA scheduler.
2388 MIBundleBuilder Bundler(MBB, MI);
2389 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2390
2391 // What we want here is an offset from the value returned by s_getpc (which
2392 // is the address of the s_add_u32 instruction) to the global variable, but
2393 // since the encoding of $symbol starts 4 bytes after the start of the
2394 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2395 // small. This requires us to add 4 to the global variable offset in order
2396 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2397 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2398 // instruction.
2399
2400 int64_t Adjust = 0;
2401 if (ST.hasGetPCZeroExtension()) {
2402 // Fix up hardware that does not sign-extend the 48-bit PC value by
2403 // inserting: s_sext_i32_i16 reghi, reghi
2404 Bundler.append(
2405 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2406 Adjust += 4;
2407 }
2408
2409 if (OpLo.isGlobal())
2410 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2411 Bundler.append(
2412 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2413
2414 if (OpHi.isGlobal())
2415 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2416 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2417 .addReg(RegHi)
2418 .add(OpHi));
2419
2420 finalizeBundle(MBB, Bundler.begin());
2421
2422 MI.eraseFromParent();
2423 break;
2424 }
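// The bundle built above is therefore (s_getpc_b64 reg; s_add_u32 reg.sub0,
// reg.sub0, @sym+4; s_addc_u32 reg.sub1, reg.sub1, @sym+12), with an extra
// s_sext_i32_i16 on reg.sub1 and the offsets shifted by 4 more bytes on
// subtargets that zero-extend the 48-bit PC.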
2425 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2426 MachineFunction &MF = *MBB.getParent();
2427 Register Reg = MI.getOperand(0).getReg();
2428 MachineOperand Op = MI.getOperand(1);
2429
2430 // Create a bundle so these instructions won't be re-ordered by the
2431 // post-RA scheduler.
2432 MIBundleBuilder Bundler(MBB, MI);
2433 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2434 if (Op.isGlobal())
2435 Op.setOffset(Op.getOffset() + 4);
2436 Bundler.append(
2437 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2438
2439 finalizeBundle(MBB, Bundler.begin());
2440
2441 MI.eraseFromParent();
2442 break;
2443 }
2444 case AMDGPU::ENTER_STRICT_WWM: {
2445 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2446 // Whole Wave Mode is entered.
2447 MI.setDesc(get(LMC.OrSaveExecOpc));
2448 break;
2449 }
2450 case AMDGPU::ENTER_STRICT_WQM: {
2451 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2452 // STRICT_WQM is entered.
2453 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2454 .addReg(LMC.ExecReg);
2455 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2456
2457 MI.eraseFromParent();
2458 break;
2459 }
2460 case AMDGPU::EXIT_STRICT_WWM:
2461 case AMDGPU::EXIT_STRICT_WQM: {
2462 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
 2463 // WWM/STRICT_WQM is exited.
2464 MI.setDesc(get(LMC.MovOpc));
2465 break;
2466 }
2467 case AMDGPU::SI_RETURN: {
2468 const MachineFunction *MF = MBB.getParent();
2469 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2470 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2471 // Hiding the return address use with SI_RETURN may lead to extra kills in
2472 // the function and missing live-ins. We are fine in practice because callee
2473 // saved register handling ensures the register value is restored before
2474 // RET, but we need the undef flag here to appease the MachineVerifier
2475 // liveness checks.
2477 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2478 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2479
2480 MIB.copyImplicitOps(MI);
2481 MI.eraseFromParent();
2482 break;
2483 }
2484
2485 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2486 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2487 MI.setDesc(get(AMDGPU::S_MUL_U64));
2488 break;
2489
2490 case AMDGPU::S_GETPC_B64_pseudo:
2491 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2492 if (ST.hasGetPCZeroExtension()) {
2493 Register Dst = MI.getOperand(0).getReg();
2494 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2495 // Fix up hardware that does not sign-extend the 48-bit PC value by
2496 // inserting: s_sext_i32_i16 dsthi, dsthi
2497 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2498 DstHi)
2499 .addReg(DstHi);
2500 }
2501 break;
2502
2503 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2504 assert(ST.hasBF16PackedInsts());
2505 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2506 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2507 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2508 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2509 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2510 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2511 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2512 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2513 break;
2514 }
2515
2516 return true;
2517}
2518
2521 unsigned SubIdx, const MachineInstr &Orig,
2522 const TargetRegisterInfo &RI) const {
2523
2524 // Try shrinking the instruction to remat only the part needed for current
2525 // context.
2526 // TODO: Handle more cases.
2527 unsigned Opcode = Orig.getOpcode();
2528 switch (Opcode) {
2529 case AMDGPU::S_LOAD_DWORDX16_IMM:
2530 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2531 if (SubIdx != 0)
2532 break;
2533
2534 if (I == MBB.end())
2535 break;
2536
2537 if (I->isBundled())
2538 break;
2539
2540 // Look for a single use of the register that is also a subreg.
2541 Register RegToFind = Orig.getOperand(0).getReg();
2542 MachineOperand *UseMO = nullptr;
2543 for (auto &CandMO : I->operands()) {
2544 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2545 continue;
2546 if (UseMO) {
2547 UseMO = nullptr;
2548 break;
2549 }
2550 UseMO = &CandMO;
2551 }
2552 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2553 break;
2554
2555 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2556 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2557
2558 MachineFunction *MF = MBB.getParent();
2560 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2561
2562 unsigned NewOpcode = -1;
2563 if (SubregSize == 256)
2564 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2565 else if (SubregSize == 128)
2566 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2567 else
2568 break;
2569
2570 const MCInstrDesc &TID = get(NewOpcode);
2571 const TargetRegisterClass *NewRC =
2572 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2573 MRI.setRegClass(DestReg, NewRC);
2574
2575 UseMO->setReg(DestReg);
2576 UseMO->setSubReg(AMDGPU::NoSubRegister);
2577
2578 // Use a smaller load with the desired size, possibly with updated offset.
2579 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2580 MI->setDesc(TID);
2581 MI->getOperand(0).setReg(DestReg);
2582 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2583 if (Offset) {
2584 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2585 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2586 OffsetMO->setImm(FinalOffset);
2587 }
2589 for (const MachineMemOperand *MemOp : Orig.memoperands())
2590 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2591 SubregSize / 8));
2592 MI->setMemRefs(*MF, NewMMOs);
2593
2594 MBB.insert(I, MI);
2595 return;
2596 }
2597
2598 default:
2599 break;
2600 }
2601
2602 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2603}
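// Example of the shrink-on-remat path above: if the single user of an
// S_LOAD_DWORDX16_IMM result reads only sub4_sub5_sub6_sub7 (a 128-bit subreg
// starting 16 bytes in), the clone becomes an S_LOAD_DWORDX4_IMM with its
// immediate offset increased by 16 and the user rewritten to read the narrower
// destination register directly.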
2604
2605std::pair<MachineInstr*, MachineInstr*>
2607 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2608
2609 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2611 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2612 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2613 return std::pair(&MI, nullptr);
2614 }
2615
2616 MachineBasicBlock &MBB = *MI.getParent();
2617 DebugLoc DL = MBB.findDebugLoc(MI);
2618 MachineFunction *MF = MBB.getParent();
2620 Register Dst = MI.getOperand(0).getReg();
2621 unsigned Part = 0;
2622 MachineInstr *Split[2];
2623
2624 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2625 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2626 if (Dst.isPhysical()) {
2627 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2628 } else {
2629 assert(MRI.isSSA());
2630 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2631 MovDPP.addDef(Tmp);
2632 }
2633
2634 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2635 const MachineOperand &SrcOp = MI.getOperand(I);
2636 assert(!SrcOp.isFPImm());
2637 if (SrcOp.isImm()) {
2638 APInt Imm(64, SrcOp.getImm());
2639 Imm.ashrInPlace(Part * 32);
2640 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2641 } else {
2642 assert(SrcOp.isReg());
2643 Register Src = SrcOp.getReg();
2644 if (Src.isPhysical())
2645 MovDPP.addReg(RI.getSubReg(Src, Sub));
2646 else
2647 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2648 }
2649 }
2650
2651 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2652 MovDPP.addImm(MO.getImm());
2653
2654 Split[Part] = MovDPP;
2655 ++Part;
2656 }
2657
2658 if (Dst.isVirtual())
2659 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2660 .addReg(Split[0]->getOperand(0).getReg())
2661 .addImm(AMDGPU::sub0)
2662 .addReg(Split[1]->getOperand(0).getReg())
2663 .addImm(AMDGPU::sub1);
2664
2665 MI.eraseFromParent();
2666 return std::pair(Split[0], Split[1]);
2667}
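// In short: a 64-bit DPP mov that cannot be expressed as a single
// V_MOV_B64_dpp is split into two V_MOV_B32_dpp instructions operating on
// sub0/sub1, and for virtual destinations the halves are recombined with a
// REG_SEQUENCE.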
2668
2669std::optional<DestSourcePair>
2671 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2672 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2673
2674 return std::nullopt;
2675}
2676
2678 AMDGPU::OpName Src0OpName,
2679 MachineOperand &Src1,
2680 AMDGPU::OpName Src1OpName) const {
2681 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2682 if (!Src0Mods)
2683 return false;
2684
2685 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2686 assert(Src1Mods &&
2687 "All commutable instructions have both src0 and src1 modifiers");
2688
2689 int Src0ModsVal = Src0Mods->getImm();
2690 int Src1ModsVal = Src1Mods->getImm();
2691
2692 Src1Mods->setImm(Src0ModsVal);
2693 Src0Mods->setImm(Src1ModsVal);
2694 return true;
2695}
2696
2698 MachineOperand &RegOp,
2699 MachineOperand &NonRegOp) {
2700 Register Reg = RegOp.getReg();
2701 unsigned SubReg = RegOp.getSubReg();
2702 bool IsKill = RegOp.isKill();
2703 bool IsDead = RegOp.isDead();
2704 bool IsUndef = RegOp.isUndef();
2705 bool IsDebug = RegOp.isDebug();
2706
2707 if (NonRegOp.isImm())
2708 RegOp.ChangeToImmediate(NonRegOp.getImm());
2709 else if (NonRegOp.isFI())
2710 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2711 else if (NonRegOp.isGlobal()) {
2712 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2713 NonRegOp.getTargetFlags());
2714 } else
2715 return nullptr;
2716
2717 // Make sure we don't reinterpret a subreg index in the target flags.
2718 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2719
2720 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2721 NonRegOp.setSubReg(SubReg);
2722
2723 return &MI;
2724}
2725
2727 MachineOperand &NonRegOp1,
2728 MachineOperand &NonRegOp2) {
2729 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2730 int64_t NonRegVal = NonRegOp1.getImm();
2731
2732 NonRegOp1.setImm(NonRegOp2.getImm());
2733 NonRegOp2.setImm(NonRegVal);
2734 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2735 NonRegOp2.setTargetFlags(TargetFlags);
2736 return &MI;
2737}
2738
2739bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2740 unsigned OpIdx1) const {
2741 const MCInstrDesc &InstDesc = MI.getDesc();
2742 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2743 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2744
2745 unsigned Opc = MI.getOpcode();
2746 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2747
2748 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2749 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2750
 2751 // The swap must not breach the constant bus or literal limits.
 2752 // It may move a literal to a position other than src0, which is not allowed
 2753 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
 2754 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2755 if (isVALU(MI)) {
2756 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2757 !isInlineConstant(MO0, OpInfo1))
2758 return false;
2759 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2760 !isInlineConstant(MO1, OpInfo0))
2761 return false;
2762 }
2763
2764 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2765 if (OpInfo1.RegClass == -1)
2766 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2767 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2768 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2769 }
2770 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2771 if (OpInfo0.RegClass == -1)
2772 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2773 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2774 isLegalRegOperand(MI, OpIdx0, MO1);
2775 }
2776
2777 // No need to check 64-bit literals since swapping does not bring new
 2778 // 64-bit literals into the current instruction to fold to 32 bits.
2779
2780 return isImmOperandLegal(MI, OpIdx1, MO0);
2781}
2782
2784 unsigned Src0Idx,
2785 unsigned Src1Idx) const {
2786 assert(!NewMI && "this should never be used");
2787
2788 unsigned Opc = MI.getOpcode();
2789 int CommutedOpcode = commuteOpcode(Opc);
2790 if (CommutedOpcode == -1)
2791 return nullptr;
2792
2793 if (Src0Idx > Src1Idx)
2794 std::swap(Src0Idx, Src1Idx);
2795
2796 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2797 static_cast<int>(Src0Idx) &&
2798 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2799 static_cast<int>(Src1Idx) &&
2800 "inconsistency with findCommutedOpIndices");
2801
2802 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2803 return nullptr;
2804
2805 MachineInstr *CommutedMI = nullptr;
2806 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2807 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2808 if (Src0.isReg() && Src1.isReg()) {
2809 // Be sure to copy the source modifiers to the right place.
2810 CommutedMI =
2811 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2812 } else if (Src0.isReg() && !Src1.isReg()) {
2813 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2814 } else if (!Src0.isReg() && Src1.isReg()) {
2815 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2816 } else if (Src0.isImm() && Src1.isImm()) {
2817 CommutedMI = swapImmOperands(MI, Src0, Src1);
2818 } else {
2819 // FIXME: Found two non registers to commute. This does happen.
2820 return nullptr;
2821 }
2822
2823 if (CommutedMI) {
2824 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2825 Src1, AMDGPU::OpName::src1_modifiers);
2826
2827 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2828 AMDGPU::OpName::src1_sel);
2829
2830 CommutedMI->setDesc(get(CommutedOpcode));
2831 }
2832
2833 return CommutedMI;
2834}
2835
2836// This needs to be implemented because the source modifiers may be inserted
2837// between the true commutable operands, and the base
2838// TargetInstrInfo::commuteInstruction uses it.
2840 unsigned &SrcOpIdx0,
2841 unsigned &SrcOpIdx1) const {
2842 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2843}
2844
2846 unsigned &SrcOpIdx0,
2847 unsigned &SrcOpIdx1) const {
2848 if (!Desc.isCommutable())
2849 return false;
2850
2851 unsigned Opc = Desc.getOpcode();
2852 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2853 if (Src0Idx == -1)
2854 return false;
2855
2856 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2857 if (Src1Idx == -1)
2858 return false;
2859
2860 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2861}
2862
2864 int64_t BrOffset) const {
2865 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2866 // because its dest block is unanalyzable.
2867 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2868
2869 // Convert to dwords.
2870 BrOffset /= 4;
2871
2872 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2873 // from the next instruction.
2874 BrOffset -= 1;
2875
2876 return isIntN(BranchOffsetBits, BrOffset);
2877}
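// Worked example with the default amdgpu-s-branch-bits=16: a forward branch of
// 131072 bytes is 32768 dwords; after subtracting one dword for the implicit
// PC+4, the encoded SIMM16 is 32767, which still fits, whereas 131076 bytes
// would not.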
2878
2881 return MI.getOperand(0).getMBB();
2882}
2883
2885 for (const MachineInstr &MI : MBB->terminators()) {
2886 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2887 MI.getOpcode() == AMDGPU::SI_LOOP)
2888 return true;
2889 }
2890 return false;
2891}
2892
2894 MachineBasicBlock &DestBB,
2895 MachineBasicBlock &RestoreBB,
2896 const DebugLoc &DL, int64_t BrOffset,
2897 RegScavenger *RS) const {
2898 assert(MBB.empty() &&
2899 "new block should be inserted for expanding unconditional branch");
2900 assert(MBB.pred_size() == 1);
2901 assert(RestoreBB.empty() &&
2902 "restore block should be inserted for restoring clobbered registers");
2903
2904 MachineFunction *MF = MBB.getParent();
2907 auto I = MBB.end();
2908 auto &MCCtx = MF->getContext();
2909
2910 if (ST.hasAddPC64Inst()) {
2911 MCSymbol *Offset =
2912 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2913 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2915 MCSymbol *PostAddPCLabel =
2916 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2917 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2918 auto *OffsetExpr = MCBinaryExpr::createSub(
2919 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2920 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2921 Offset->setVariableValue(OffsetExpr);
2922 return;
2923 }
2924
2925 assert(RS && "RegScavenger required for long branching");
2926
2927 // FIXME: Virtual register workaround for RegScavenger not working with empty
2928 // blocks.
2929 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2930
2931 // Note: as this is used after hazard recognizer we need to apply some hazard
2932 // workarounds directly.
2933 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2934 ST.hasVALUReadSGPRHazard();
2935 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2936 if (FlushSGPRWrites)
2937 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2939 };
2940
2941 // We need to compute the offset relative to the instruction immediately after
2942 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2943 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2944 ApplyHazardWorkarounds();
2945
2946 MCSymbol *PostGetPCLabel =
2947 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2948 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2949
2950 MCSymbol *OffsetLo =
2951 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2952 MCSymbol *OffsetHi =
2953 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2954 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2955 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2956 .addReg(PCReg, 0, AMDGPU::sub0)
2957 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2958 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2959 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2960 .addReg(PCReg, 0, AMDGPU::sub1)
2961 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2962 ApplyHazardWorkarounds();
2963
2964 // Insert the indirect branch after the other terminator.
2965 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2966 .addReg(PCReg);
2967
2968 // If a spill is needed for the pc register pair, we need to insert a spill
2969 // restore block right before the destination block, and insert a short branch
2970 // into the old destination block's fallthrough predecessor.
2971 // e.g.:
2972 //
2973 // s_cbranch_scc0 skip_long_branch:
2974 //
2975 // long_branch_bb:
2976 // spill s[8:9]
2977 // s_getpc_b64 s[8:9]
2978 // s_add_u32 s8, s8, restore_bb
2979 // s_addc_u32 s9, s9, 0
2980 // s_setpc_b64 s[8:9]
2981 //
2982 // skip_long_branch:
2983 // foo;
2984 //
2985 // .....
2986 //
2987 // dest_bb_fallthrough_predecessor:
2988 // bar;
2989 // s_branch dest_bb
2990 //
2991 // restore_bb:
2992 // restore s[8:9]
2993 // fallthrough dest_bb
 2994 //
2995 // dest_bb:
2996 // buzz;
2997
2998 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2999 Register Scav;
3000
 3001 // If we've previously reserved a register for long branches,
 3002 // avoid running the scavenger and just use that register.
3003 if (LongBranchReservedReg) {
3004 RS->enterBasicBlock(MBB);
3005 Scav = LongBranchReservedReg;
3006 } else {
3007 RS->enterBasicBlockEnd(MBB);
3008 Scav = RS->scavengeRegisterBackwards(
3009 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3010 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3011 }
3012 if (Scav) {
3013 RS->setRegUsed(Scav);
3014 MRI.replaceRegWith(PCReg, Scav);
3015 MRI.clearVirtRegs();
3016 } else {
3017 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3018 // SGPR spill.
3019 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3020 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3021 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3022 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3023 MRI.clearVirtRegs();
3024 }
3025
3026 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
 3027 // Now the distance can be defined.
3029 MCSymbolRefExpr::create(DestLabel, MCCtx),
3030 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3031 // Add offset assignments.
3032 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3033 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3034 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3035 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3036}
3037
3038unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3039 switch (Cond) {
3040 case SIInstrInfo::SCC_TRUE:
3041 return AMDGPU::S_CBRANCH_SCC1;
3042 case SIInstrInfo::SCC_FALSE:
3043 return AMDGPU::S_CBRANCH_SCC0;
3044 case SIInstrInfo::VCCNZ:
3045 return AMDGPU::S_CBRANCH_VCCNZ;
3046 case SIInstrInfo::VCCZ:
3047 return AMDGPU::S_CBRANCH_VCCZ;
3048 case SIInstrInfo::EXECNZ:
3049 return AMDGPU::S_CBRANCH_EXECNZ;
3050 case SIInstrInfo::EXECZ:
3051 return AMDGPU::S_CBRANCH_EXECZ;
3052 default:
3053 llvm_unreachable("invalid branch predicate");
3054 }
3055}
3056
3057SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3058 switch (Opcode) {
3059 case AMDGPU::S_CBRANCH_SCC0:
3060 return SCC_FALSE;
3061 case AMDGPU::S_CBRANCH_SCC1:
3062 return SCC_TRUE;
3063 case AMDGPU::S_CBRANCH_VCCNZ:
3064 return VCCNZ;
3065 case AMDGPU::S_CBRANCH_VCCZ:
3066 return VCCZ;
3067 case AMDGPU::S_CBRANCH_EXECNZ:
3068 return EXECNZ;
3069 case AMDGPU::S_CBRANCH_EXECZ:
3070 return EXECZ;
3071 default:
3072 return INVALID_BR;
3073 }
3074}
3075
3079 MachineBasicBlock *&FBB,
3081 bool AllowModify) const {
3082 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3083 // Unconditional Branch
3084 TBB = I->getOperand(0).getMBB();
3085 return false;
3086 }
3087
3088 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3089 if (Pred == INVALID_BR)
3090 return true;
3091
3092 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3093 Cond.push_back(MachineOperand::CreateImm(Pred));
3094 Cond.push_back(I->getOperand(1)); // Save the branch register.
3095
3096 ++I;
3097
3098 if (I == MBB.end()) {
3099 // Conditional branch followed by fall-through.
3100 TBB = CondBB;
3101 return false;
3102 }
3103
3104 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3105 TBB = CondBB;
3106 FBB = I->getOperand(0).getMBB();
3107 return false;
3108 }
3109
3110 return true;
3111}
3112
3114 MachineBasicBlock *&FBB,
3116 bool AllowModify) const {
3117 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3118 auto E = MBB.end();
3119 if (I == E)
3120 return false;
3121
 3122 // Skip over the instructions that are artificial terminators for special
3123 // exec management.
3124 while (I != E && !I->isBranch() && !I->isReturn()) {
3125 switch (I->getOpcode()) {
3126 case AMDGPU::S_MOV_B64_term:
3127 case AMDGPU::S_XOR_B64_term:
3128 case AMDGPU::S_OR_B64_term:
3129 case AMDGPU::S_ANDN2_B64_term:
3130 case AMDGPU::S_AND_B64_term:
3131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3132 case AMDGPU::S_MOV_B32_term:
3133 case AMDGPU::S_XOR_B32_term:
3134 case AMDGPU::S_OR_B32_term:
3135 case AMDGPU::S_ANDN2_B32_term:
3136 case AMDGPU::S_AND_B32_term:
3137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3138 break;
3139 case AMDGPU::SI_IF:
3140 case AMDGPU::SI_ELSE:
3141 case AMDGPU::SI_KILL_I1_TERMINATOR:
3142 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3143 // FIXME: It's messy that these need to be considered here at all.
3144 return true;
3145 default:
3146 llvm_unreachable("unexpected non-branch terminator inst");
3147 }
3148
3149 ++I;
3150 }
3151
3152 if (I == E)
3153 return false;
3154
3155 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3156}
3157
3159 int *BytesRemoved) const {
3160 unsigned Count = 0;
3161 unsigned RemovedSize = 0;
3162 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3163 // Skip over artificial terminators when removing instructions.
3164 if (MI.isBranch() || MI.isReturn()) {
3165 RemovedSize += getInstSizeInBytes(MI);
3166 MI.eraseFromParent();
3167 ++Count;
3168 }
3169 }
3170
3171 if (BytesRemoved)
3172 *BytesRemoved = RemovedSize;
3173
3174 return Count;
3175}
3176
3177// Copy the flags onto the implicit condition register operand.
3179 const MachineOperand &OrigCond) {
3180 CondReg.setIsUndef(OrigCond.isUndef());
3181 CondReg.setIsKill(OrigCond.isKill());
3182}
3183
3186 MachineBasicBlock *FBB,
3188 const DebugLoc &DL,
3189 int *BytesAdded) const {
3190 if (!FBB && Cond.empty()) {
3191 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3192 .addMBB(TBB);
3193 if (BytesAdded)
3194 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3195 return 1;
3196 }
3197
3198 assert(TBB && Cond[0].isImm());
3199
3200 unsigned Opcode
3201 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3202
3203 if (!FBB) {
3204 MachineInstr *CondBr =
3205 BuildMI(&MBB, DL, get(Opcode))
3206 .addMBB(TBB);
3207
3208 // Copy the flags onto the implicit condition register operand.
3209 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3210 fixImplicitOperands(*CondBr);
3211
3212 if (BytesAdded)
3213 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3214 return 1;
3215 }
3216
3217 assert(TBB && FBB);
3218
3219 MachineInstr *CondBr =
3220 BuildMI(&MBB, DL, get(Opcode))
3221 .addMBB(TBB);
3222 fixImplicitOperands(*CondBr);
3223 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3224 .addMBB(FBB);
3225
3226 MachineOperand &CondReg = CondBr->getOperand(1);
3227 CondReg.setIsUndef(Cond[1].isUndef());
3228 CondReg.setIsKill(Cond[1].isKill());
3229
3230 if (BytesAdded)
3231 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3232
3233 return 2;
3234}
3235
3238 if (Cond.size() != 2) {
3239 return true;
3240 }
3241
3242 if (Cond[0].isImm()) {
3243 Cond[0].setImm(-Cond[0].getImm());
3244 return false;
3245 }
3246
3247 return true;
3248}
3249
3252 Register DstReg, Register TrueReg,
3253 Register FalseReg, int &CondCycles,
3254 int &TrueCycles, int &FalseCycles) const {
3255 switch (Cond[0].getImm()) {
3256 case VCCNZ:
3257 case VCCZ: {
3258 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3259 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3260 if (MRI.getRegClass(FalseReg) != RC)
3261 return false;
3262
3263 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3264 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3265
3266 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3267 return RI.hasVGPRs(RC) && NumInsts <= 6;
3268 }
3269 case SCC_TRUE:
3270 case SCC_FALSE: {
3271 // FIXME: We could insert for VGPRs if we could replace the original compare
3272 // with a vector one.
3273 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3274 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3275 if (MRI.getRegClass(FalseReg) != RC)
3276 return false;
3277
3278 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3279
 3280 // Register sizes that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
3281 if (NumInsts % 2 == 0)
3282 NumInsts /= 2;
3283
3284 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3285 return RI.isSGPRClass(RC);
3286 }
3287 default:
3288 return false;
3289 }
3290}
3291
3295 Register TrueReg, Register FalseReg) const {
3296 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3297 if (Pred == VCCZ || Pred == SCC_FALSE) {
3298 Pred = static_cast<BranchPredicate>(-Pred);
3299 std::swap(TrueReg, FalseReg);
3300 }
3301
3302 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3303 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3304 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3305
3306 if (DstSize == 32) {
3308 if (Pred == SCC_TRUE) {
3309 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3310 .addReg(TrueReg)
3311 .addReg(FalseReg);
3312 } else {
3313 // Instruction's operands are backwards from what is expected.
3314 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3315 .addReg(FalseReg)
3316 .addReg(TrueReg);
3317 }
3318
3319 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3320 return;
3321 }
3322
3323 if (DstSize == 64 && Pred == SCC_TRUE) {
3325 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3326 .addReg(TrueReg)
3327 .addReg(FalseReg);
3328
3329 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3330 return;
3331 }
3332
3333 static const int16_t Sub0_15[] = {
3334 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3335 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3336 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3337 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3338 };
3339
3340 static const int16_t Sub0_15_64[] = {
3341 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3342 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3343 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3344 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3345 };
3346
3347 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3348 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3349 const int16_t *SubIndices = Sub0_15;
3350 int NElts = DstSize / 32;
3351
3352 // 64-bit select is only available for SALU.
3353 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3354 if (Pred == SCC_TRUE) {
3355 if (NElts % 2) {
3356 SelOp = AMDGPU::S_CSELECT_B32;
3357 EltRC = &AMDGPU::SGPR_32RegClass;
3358 } else {
3359 SelOp = AMDGPU::S_CSELECT_B64;
3360 EltRC = &AMDGPU::SGPR_64RegClass;
3361 SubIndices = Sub0_15_64;
3362 NElts /= 2;
3363 }
3364 }
3365
3367 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3368
3369 I = MIB->getIterator();
3370
3372 for (int Idx = 0; Idx != NElts; ++Idx) {
3373 Register DstElt = MRI.createVirtualRegister(EltRC);
3374 Regs.push_back(DstElt);
3375
3376 unsigned SubIdx = SubIndices[Idx];
3377
3379 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3380 Select =
3381 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3382 .addReg(FalseReg, 0, SubIdx)
3383 .addReg(TrueReg, 0, SubIdx);
3384 } else {
3385 Select =
3386 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3387 .addReg(TrueReg, 0, SubIdx)
3388 .addReg(FalseReg, 0, SubIdx);
3389 }
3390
3391 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3393
3394 MIB.addReg(DstElt)
3395 .addImm(SubIdx);
3396 }
3397}
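// Summary of the cases above: 32-bit selects use a single s_cselect_b32 or
// v_cndmask_b32, 64-bit SCC selects use s_cselect_b64, and wider values are
// selected per 32-bit element (or per 64-bit element for SALU) and rebuilt
// with a REG_SEQUENCE.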
3398
3400 switch (MI.getOpcode()) {
3401 case AMDGPU::V_MOV_B16_t16_e32:
3402 case AMDGPU::V_MOV_B16_t16_e64:
3403 case AMDGPU::V_MOV_B32_e32:
3404 case AMDGPU::V_MOV_B32_e64:
3405 case AMDGPU::V_MOV_B64_PSEUDO:
3406 case AMDGPU::V_MOV_B64_e32:
3407 case AMDGPU::V_MOV_B64_e64:
3408 case AMDGPU::S_MOV_B32:
3409 case AMDGPU::S_MOV_B64:
3410 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3411 case AMDGPU::COPY:
3412 case AMDGPU::WWM_COPY:
3413 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3414 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3415 case AMDGPU::V_ACCVGPR_MOV_B32:
3416 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3417 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3418 return true;
3419 default:
3420 return false;
3421 }
3422}
3423
3425 switch (MI.getOpcode()) {
3426 case AMDGPU::V_MOV_B16_t16_e32:
3427 case AMDGPU::V_MOV_B16_t16_e64:
3428 return 2;
3429 case AMDGPU::V_MOV_B32_e32:
3430 case AMDGPU::V_MOV_B32_e64:
3431 case AMDGPU::V_MOV_B64_PSEUDO:
3432 case AMDGPU::V_MOV_B64_e32:
3433 case AMDGPU::V_MOV_B64_e64:
3434 case AMDGPU::S_MOV_B32:
3435 case AMDGPU::S_MOV_B64:
3436 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3437 case AMDGPU::COPY:
3438 case AMDGPU::WWM_COPY:
3439 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3440 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3441 case AMDGPU::V_ACCVGPR_MOV_B32:
3442 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3443 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3444 return 1;
3445 default:
3446 llvm_unreachable("MI is not a foldable copy");
3447 }
3448}
3449
3450static constexpr AMDGPU::OpName ModifierOpNames[] = {
3451 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3452 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3453 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3454
3456 unsigned Opc = MI.getOpcode();
3457 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3458 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3459 if (Idx >= 0)
3460 MI.removeOperand(Idx);
3461 }
3462}
3463
3464std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3465 unsigned SubRegIndex) {
3466 switch (SubRegIndex) {
3467 case AMDGPU::NoSubRegister:
3468 return Imm;
3469 case AMDGPU::sub0:
3470 return SignExtend64<32>(Imm);
3471 case AMDGPU::sub1:
3472 return SignExtend64<32>(Imm >> 32);
3473 case AMDGPU::lo16:
3474 return SignExtend64<16>(Imm);
3475 case AMDGPU::hi16:
3476 return SignExtend64<16>(Imm >> 16);
3477 case AMDGPU::sub1_lo16:
3478 return SignExtend64<16>(Imm >> 32);
3479 case AMDGPU::sub1_hi16:
3480 return SignExtend64<16>(Imm >> 48);
3481 default:
3482 return std::nullopt;
3483 }
3484
3485 llvm_unreachable("covered subregister switch");
3486}
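// Worked example for Imm = 0x123456789ABCDEF0: sub0 yields 0xFFFFFFFF9ABCDEF0
// (the low 32 bits sign-extended), sub1 yields 0x0000000012345678, and hi16
// yields 0xFFFFFFFFFFFF9ABC.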
3487
3488static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3489 switch (Opc) {
3490 case AMDGPU::V_MAC_F16_e32:
3491 case AMDGPU::V_MAC_F16_e64:
3492 case AMDGPU::V_MAD_F16_e64:
3493 return AMDGPU::V_MADAK_F16;
3494 case AMDGPU::V_MAC_F32_e32:
3495 case AMDGPU::V_MAC_F32_e64:
3496 case AMDGPU::V_MAD_F32_e64:
3497 return AMDGPU::V_MADAK_F32;
3498 case AMDGPU::V_FMAC_F32_e32:
3499 case AMDGPU::V_FMAC_F32_e64:
3500 case AMDGPU::V_FMA_F32_e64:
3501 return AMDGPU::V_FMAAK_F32;
3502 case AMDGPU::V_FMAC_F16_e32:
3503 case AMDGPU::V_FMAC_F16_e64:
3504 case AMDGPU::V_FMAC_F16_t16_e64:
3505 case AMDGPU::V_FMAC_F16_fake16_e64:
3506 case AMDGPU::V_FMA_F16_e64:
3507 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3508 ? AMDGPU::V_FMAAK_F16_t16
3509 : AMDGPU::V_FMAAK_F16_fake16
3510 : AMDGPU::V_FMAAK_F16;
3511 case AMDGPU::V_FMAC_F64_e32:
3512 case AMDGPU::V_FMAC_F64_e64:
3513 case AMDGPU::V_FMA_F64_e64:
3514 return AMDGPU::V_FMAAK_F64;
3515 default:
3516 llvm_unreachable("invalid instruction");
3517 }
3518}
3519
3520static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3521 switch (Opc) {
3522 case AMDGPU::V_MAC_F16_e32:
3523 case AMDGPU::V_MAC_F16_e64:
3524 case AMDGPU::V_MAD_F16_e64:
3525 return AMDGPU::V_MADMK_F16;
3526 case AMDGPU::V_MAC_F32_e32:
3527 case AMDGPU::V_MAC_F32_e64:
3528 case AMDGPU::V_MAD_F32_e64:
3529 return AMDGPU::V_MADMK_F32;
3530 case AMDGPU::V_FMAC_F32_e32:
3531 case AMDGPU::V_FMAC_F32_e64:
3532 case AMDGPU::V_FMA_F32_e64:
3533 return AMDGPU::V_FMAMK_F32;
3534 case AMDGPU::V_FMAC_F16_e32:
3535 case AMDGPU::V_FMAC_F16_e64:
3536 case AMDGPU::V_FMAC_F16_t16_e64:
3537 case AMDGPU::V_FMAC_F16_fake16_e64:
3538 case AMDGPU::V_FMA_F16_e64:
3539 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3540 ? AMDGPU::V_FMAMK_F16_t16
3541 : AMDGPU::V_FMAMK_F16_fake16
3542 : AMDGPU::V_FMAMK_F16;
3543 case AMDGPU::V_FMAC_F64_e32:
3544 case AMDGPU::V_FMAC_F64_e64:
3545 case AMDGPU::V_FMA_F64_e64:
3546 return AMDGPU::V_FMAMK_F64;
3547 default:
3548 llvm_unreachable("invalid instruction");
3549 }
3550}
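// Roughly, the *AK forms take the literal constant K as the final addend
// (d = s0 * s1 + K), while the *MK forms take it as the multiplier
// (d = s0 * K + s1); foldImmediate() below picks whichever form lets the
// constant operand be folded.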
3551
3552 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3553 Register Reg, MachineRegisterInfo *MRI) const {
3554 int64_t Imm;
3555 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3556 return false;
3557
3558 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3559
3560 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3561
3562 unsigned Opc = UseMI.getOpcode();
3563 if (Opc == AMDGPU::COPY) {
3564 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3565
3566 Register DstReg = UseMI.getOperand(0).getReg();
3567 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3568
3569 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3570
3571 if (HasMultipleUses) {
3572 // TODO: This should fold in more cases with multiple uses, but we need to
3573 // consider more carefully what those uses are.
3574 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3575
3576 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3577 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3578 return false;
3579
3580 // Most of the time folding a 32-bit inline constant is free (though this
3581 // might not be true if we can't later fold it into a real user).
3582 //
3583 // FIXME: This isInlineConstant check is imprecise if
3584 // getConstValDefinedInReg handled the tricky non-mov cases.
3585 if (ImmDefSize == 32 &&
3587 return false;
3588 }
3589
3590 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3591 RI.getSubRegIdxSize(UseSubReg) == 16;
3592
3593 if (Is16Bit) {
3594 if (RI.hasVGPRs(DstRC))
3595 return false; // Do not clobber vgpr_hi16
3596
3597 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3598 return false;
3599 }
3600
3601 MachineFunction *MF = UseMI.getMF();
3602
3603 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3604 MCRegister MovDstPhysReg =
3605 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3606
3607 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3608
3609 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3610 for (unsigned MovOp :
3611 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3612 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3613 const MCInstrDesc &MovDesc = get(MovOp);
3614
3615 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3616 if (Is16Bit) {
3617 // We just need to find a correctly sized register class, so the
3618 // subregister index compatibility doesn't matter since we're statically
3619 // extracting the immediate value.
3620 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3621 if (!MovDstRC)
3622 continue;
3623
3624 if (MovDstPhysReg) {
3625 // FIXME: We probably should not do this. If there is a live value in
3626 // the high half of the register, it will be corrupted.
3627 MovDstPhysReg =
3628 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3629 if (!MovDstPhysReg)
3630 continue;
3631 }
3632 }
3633
3634 // Result class isn't the right size, try the next instruction.
3635 if (MovDstPhysReg) {
3636 if (!MovDstRC->contains(MovDstPhysReg))
3637 return false;
3638 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3639 // TODO: This will be overly conservative in the case of 16-bit virtual
3640 // SGPRs. We could hack up the virtual register uses to use a compatible
3641 // 32-bit class.
3642 continue;
3643 }
3644
3645 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3646
3647 // Ensure the interpreted immediate value is a valid operand in the new
3648 // mov.
3649 //
3650 // FIXME: isImmOperandLegal should have a form that doesn't require an
3651 // existing MachineInstr or MachineOperand
3652 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3653 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3654 break;
3655
3656 NewOpc = MovOp;
3657 break;
3658 }
3659
3660 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3661 return false;
3662
3663 if (Is16Bit) {
3664 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3665 if (MovDstPhysReg)
3666 UseMI.getOperand(0).setReg(MovDstPhysReg);
3667 assert(UseMI.getOperand(1).getReg().isVirtual());
3668 }
3669
3670 const MCInstrDesc &NewMCID = get(NewOpc);
3671 UseMI.setDesc(NewMCID);
3672 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3673 UseMI.addImplicitDefUseOperands(*MF);
3674 return true;
3675 }
3676
3677 if (HasMultipleUses)
3678 return false;
3679
3680 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3681 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3682 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3683 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3684 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3685 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3686 Opc == AMDGPU::V_FMAC_F64_e64) {
3687 // Don't fold if we are using source or output modifiers. The new VOP2
3688 // instructions don't have them.
3689 if (hasAnyModifiersSet(UseMI))
3690 return false;
3691
3692 // If this is a free constant, there's no reason to do this.
3693 // TODO: We could fold this here instead of letting SIFoldOperands do it
3694 // later.
3695 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3696
3697 // Any src operand can be used for the legality check.
3698 if (isInlineConstant(UseMI, Src0Idx, Imm))
3699 return false;
3700
3701 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3702
3703 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3704 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3705
3706 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3707 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3708 (Src1->isReg() && Src1->getReg() == Reg)) {
3709 MachineOperand *RegSrc =
3710 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3711 if (!RegSrc->isReg())
3712 return false;
3713 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3714 ST.getConstantBusLimit(Opc) < 2)
3715 return false;
3716
3717 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3718 return false;
3719
3720 // If src2 is also a literal constant then we have to choose which one to
3721 // fold. In general it is better to choose madak so that the other literal
3722 // can be materialized in an sgpr instead of a vgpr:
3723 // s_mov_b32 s0, literal
3724 // v_madak_f32 v0, s0, v0, literal
3725 // Instead of:
3726 // v_mov_b32 v1, literal
3727 // v_madmk_f32 v0, v0, literal, v1
3728 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3729 if (Def && Def->isMoveImmediate() &&
3730 !isInlineConstant(Def->getOperand(1)))
3731 return false;
3732
3733 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3734 if (pseudoToMCOpcode(NewOpc) == -1)
3735 return false;
3736
3737 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3738 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3739 // restricting their register classes. For now just bail out.
3740 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3741 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3742 return false;
3743
3744 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3745 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3746
3747 // FIXME: This would be a lot easier if we could return a new instruction
3748 // instead of having to modify in place.
3749
3750 Register SrcReg = RegSrc->getReg();
3751 unsigned SrcSubReg = RegSrc->getSubReg();
3752 Src0->setReg(SrcReg);
3753 Src0->setSubReg(SrcSubReg);
3754 Src0->setIsKill(RegSrc->isKill());
3755
3756 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3757 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3758 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3759 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3760 UseMI.untieRegOperand(
3761 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3762
3763 Src1->ChangeToImmediate(*SubRegImm);
3764
3765 removeModOperands(UseMI);
3766 UseMI.setDesc(get(NewOpc));
3767
3768 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3769 if (DeleteDef)
3770 DefMI.eraseFromParent();
3771
3772 return true;
3773 }
3774
3775 // Added part is the constant: Use v_madak_{f16, f32}.
3776 if (Src2->isReg() && Src2->getReg() == Reg) {
3777 if (ST.getConstantBusLimit(Opc) < 2) {
3778 // Not allowed to use constant bus for another operand.
3779 // We can however allow an inline immediate as src0.
3780 bool Src0Inlined = false;
3781 if (Src0->isReg()) {
3782 // Try to inline constant if possible.
3783 // If the Def is a move-immediate and this is its only use,
3784 // we save a VGPR here.
3785 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3786 if (Def && Def->isMoveImmediate() &&
3787 isInlineConstant(Def->getOperand(1)) &&
3788 MRI->hasOneNonDBGUse(Src0->getReg())) {
3789 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3790 Src0Inlined = true;
3791 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3792 RI.isSGPRReg(*MRI, Src0->getReg())) {
3793 return false;
3794 }
3795 // VGPR is okay as Src0 - fallthrough
3796 }
3797
3798 if (Src1->isReg() && !Src0Inlined) {
3799 // We still have one slot for an inlinable constant - try to fill it.
3800 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3801 if (Def && Def->isMoveImmediate() &&
3802 isInlineConstant(Def->getOperand(1)) &&
3803 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3804 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3805 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3806 return false;
3807 // VGPR is okay as Src1 - fallthrough
3808 }
3809 }
3810
3811 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3812 if (pseudoToMCOpcode(NewOpc) == -1)
3813 return false;
3814
3815 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3816 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3817 // restricting their register classes. For now just bail out.
3818 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3819 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3820 return false;
3821
3822 // FIXME: This would be a lot easier if we could return a new instruction
3823 // instead of having to modify in place.
3824
3825 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3826 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3827 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3828 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3829 UseMI.untieRegOperand(
3830 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3831
3832 const std::optional<int64_t> SubRegImm =
3833 extractSubregFromImm(Imm, Src2->getSubReg());
3834
3835 // ChangingToImmediate adds Src2 back to the instruction.
3836 Src2->ChangeToImmediate(*SubRegImm);
3837
3838 // These come before src2.
3839 removeModOperands(UseMI);
3840 UseMI.setDesc(get(NewOpc));
3841 // It might happen that UseMI was commuted, and we now have an SGPR as
3842 // SRC1. If so, an inline constant and an SGPR together would be illegal,
3843 // so legalize the operands.
3844 legalizeOperands(UseMI);
3845
3846 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3847 if (DeleteDef)
3848 DefMI.eraseFromParent();
3849
3850 return true;
3851 }
3852 }
3853
3854 return false;
3855}
3856
3857 static bool
3858 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3859 ArrayRef<const MachineOperand *> BaseOps2) {
3860 if (BaseOps1.size() != BaseOps2.size())
3861 return false;
3862 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3863 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3864 return false;
3865 }
3866 return true;
3867}
3868
3869static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3870 LocationSize WidthB, int OffsetB) {
3871 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3872 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3873 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3874 return LowWidth.hasValue() &&
3875 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3876}
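// For example, a 4-byte access at offset 0 does not overlap an 8-byte access
// at offset 4 (0 + 4 <= 4), but an 8-byte access at offset 0 does overlap a
// 4-byte access at offset 4 (0 + 8 > 4).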
3877
3878bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3879 const MachineInstr &MIb) const {
3880 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3881 int64_t Offset0, Offset1;
3882 LocationSize Dummy0 = LocationSize::precise(0);
3883 LocationSize Dummy1 = LocationSize::precise(0);
3884 bool Offset0IsScalable, Offset1IsScalable;
3885 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3886 Dummy0, &RI) ||
3887 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3888 Dummy1, &RI))
3889 return false;
3890
3891 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3892 return false;
3893
3894 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3895 // FIXME: Handle ds_read2 / ds_write2.
3896 return false;
3897 }
3898 LocationSize Width0 = MIa.memoperands().front()->getSize();
3899 LocationSize Width1 = MIb.memoperands().front()->getSize();
3900 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3901}
3902
3903 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3904 const MachineInstr &MIb) const {
3905 assert(MIa.mayLoadOrStore() &&
3906 "MIa must load from or modify a memory location");
3907 assert(MIb.mayLoadOrStore() &&
3908 "MIb must load from or modify a memory location");
3909
3911 return false;
3912
3913 // XXX - Can we relax this between address spaces?
3914 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3915 return false;
3916
3917 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3918 return false;
3919
3920 // TODO: Should we check the address space from the MachineMemOperand? That
3921 // would allow us to distinguish objects we know don't alias based on the
3922 // underlying address space, even if it was lowered to a different one,
3923 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3924 // buffer.
3925 if (isDS(MIa)) {
3926 if (isDS(MIb))
3927 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3928
3929 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3930 }
3931
3932 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3933 if (isMUBUF(MIb) || isMTBUF(MIb))
3934 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3935
3936 if (isFLAT(MIb))
3937 return isFLATScratch(MIb);
3938
3939 return !isSMRD(MIb);
3940 }
3941
3942 if (isSMRD(MIa)) {
3943 if (isSMRD(MIb))
3944 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3945
3946 if (isFLAT(MIb))
3947 return isFLATScratch(MIb);
3948
3949 return !isMUBUF(MIb) && !isMTBUF(MIb);
3950 }
3951
3952 if (isFLAT(MIa)) {
3953 if (isFLAT(MIb)) {
3954 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3955 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3956 return true;
3957
3958 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3959 }
3960
3961 return false;
3962 }
3963
3964 return false;
3965}
3966
3967 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3968 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3969 if (Reg.isPhysical())
3970 return false;
3971 auto *Def = MRI.getUniqueVRegDef(Reg);
3972 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3973 Imm = Def->getOperand(1).getImm();
3974 if (DefMI)
3975 *DefMI = Def;
3976 return true;
3977 }
3978 return false;
3979}
3980
3981static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3982 MachineInstr **DefMI = nullptr) {
3983 if (!MO->isReg())
3984 return false;
3985 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3986 const MachineRegisterInfo &MRI = MF->getRegInfo();
3987 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3988}
3989
3990 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3991 MachineInstr &NewMI) {
3992 if (LV) {
3993 unsigned NumOps = MI.getNumOperands();
3994 for (unsigned I = 1; I < NumOps; ++I) {
3995 MachineOperand &Op = MI.getOperand(I);
3996 if (Op.isReg() && Op.isKill())
3997 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3998 }
3999 }
4000}
4001
4002static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4003 switch (Opc) {
4004 case AMDGPU::V_MAC_F16_e32:
4005 case AMDGPU::V_MAC_F16_e64:
4006 return AMDGPU::V_MAD_F16_e64;
4007 case AMDGPU::V_MAC_F32_e32:
4008 case AMDGPU::V_MAC_F32_e64:
4009 return AMDGPU::V_MAD_F32_e64;
4010 case AMDGPU::V_MAC_LEGACY_F32_e32:
4011 case AMDGPU::V_MAC_LEGACY_F32_e64:
4012 return AMDGPU::V_MAD_LEGACY_F32_e64;
4013 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4014 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4015 return AMDGPU::V_FMA_LEGACY_F32_e64;
4016 case AMDGPU::V_FMAC_F16_e32:
4017 case AMDGPU::V_FMAC_F16_e64:
4018 case AMDGPU::V_FMAC_F16_t16_e64:
4019 case AMDGPU::V_FMAC_F16_fake16_e64:
4020 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4021 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4022 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4023 : AMDGPU::V_FMA_F16_gfx9_e64;
4024 case AMDGPU::V_FMAC_F32_e32:
4025 case AMDGPU::V_FMAC_F32_e64:
4026 return AMDGPU::V_FMA_F32_e64;
4027 case AMDGPU::V_FMAC_F64_e32:
4028 case AMDGPU::V_FMAC_F64_e64:
4029 return AMDGPU::V_FMA_F64_e64;
4030 default:
4031 llvm_unreachable("invalid instruction");
4032 }
4033}
4034
4035 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4036 LiveVariables *LV,
4037 LiveIntervals *LIS) const {
4038 MachineBasicBlock &MBB = *MI.getParent();
4039 unsigned Opc = MI.getOpcode();
4040
4041 // Handle MFMA.
4042 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4043 if (NewMFMAOpc != -1) {
4044 MachineInstrBuilder MIB =
4045 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4046 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4047 MIB.add(MI.getOperand(I));
4048 updateLiveVariables(LV, MI, *MIB);
4049 if (LIS) {
4050 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4051 // SlotIndex of defs needs to be updated when converting to early-clobber
4052 MachineOperand &Def = MIB->getOperand(0);
4053 if (Def.isEarlyClobber() && Def.isReg() &&
4054 LIS->hasInterval(Def.getReg())) {
4055 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4056 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4057 auto &LI = LIS->getInterval(Def.getReg());
4058 auto UpdateDefIndex = [&](LiveRange &LR) {
4059 auto *S = LR.find(OldIndex);
4060 if (S != LR.end() && S->start == OldIndex) {
4061 assert(S->valno && S->valno->def == OldIndex);
4062 S->start = NewIndex;
4063 S->valno->def = NewIndex;
4064 }
4065 };
4066 UpdateDefIndex(LI);
4067 for (auto &SR : LI.subranges())
4068 UpdateDefIndex(SR);
4069 }
4070 }
4071 return MIB;
4072 }
4073
4074 if (SIInstrInfo::isWMMA(MI)) {
4075 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4076 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4077 .setMIFlags(MI.getFlags());
4078 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4079 MIB->addOperand(MI.getOperand(I));
4080
4081 updateLiveVariables(LV, MI, *MIB);
4082 if (LIS)
4083 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4084
4085 return MIB;
4086 }
4087
4088 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4089 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4090 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4091 "present pre-RA");
4092
4093 // Handle MAC/FMAC.
4094 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4095 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4096 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4097 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4098 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4099 bool Src0Literal = false;
4100
4101 switch (Opc) {
4102 default:
4103 return nullptr;
4104 case AMDGPU::V_MAC_F16_e64:
4105 case AMDGPU::V_FMAC_F16_e64:
4106 case AMDGPU::V_FMAC_F16_t16_e64:
4107 case AMDGPU::V_FMAC_F16_fake16_e64:
4108 case AMDGPU::V_MAC_F32_e64:
4109 case AMDGPU::V_MAC_LEGACY_F32_e64:
4110 case AMDGPU::V_FMAC_F32_e64:
4111 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4112 case AMDGPU::V_FMAC_F64_e64:
4113 break;
4114 case AMDGPU::V_MAC_F16_e32:
4115 case AMDGPU::V_FMAC_F16_e32:
4116 case AMDGPU::V_MAC_F32_e32:
4117 case AMDGPU::V_MAC_LEGACY_F32_e32:
4118 case AMDGPU::V_FMAC_F32_e32:
4119 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4120 case AMDGPU::V_FMAC_F64_e32: {
4121 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4122 AMDGPU::OpName::src0);
4123 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4124 if (!Src0->isReg() && !Src0->isImm())
4125 return nullptr;
4126
4127 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4128 Src0Literal = true;
4129
4130 break;
4131 }
4132 }
4133
4134 MachineInstrBuilder MIB;
4135 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4136 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4137 const MachineOperand *Src0Mods =
4138 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4139 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4140 const MachineOperand *Src1Mods =
4141 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4142 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4143 const MachineOperand *Src2Mods =
4144 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4145 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4146 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4147 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4148
4149 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4150 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4151 // If we have an SGPR input, we will violate the constant bus restriction.
4152 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4153 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4154 MachineInstr *DefMI;
4155 const auto killDef = [&]() -> void {
4156 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4157 // The only user is the instruction which will be killed.
4158 Register DefReg = DefMI->getOperand(0).getReg();
4159
4160 if (MRI.hasOneNonDBGUse(DefReg)) {
4161 // We cannot just remove the DefMI here; the calling pass would crash.
4162 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4163 DefMI->getOperand(0).setIsDead(true);
4164 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4165 DefMI->removeOperand(I);
4166 if (LV)
4167 LV->getVarInfo(DefReg).AliveBlocks.clear();
4168 }
4169
4170 if (LIS) {
4171 LiveInterval &DefLI = LIS->getInterval(DefReg);
4172
4173 // We cannot delete the original instruction here, so hack out the use
4174 // in the original instruction with a dummy register so we can use
4175 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4176 // not have the complexity of deleting a use to consider here.
4177 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4178 for (MachineOperand &MIOp : MI.uses()) {
4179 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4180 MIOp.setIsUndef(true);
4181 MIOp.setReg(DummyReg);
4182 }
4183 }
4184
4185 LIS->shrinkToUses(&DefLI);
4186 }
4187 };
4188
4189 int64_t Imm;
4190 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4191 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4192 if (pseudoToMCOpcode(NewOpc) != -1) {
4193 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4194 .add(*Dst)
4195 .add(*Src0)
4196 .add(*Src1)
4197 .addImm(Imm)
4198 .setMIFlags(MI.getFlags());
4199 updateLiveVariables(LV, MI, *MIB);
4200 if (LIS)
4201 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4202 killDef();
4203 return MIB;
4204 }
4205 }
4206 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4207 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4208 if (pseudoToMCOpcode(NewOpc) != -1) {
4209 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4210 .add(*Dst)
4211 .add(*Src0)
4212 .addImm(Imm)
4213 .add(*Src2)
4214 .setMIFlags(MI.getFlags());
4215 updateLiveVariables(LV, MI, *MIB);
4216
4217 if (LIS)
4218 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4219 killDef();
4220 return MIB;
4221 }
4222 }
4223 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4224 if (Src0Literal) {
4225 Imm = Src0->getImm();
4226 DefMI = nullptr;
4227 }
4228 if (pseudoToMCOpcode(NewOpc) != -1 &&
4229 isOperandLegal(
4230 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4231 Src1)) {
4232 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4233 .add(*Dst)
4234 .add(*Src1)
4235 .addImm(Imm)
4236 .add(*Src2)
4237 .setMIFlags(MI.getFlags());
4238 updateLiveVariables(LV, MI, *MIB);
4239
4240 if (LIS)
4241 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4242 if (DefMI)
4243 killDef();
4244 return MIB;
4245 }
4246 }
4247 }
4248
4249 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4250 // if VOP3 does not allow a literal operand.
4251 if (Src0Literal && !ST.hasVOP3Literal())
4252 return nullptr;
4253
4254 unsigned NewOpc = getNewFMAInst(ST, Opc);
4255
4256 if (pseudoToMCOpcode(NewOpc) == -1)
4257 return nullptr;
4258
4259 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4260 .add(*Dst)
4261 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4262 .add(*Src0)
4263 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4264 .add(*Src1)
4265 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4266 .add(*Src2)
4267 .addImm(Clamp ? Clamp->getImm() : 0)
4268 .addImm(Omod ? Omod->getImm() : 0)
4269 .setMIFlags(MI.getFlags());
4270 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4271 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4272 updateLiveVariables(LV, MI, *MIB);
4273 if (LIS)
4274 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4275 return MIB;
4276}
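// For instance, the two-address form
//   v_fmac_f32 v0, v1, v2        ; v0 is tied as both dst and addend
// is roughly rewritten here to the three-address
//   v_fma_f32 v0, v1, v2, v0
// (or to a v_fmaak/v_fmamk form when one operand is a foldable literal).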
4277
4278// It's not generally safe to move VALU instructions across these since it will
4279// start using the register as a base index rather than directly.
4280// XXX - Why isn't hasSideEffects sufficient for these?
4281 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4282 switch (MI.getOpcode()) {
4283 case AMDGPU::S_SET_GPR_IDX_ON:
4284 case AMDGPU::S_SET_GPR_IDX_MODE:
4285 case AMDGPU::S_SET_GPR_IDX_OFF:
4286 return true;
4287 default:
4288 return false;
4289 }
4290}
4291
4292 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4293 const MachineBasicBlock *MBB,
4294 const MachineFunction &MF) const {
4295 // Skipping the check for SP writes in the base implementation. It was
4296 // apparently added due to compile-time concerns.
4297 //
4298 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4299 // but is probably avoidable.
4300
4301 // Copied from base implementation.
4302 // Terminators and labels can't be scheduled around.
4303 if (MI.isTerminator() || MI.isPosition())
4304 return true;
4305
4306 // INLINEASM_BR can jump to another block
4307 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4308 return true;
4309
4310 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4311 return true;
4312
4313 // Target-independent instructions do not have an implicit-use of EXEC, even
4314 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4315 // boundaries prevents incorrect movements of such instructions.
4316 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4317 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4318 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4319 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4320 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4322}
4323
4324 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4325 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4326 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4327 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4328}
4329
4330 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4331 if (!isFLAT(MI) || isFLATGlobal(MI))
4332 return false;
4333
4334 // If scratch is not initialized, we can never access it.
4335 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4336 return false;
4337
4338 // SCRATCH instructions always access scratch.
4339 if (isFLATScratch(MI))
4340 return true;
4341
4342 // If there are no memory operands then conservatively assume the flat
4343 // operation may access scratch.
4344 if (MI.memoperands_empty())
4345 return true;
4346
4347 // See if any memory operand specifies an address space that involves scratch.
4348 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4349 unsigned AS = Memop->getAddrSpace();
4350 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4351 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4352 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4353 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4354 }
4355 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4356 });
4357}
4358
4359 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4360 assert(isFLAT(MI));
4361
4362 // All flat instructions use the VMEM counter except prefetch.
4363 if (!usesVM_CNT(MI))
4364 return false;
4365
4366 // If there are no memory operands then conservatively assume the flat
4367 // operation may access VMEM.
4368 if (MI.memoperands_empty())
4369 return true;
4370
4371 // See if any memory operand specifies an address space that involves VMEM.
4372 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4373 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4374 // (GDS) address space is not supported by flat operations. Therefore, simply
4375 // return true unless only the LDS address space is found.
4376 for (const MachineMemOperand *Memop : MI.memoperands()) {
4377 unsigned AS = Memop->getAddrSpace();
4379 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4380 return true;
4381 }
4382
4383 return false;
4384}
4385
4386 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4387 assert(isFLAT(MI));
4388
4389 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4390 if (!usesLGKM_CNT(MI))
4391 return false;
4392
4393 // If in tgsplit mode then there can be no use of LDS.
4394 if (ST.isTgSplitEnabled())
4395 return false;
4396
4397 // If there are no memory operands then conservatively assume the flat
4398 // operation may access LDS.
4399 if (MI.memoperands_empty())
4400 return true;
4401
4402 // See if any memory operand specifies an address space that involves LDS.
4403 for (const MachineMemOperand *Memop : MI.memoperands()) {
4404 unsigned AS = Memop->getAddrSpace();
4405 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4406 return true;
4407 }
4408
4409 return false;
4410}
4411
4412 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4413 // Skip the full operand and register alias search that modifiesRegister
4414 // does. There's only a handful of instructions that touch this, it's only an
4415 // implicit def, and doesn't alias any other registers.
4416 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4417}
4418
4419 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4420 unsigned Opcode = MI.getOpcode();
4421
4422 if (MI.mayStore() && isSMRD(MI))
4423 return true; // scalar store or atomic
4424
4425 // This will terminate the function when other lanes may need to continue.
4426 if (MI.isReturn())
4427 return true;
4428
4429 // These instructions cause shader I/O that may cause hardware lockups
4430 // when executed with an empty EXEC mask.
4431 //
4432 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4433 // EXEC = 0, but checking for that case here seems not worth it
4434 // given the typical code patterns.
4435 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4436 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4437 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4438 return true;
4439
4440 if (MI.isCall() || MI.isInlineAsm())
4441 return true; // conservative assumption
4442
4443 // Assume that barrier interactions are only intended with active lanes.
4444 if (isBarrier(Opcode))
4445 return true;
4446
4447 // A mode change is a scalar operation that influences vector instructions.
4448 if (modifiesModeRegister(MI))
4449 return true;
4450
4451 // These are like SALU instructions in terms of effects, so it's questionable
4452 // whether we should return true for those.
4453 //
4454 // However, executing them with EXEC = 0 causes them to operate on undefined
4455 // data, which we avoid by returning true here.
4456 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4457 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4458 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4459 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4460 return true;
4461
4462 return false;
4463}
4464
4465 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4466 const MachineInstr &MI) const {
4467 if (MI.isMetaInstruction())
4468 return false;
4469
4470 // This won't read exec if this is an SGPR->SGPR copy.
4471 if (MI.isCopyLike()) {
4472 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4473 return true;
4474
4475 // Make sure this isn't copying exec as a normal operand
4476 return MI.readsRegister(AMDGPU::EXEC, &RI);
4477 }
4478
4479 // Make a conservative assumption about the callee.
4480 if (MI.isCall())
4481 return true;
4482
4483 // Be conservative with any unhandled generic opcodes.
4484 if (!isTargetSpecificOpcode(MI.getOpcode()))
4485 return true;
4486
4487 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4488}
4489
4490bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4491 switch (Imm.getBitWidth()) {
4492 case 1: // This likely will be a condition code mask.
4493 return true;
4494
4495 case 32:
4496 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4497 ST.hasInv2PiInlineImm());
4498 case 64:
4499 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4500 ST.hasInv2PiInlineImm());
4501 case 16:
4502 return ST.has16BitInsts() &&
4503 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4504 ST.hasInv2PiInlineImm());
4505 default:
4506 llvm_unreachable("invalid bitwidth");
4507 }
4508}
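// E.g. for the 32-bit case, the integers -16..64 and the common FP bit
// patterns 0.5, 1.0, 2.0, 4.0 (and their negations) are inline constants,
// while values such as 65 or 0.3f must be encoded as a literal.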
4509
4510 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4511 APInt IntImm = Imm.bitcastToAPInt();
4512 int64_t IntImmVal = IntImm.getSExtValue();
4513 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4514 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4515 default:
4516 llvm_unreachable("invalid fltSemantics");
4517 case APFloat::S_IEEEsingle:
4518 case APFloat::S_IEEEdouble:
4519 return isInlineConstant(IntImm);
4520 case APFloat::S_BFloat:
4521 return ST.has16BitInsts() &&
4522 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4523 case APFloat::S_IEEEhalf:
4524 return ST.has16BitInsts() &&
4525 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4526 }
4527}
4528
4529bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4530 // MachineOperand provides no way to tell the true operand size, since it only
4531 // records a 64-bit value. We need to know the size to determine if a 32-bit
4532 // floating point immediate bit pattern is legal for an integer immediate. It
4533 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4534 switch (OperandType) {
4544 int32_t Trunc = static_cast<int32_t>(Imm);
4545 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4546 }
4552 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4555 // We would expect inline immediates to not be concerned with an integer/fp
4556 // distinction. However, in the case of 16-bit integer operations, the
4557 // "floating point" values appear to not work. It seems read the low 16-bits
4558 // of 32-bit immediates, which happens to always work for the integer
4559 // values.
4560 //
4561 // See llvm bugzilla 46302.
4562 //
4563 // TODO: Theoretically we could use op-sel to use the high bits of the
4564 // 32-bit FP values.
4576 return false;
4579 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4580 // A few special case instructions have 16-bit operands on subtargets
4581 // where 16-bit instructions are not legal.
4582 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4583 // constants in these cases
4584 int16_t Trunc = static_cast<int16_t>(Imm);
4585 return ST.has16BitInsts() &&
4586 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4587 }
4588
4589 return false;
4590 }
4593 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4594 int16_t Trunc = static_cast<int16_t>(Imm);
4595 return ST.has16BitInsts() &&
4596 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4597 }
4598 return false;
4599 }
4603 return false;
4605 return isLegalAV64PseudoImm(Imm);
4608 // Always embedded in the instruction for free.
4609 return true;
4619 // Just ignore anything else.
4620 return true;
4621 default:
4622 llvm_unreachable("invalid operand type");
4623 }
4624}
4625
4626static bool compareMachineOp(const MachineOperand &Op0,
4627 const MachineOperand &Op1) {
4628 if (Op0.getType() != Op1.getType())
4629 return false;
4630
4631 switch (Op0.getType()) {
4633 return Op0.getReg() == Op1.getReg();
4635 return Op0.getImm() == Op1.getImm();
4636 default:
4637 llvm_unreachable("Didn't expect to be comparing these operand types");
4638 }
4639}
4640
4642 const MCOperandInfo &OpInfo) const {
4643 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4644 return true;
4645
4646 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4647 return false;
4648
4649 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4650 return true;
4651
4652 return ST.hasVOP3Literal();
4653}
4654
4655bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4656 int64_t ImmVal) const {
4657 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4658 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4659 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4660 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4661 AMDGPU::OpName::src2))
4662 return false;
4663 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4664 }
4665
4666 return isLiteralOperandLegal(InstDesc, OpInfo);
4667}
4668
4669bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4670 const MachineOperand &MO) const {
4671 if (MO.isImm())
4672 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4673
4674 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4675 "unexpected imm-like operand kind");
4676 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4677 return isLiteralOperandLegal(InstDesc, OpInfo);
4678}
4679
4681 // 2 32-bit inline constants packed into one.
4682 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4683 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4684}
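// E.g. 0x0000004000000001 (64 in the high half, 1 in the low half) is legal
// because both 32-bit halves are inline constants, while a value with a
// non-inlinable half (say 0x12345678 in either half) is not.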
4685
4686bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4687 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4688 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4689 return false;
4690
4691 int Op32 = AMDGPU::getVOPe32(Opcode);
4692 if (Op32 == -1)
4693 return false;
4694
4695 return pseudoToMCOpcode(Op32) != -1;
4696}
4697
4698bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4699 // The src0_modifier operand is present on all instructions
4700 // that have modifiers.
4701
4702 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4703}
4704
4705 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4706 AMDGPU::OpName OpName) const {
4707 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4708 return Mods && Mods->getImm();
4709}
4710
4711 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4712 return any_of(ModifierOpNames,
4713 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4714}
4715
4716 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4717 const MachineRegisterInfo &MRI) const {
4718 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4719 // Can't shrink instruction with three operands.
4720 if (Src2) {
4721 switch (MI.getOpcode()) {
4722 default: return false;
4723
4724 case AMDGPU::V_ADDC_U32_e64:
4725 case AMDGPU::V_SUBB_U32_e64:
4726 case AMDGPU::V_SUBBREV_U32_e64: {
4727 const MachineOperand *Src1
4728 = getNamedOperand(MI, AMDGPU::OpName::src1);
4729 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4730 return false;
4731 // Additional verification is needed for sdst/src2.
4732 return true;
4733 }
4734 case AMDGPU::V_MAC_F16_e64:
4735 case AMDGPU::V_MAC_F32_e64:
4736 case AMDGPU::V_MAC_LEGACY_F32_e64:
4737 case AMDGPU::V_FMAC_F16_e64:
4738 case AMDGPU::V_FMAC_F16_t16_e64:
4739 case AMDGPU::V_FMAC_F16_fake16_e64:
4740 case AMDGPU::V_FMAC_F32_e64:
4741 case AMDGPU::V_FMAC_F64_e64:
4742 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4743 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4744 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4745 return false;
4746 break;
4747
4748 case AMDGPU::V_CNDMASK_B32_e64:
4749 break;
4750 }
4751 }
4752
4753 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4754 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4755 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4756 return false;
4757
4758 // We don't need to check src0, all input types are legal, so just make sure
4759 // src0 isn't using any modifiers.
4760 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4761 return false;
4762
4763 // Can it be shrunk to a valid 32 bit opcode?
4764 if (!hasVALU32BitEncoding(MI.getOpcode()))
4765 return false;
4766
4767 // Check output modifiers
4768 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4769 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4770 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4771 // TODO: Can we avoid checking bound_ctrl/fi here?
4772 // They are only used by permlane*_swap special case.
4773 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4774 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4775}
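// For instance, a v_add_f32_e64 with two VGPR sources and no clamp/omod can
// be shrunk to v_add_f32_e32, while the same instruction with an output
// modifier set must stay in the VOP3 encoding.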
4776
4777// Set VCC operand with all flags from \p Orig, except for setting it as
4778// implicit.
4779 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4780 const MachineOperand &Orig) {
4781
4782 for (MachineOperand &Use : MI.implicit_operands()) {
4783 if (Use.isUse() &&
4784 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4785 Use.setIsUndef(Orig.isUndef());
4786 Use.setIsKill(Orig.isKill());
4787 return;
4788 }
4789 }
4790}
4791
4792 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4793 unsigned Op32) const {
4794 MachineBasicBlock *MBB = MI.getParent();
4795
4796 const MCInstrDesc &Op32Desc = get(Op32);
4797 MachineInstrBuilder Inst32 =
4798 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4799 .setMIFlags(MI.getFlags());
4800
4801 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4802 // For VOPC instructions, this is replaced by an implicit def of vcc.
4803
4804 // We assume the defs of the shrunk opcode are in the same order, and the
4805 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4806 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4807 Inst32.add(MI.getOperand(I));
4808
4809 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4810
4811 int Idx = MI.getNumExplicitDefs();
4812 for (const MachineOperand &Use : MI.explicit_uses()) {
4813 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4815 continue;
4816
4817 if (&Use == Src2) {
4818 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4819 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4820 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4821 // of vcc was already added during the initial BuildMI, but we
4822 // 1) may need to change vcc to vcc_lo to preserve the original register
4823 // 2) have to preserve the original flags.
4824 copyFlagsToImplicitVCC(*Inst32, *Src2);
4825 continue;
4826 }
4827 }
4828
4829 Inst32.add(Use);
4830 }
4831
4832 // FIXME: Losing implicit operands
4833 fixImplicitOperands(*Inst32);
4834 return Inst32;
4835}
4836
4837 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4838 // Null is free
4839 Register Reg = RegOp.getReg();
4840 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4841 return false;
4842
4843 // SGPRs use the constant bus
4844
4845 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4846 // physical register operands should also count, except for exec.
4847 if (RegOp.isImplicit())
4848 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4849
4850 // SGPRs use the constant bus
4851 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4852 AMDGPU::SReg_64RegClass.contains(Reg);
4853}
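// E.g. an explicit M0 or VCC operand counts against the constant bus limit,
// while SGPR_NULL (and EXEC as an implicit operand) does not.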
4854
4856 const MachineRegisterInfo &MRI) const {
4857 Register Reg = RegOp.getReg();
4858 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4859 : physRegUsesConstantBus(RegOp);
4860}
4861
4862 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4863 const MachineOperand &MO,
4864 const MCOperandInfo &OpInfo) const {
4865 // Literal constants use the constant bus.
4866 if (!MO.isReg())
4867 return !isInlineConstant(MO, OpInfo);
4868
4869 Register Reg = MO.getReg();
4870 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4871 : physRegUsesConstantBus(MO);
4872 }
4873
4874 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4875 for (const MachineOperand &MO : MI.implicit_operands()) {
4876 // We only care about reads.
4877 if (MO.isDef())
4878 continue;
4879
4880 switch (MO.getReg()) {
4881 case AMDGPU::VCC:
4882 case AMDGPU::VCC_LO:
4883 case AMDGPU::VCC_HI:
4884 case AMDGPU::M0:
4885 case AMDGPU::FLAT_SCR:
4886 return MO.getReg();
4887
4888 default:
4889 break;
4890 }
4891 }
4892
4893 return Register();
4894}
4895
4896static bool shouldReadExec(const MachineInstr &MI) {
4897 if (SIInstrInfo::isVALU(MI)) {
4898 switch (MI.getOpcode()) {
4899 case AMDGPU::V_READLANE_B32:
4900 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4901 case AMDGPU::V_WRITELANE_B32:
4902 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4903 return false;
4904 }
4905
4906 return true;
4907 }
4908
4909 if (MI.isPreISelOpcode() ||
4910 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4911 SIInstrInfo::isSALU(MI) ||
4912 SIInstrInfo::isSMRD(MI))
4913 return false;
4914
4915 return true;
4916}
4917
4918static bool isRegOrFI(const MachineOperand &MO) {
4919 return MO.isReg() || MO.isFI();
4920}
4921
4922static bool isSubRegOf(const SIRegisterInfo &TRI,
4923 const MachineOperand &SuperVec,
4924 const MachineOperand &SubReg) {
4925 if (SubReg.getReg().isPhysical())
4926 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4927
4928 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4929 SubReg.getReg() == SuperVec.getReg();
4930}
4931
4932// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4933bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4934 const MachineRegisterInfo &MRI,
4935 StringRef &ErrInfo) const {
4936 Register DstReg = MI.getOperand(0).getReg();
4937 Register SrcReg = MI.getOperand(1).getReg();
4938 // This is a check for copy from vector register to SGPR
4939 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4940 ErrInfo = "illegal copy from vector register to SGPR";
4941 return false;
4942 }
4943 return true;
4944}
4945
4946 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4947 StringRef &ErrInfo) const {
4948 uint16_t Opcode = MI.getOpcode();
4949 const MachineFunction *MF = MI.getParent()->getParent();
4950 const MachineRegisterInfo &MRI = MF->getRegInfo();
4951
4952 // FIXME: At this point the COPY verification is done only for non-SSA forms.
4953 // Find a better property to recognize the point where instruction selection
4954 // is just done.
4955 // We can only enforce this check after SIFixSGPRCopies pass so that the
4956 // illegal copies are legalized and thereafter we don't expect a pass
4957 // inserting similar copies.
4958 if (!MRI.isSSA() && MI.isCopy())
4959 return verifyCopy(MI, MRI, ErrInfo);
4960
4961 if (SIInstrInfo::isGenericOpcode(Opcode))
4962 return true;
4963
4964 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4965 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4966 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4967 int Src3Idx = -1;
4968 if (Src0Idx == -1) {
4969 // VOPD V_DUAL_* instructions use different operand names.
4970 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4971 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4972 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4973 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4974 }
4975
4976 // Make sure the number of operands is correct.
4977 const MCInstrDesc &Desc = get(Opcode);
4978 if (!Desc.isVariadic() &&
4979 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4980 ErrInfo = "Instruction has wrong number of operands.";
4981 return false;
4982 }
4983
4984 if (MI.isInlineAsm()) {
4985 // Verify register classes for inlineasm constraints.
4986 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4987 I != E; ++I) {
4988 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4989 if (!RC)
4990 continue;
4991
4992 const MachineOperand &Op = MI.getOperand(I);
4993 if (!Op.isReg())
4994 continue;
4995
4996 Register Reg = Op.getReg();
4997 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4998 ErrInfo = "inlineasm operand has incorrect register class.";
4999 return false;
5000 }
5001 }
5002
5003 return true;
5004 }
5005
5006 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5007 ErrInfo = "missing memory operand from image instruction.";
5008 return false;
5009 }
5010
5011 // Make sure the register classes are correct.
5012 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5013 const MachineOperand &MO = MI.getOperand(i);
5014 if (MO.isFPImm()) {
5015 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5016 "all fp values to integers.";
5017 return false;
5018 }
5019
5020 const MCOperandInfo &OpInfo = Desc.operands()[i];
5021 int16_t RegClass = getOpRegClassID(OpInfo);
5022
5023 switch (OpInfo.OperandType) {
5025 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5026 ErrInfo = "Illegal immediate value for operand.";
5027 return false;
5028 }
5029 break;
5042 break;
5044 break;
5045 break;
5059 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5060 ErrInfo = "Illegal immediate value for operand.";
5061 return false;
5062 }
5063 break;
5064 }
5066 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5067 ErrInfo = "Expected inline constant for operand.";
5068 return false;
5069 }
5070 break;
5074 break;
5079 // Check if this operand is an immediate.
5080 // FrameIndex operands will be replaced by immediates, so they are
5081 // allowed.
5082 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5083 ErrInfo = "Expected immediate, but got non-immediate";
5084 return false;
5085 }
5086 break;
5090 break;
5091 default:
5092 if (OpInfo.isGenericType())
5093 continue;
5094 break;
5095 }
5096
5097 if (!MO.isReg())
5098 continue;
5099 Register Reg = MO.getReg();
5100 if (!Reg)
5101 continue;
5102
5103 // FIXME: Ideally we would have separate instruction definitions with the
5104 // aligned register constraint.
5105 // FIXME: We do not verify inline asm operands, but custom inline asm
5106 // verification is broken anyway
5107 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5108 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5109 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5110 if (const TargetRegisterClass *SubRC =
5111 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5112 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5113 if (RC)
5114 RC = SubRC;
5115 }
5116 }
5117
5118 // Check that this is the aligned version of the class.
5119 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5120 ErrInfo = "Subtarget requires even aligned vector registers";
5121 return false;
5122 }
5123 }
5124
5125 if (RegClass != -1) {
5126 if (Reg.isVirtual())
5127 continue;
5128
5129 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5130 if (!RC->contains(Reg)) {
5131 ErrInfo = "Operand has incorrect register class.";
5132 return false;
5133 }
5134 }
5135 }
5136
5137 // Verify SDWA
5138 if (isSDWA(MI)) {
5139 if (!ST.hasSDWA()) {
5140 ErrInfo = "SDWA is not supported on this target";
5141 return false;
5142 }
5143
5144 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5145 AMDGPU::OpName::dst_sel}) {
5146 const MachineOperand *MO = getNamedOperand(MI, Op);
5147 if (!MO)
5148 continue;
5149 int64_t Imm = MO->getImm();
5150 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5151 ErrInfo = "Invalid SDWA selection";
5152 return false;
5153 }
5154 }
5155
5156 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5157
5158 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5159 if (OpIdx == -1)
5160 continue;
5161 const MachineOperand &MO = MI.getOperand(OpIdx);
5162
5163 if (!ST.hasSDWAScalar()) {
5164 // Only VGPRs on VI
5165 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5166 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5167 return false;
5168 }
5169 } else {
5170 // No immediates on GFX9
5171 if (!MO.isReg()) {
5172 ErrInfo =
5173 "Only reg allowed as operands in SDWA instructions on GFX9+";
5174 return false;
5175 }
5176 }
5177 }
5178
5179 if (!ST.hasSDWAOmod()) {
5180 // No omod allowed on VI
5181 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5182 if (OMod != nullptr &&
5183 (!OMod->isImm() || OMod->getImm() != 0)) {
5184 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5185 return false;
5186 }
5187 }
5188
5189 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5190 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5191 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5192 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5193 const MachineOperand *Src0ModsMO =
5194 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5195 unsigned Mods = Src0ModsMO->getImm();
5196 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5197 Mods & SISrcMods::SEXT) {
5198 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5199 return false;
5200 }
5201 }
5202
5203 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5204 if (isVOPC(BasicOpcode)) {
5205 if (!ST.hasSDWASdst() && DstIdx != -1) {
5206 // Only vcc allowed as dst on VI for VOPC
5207 const MachineOperand &Dst = MI.getOperand(DstIdx);
5208 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5209 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5210 return false;
5211 }
5212 } else if (!ST.hasSDWAOutModsVOPC()) {
5213 // No clamp allowed on GFX9 for VOPC
5214 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5215 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5216 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5217 return false;
5218 }
5219
5220 // No omod allowed on GFX9 for VOPC
5221 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5222 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5223 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5224 return false;
5225 }
5226 }
5227 }
5228
5229 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5230 if (DstUnused && DstUnused->isImm() &&
5231 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5232 const MachineOperand &Dst = MI.getOperand(DstIdx);
5233 if (!Dst.isReg() || !Dst.isTied()) {
5234 ErrInfo = "Dst register should have tied register";
5235 return false;
5236 }
5237
5238 const MachineOperand &TiedMO =
5239 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5240 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5241 ErrInfo =
5242 "Dst register should be tied to implicit use of preserved register";
5243 return false;
5244 }
5245 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5246 ErrInfo = "Dst register should use same physical register as preserved";
5247 return false;
5248 }
5249 }
5250 }
5251
5252 // Verify MIMG / VIMAGE / VSAMPLE
5253 if (isImage(Opcode) && !MI.mayStore()) {
5254 // Ensure that the return type used is large enough for all the options
5255 // being used; TFE/LWE require an extra result register.
5256 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5257 if (DMask) {
5258 uint64_t DMaskImm = DMask->getImm();
5259 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5260 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5261 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5262 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5263
5264 // Adjust for packed 16 bit values
5265 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5266 RegCount = divideCeil(RegCount, 2);
5267
5268 // Adjust if using LWE or TFE
5269 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5270 RegCount += 1;
5271
5272 const uint32_t DstIdx =
5273 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5274 const MachineOperand &Dst = MI.getOperand(DstIdx);
5275 if (Dst.isReg()) {
5276 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5277 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5278 if (RegCount > DstSize) {
5279 ErrInfo = "Image instruction returns too many registers for dst "
5280 "register class";
5281 return false;
5282 }
5283 }
5284 }
5285 }
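// Example of the dmask sizing rule above (illustrative numbers, not from a
// specific test): an image load with dmask=0b0111 returns 3 dwords; enabling
// tfe or lwe adds one more, so vdata must span at least 4 registers, while
// packed d16 halves the payload count before that adjustment.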
5286
5287 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5288 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5289 unsigned ConstantBusCount = 0;
5290 bool UsesLiteral = false;
5291 const MachineOperand *LiteralVal = nullptr;
5292
5293 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5294 if (ImmIdx != -1) {
5295 ++ConstantBusCount;
5296 UsesLiteral = true;
5297 LiteralVal = &MI.getOperand(ImmIdx);
5298 }
5299
5300 SmallVector<Register, 2> SGPRsUsed;
5301 Register SGPRUsed;
5302
5303 // Only look at the true operands. Only a real operand can use the constant
5304 // bus, and we don't want to check pseudo-operands like the source modifier
5305 // flags.
5306 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5307 if (OpIdx == -1)
5308 continue;
5309 const MachineOperand &MO = MI.getOperand(OpIdx);
5310 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5311 if (MO.isReg()) {
5312 SGPRUsed = MO.getReg();
5313 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5314 ++ConstantBusCount;
5315 SGPRsUsed.push_back(SGPRUsed);
5316 }
5317 } else if (!MO.isFI()) { // Treat FI like a register.
5318 if (!UsesLiteral) {
5319 ++ConstantBusCount;
5320 UsesLiteral = true;
5321 LiteralVal = &MO;
5322 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5323 assert(isVOP2(MI) || isVOP3(MI));
5324 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5325 return false;
5326 }
5327 }
5328 }
5329 }
5330
5331 SGPRUsed = findImplicitSGPRRead(MI);
5332 if (SGPRUsed) {
5333 // Implicit uses may safely overlap true operands
5334 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5335 return !RI.regsOverlap(SGPRUsed, SGPR);
5336 })) {
5337 ++ConstantBusCount;
5338 SGPRsUsed.push_back(SGPRUsed);
5339 }
5340 }
5341
5342 // v_writelane_b32 is an exception from constant bus restriction:
5343 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5344 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5345 Opcode != AMDGPU::V_WRITELANE_B32) {
5346 ErrInfo = "VOP* instruction violates constant bus restriction";
5347 return false;
5348 }
5349
5350 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5351 ErrInfo = "VOP3 instruction uses literal";
5352 return false;
5353 }
5354 }
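// Example of the constant bus accounting above (illustrative sketch): on a
// pre-GFX10 target with a bus limit of 1, a VOP3 add reading two distinct
// SGPRs is rejected, while reusing the same SGPR for both sources passes;
// a literal operand counts against the same budget as an SGPR read.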
5355
5356 // Special case for writelane - it may exceed the usual constant bus limit,
5357 // but still can't use more than one SGPR register.
5358 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5359 unsigned SGPRCount = 0;
5360 Register SGPRUsed;
5361
5362 for (int OpIdx : {Src0Idx, Src1Idx}) {
5363 if (OpIdx == -1)
5364 break;
5365
5366 const MachineOperand &MO = MI.getOperand(OpIdx);
5367
5368 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5369 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5370 if (MO.getReg() != SGPRUsed)
5371 ++SGPRCount;
5372 SGPRUsed = MO.getReg();
5373 }
5374 }
5375 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5376 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5377 return false;
5378 }
5379 }
5380 }
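// For instance (illustrative), a V_WRITELANE_B32 whose value and lane-select
// operands are two different SGPRs trips the check above on targets with a
// single-SGPR limit, whereas reusing one SGPR, or using M0 for one of the
// operands, is accepted since M0 is excluded from the count.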
5381
5382 // Verify misc. restrictions on specific instructions.
5383 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5384 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5385 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5386 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5387 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5388 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5389 if (!compareMachineOp(Src0, Src1) &&
5390 !compareMachineOp(Src0, Src2)) {
5391 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5392 return false;
5393 }
5394 }
5395 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5396 SISrcMods::ABS) ||
5397 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5398 SISrcMods::ABS) ||
5399 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5400 SISrcMods::ABS)) {
5401 ErrInfo = "ABS not allowed in VOP3B instructions";
5402 return false;
5403 }
5404 }
5405
5406 if (isSOP2(MI) || isSOPC(MI)) {
5407 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5408 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5409
5410 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5411 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5412 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5413 !Src0.isIdenticalTo(Src1)) {
5414 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5415 return false;
5416 }
5417 }
5418
5419 if (isSOPK(MI)) {
5420 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5421 if (Desc.isBranch()) {
5422 if (!Op->isMBB()) {
5423 ErrInfo = "invalid branch target for SOPK instruction";
5424 return false;
5425 }
5426 } else {
5427 uint64_t Imm = Op->getImm();
5428 if (sopkIsZext(Opcode)) {
5429 if (!isUInt<16>(Imm)) {
5430 ErrInfo = "invalid immediate for SOPK instruction";
5431 return false;
5432 }
5433 } else {
5434 if (!isInt<16>(Imm)) {
5435 ErrInfo = "invalid immediate for SOPK instruction";
5436 return false;
5437 }
5438 }
5439 }
5440 }
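// In other words (illustrative): zero-extended SOPK opcodes such as
// S_CMPK_EQ_U32 accept immediates in [0, 65535], while sign-extended ones
// such as S_MOVK_I32 accept [-32768, 32767]; see sopkIsZext for the exact
// partition.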
5441
5442 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5443 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5444 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5445 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5446 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5447 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5448
5449 const unsigned StaticNumOps =
5450 Desc.getNumOperands() + Desc.implicit_uses().size();
5451 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5452
5453 // Allow additional implicit operands. This allows a fixup done by the post
5454 // RA scheduler where the main implicit operand is killed and implicit-defs
5455 // are added for sub-registers that remain live after this instruction.
5456 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5457 ErrInfo = "missing implicit register operands";
5458 return false;
5459 }
5460
5461 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5462 if (IsDst) {
5463 if (!Dst->isUse()) {
5464 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5465 return false;
5466 }
5467
5468 unsigned UseOpIdx;
5469 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5470 UseOpIdx != StaticNumOps + 1) {
5471 ErrInfo = "movrel implicit operands should be tied";
5472 return false;
5473 }
5474 }
5475
5476 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5477 const MachineOperand &ImpUse
5478 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5479 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5480 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5481 ErrInfo = "src0 should be subreg of implicit vector use";
5482 return false;
5483 }
5484 }
5485
5486 // Make sure we aren't losing exec uses in the td files. This mostly requires
5487 // being careful when using let Uses to try to add other use registers.
5488 if (shouldReadExec(MI)) {
5489 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5490 ErrInfo = "VALU instruction does not implicitly read exec mask";
5491 return false;
5492 }
5493 }
5494
5495 if (isSMRD(MI)) {
5496 if (MI.mayStore() &&
5497 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5498 // The register offset form of scalar stores may only use m0 as the
5499 // soffset register.
5500 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5501 if (Soff && Soff->getReg() != AMDGPU::M0) {
5502 ErrInfo = "scalar stores must use m0 as offset register";
5503 return false;
5504 }
5505 }
5506 }
5507
5508 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5509 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5510 if (Offset->getImm() != 0) {
5511 ErrInfo = "subtarget does not support offsets in flat instructions";
5512 return false;
5513 }
5514 }
5515
5516 if (isDS(MI) && !ST.hasGDS()) {
5517 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5518 if (GDSOp && GDSOp->getImm() != 0) {
5519 ErrInfo = "GDS is not supported on this subtarget";
5520 return false;
5521 }
5522 }
5523
5524 if (isImage(MI)) {
5525 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5526 if (DimOp) {
5527 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5528 AMDGPU::OpName::vaddr0);
5529 AMDGPU::OpName RSrcOpName =
5530 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5531 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5532 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5533 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5534 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5535 const AMDGPU::MIMGDimInfo *Dim =
5536 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5537
5538 if (!Dim) {
5539 ErrInfo = "dim is out of range";
5540 return false;
5541 }
5542
5543 bool IsA16 = false;
5544 if (ST.hasR128A16()) {
5545 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5546 IsA16 = R128A16->getImm() != 0;
5547 } else if (ST.hasA16()) {
5548 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5549 IsA16 = A16->getImm() != 0;
5550 }
5551
5552 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5553
5554 unsigned AddrWords =
5555 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5556
5557 unsigned VAddrWords;
5558 if (IsNSA) {
5559 VAddrWords = RsrcIdx - VAddr0Idx;
5560 if (ST.hasPartialNSAEncoding() &&
5561 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5562 unsigned LastVAddrIdx = RsrcIdx - 1;
5563 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5564 }
5565 } else {
5566 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5567 if (AddrWords > 12)
5568 AddrWords = 16;
5569 }
5570
5571 if (VAddrWords != AddrWords) {
5572 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5573 << " but got " << VAddrWords << "\n");
5574 ErrInfo = "bad vaddr size";
5575 return false;
5576 }
5577 }
5578 }
5579
5580 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5581 if (DppCt) {
5582 using namespace AMDGPU::DPP;
5583
5584 unsigned DC = DppCt->getImm();
5585 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5586 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5587 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5588 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5589 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5590 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5591 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5592 ErrInfo = "Invalid dpp_ctrl value";
5593 return false;
5594 }
5595 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5596 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5597 ErrInfo = "Invalid dpp_ctrl value: "
5598 "wavefront shifts are not supported on GFX10+";
5599 return false;
5600 }
5601 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5602 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5603 ErrInfo = "Invalid dpp_ctrl value: "
5604 "broadcasts are not supported on GFX10+";
5605 return false;
5606 }
5607 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5608 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5609 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5610 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5611 !ST.hasGFX90AInsts()) {
5612 ErrInfo = "Invalid dpp_ctrl value: "
5613 "row_newbroadcast/row_share is not supported before "
5614 "GFX90A/GFX10";
5615 return false;
5616 }
5617 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5618 ErrInfo = "Invalid dpp_ctrl value: "
5619 "row_share and row_xmask are not supported before GFX10";
5620 return false;
5621 }
5622 }
5623
5624 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5626 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5627 ErrInfo = "Invalid dpp_ctrl value: "
5628 "DP ALU dpp only support row_newbcast";
5629 return false;
5630 }
5631 }
5632
5633 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5634 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5635 AMDGPU::OpName DataName =
5636 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5637 const MachineOperand *Data = getNamedOperand(MI, DataName);
5638 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5639 if (Data && !Data->isReg())
5640 Data = nullptr;
5641
5642 if (ST.hasGFX90AInsts()) {
5643 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5644 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5645 ErrInfo = "Invalid register class: "
5646 "vdata and vdst should be both VGPR or AGPR";
5647 return false;
5648 }
5649 if (Data && Data2 &&
5650 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5651 ErrInfo = "Invalid register class: "
5652 "both data operands should be VGPR or AGPR";
5653 return false;
5654 }
5655 } else {
5656 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5657 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5658 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5659 ErrInfo = "Invalid register class: "
5660 "agpr loads and stores not supported on this GPU";
5661 return false;
5662 }
5663 }
5664 }
5665
5666 if (ST.needsAlignedVGPRs()) {
5667 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5668 const MachineOperand *Op = getNamedOperand(MI, OpName);
5669 if (!Op)
5670 return true;
5671 Register Reg = Op->getReg();
5672 if (Reg.isPhysical())
5673 return !(RI.getHWRegIndex(Reg) & 1);
5674 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5675 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5676 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5677 };
5678
5679 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5680 Opcode == AMDGPU::DS_GWS_BARRIER) {
5681
5682 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5683 ErrInfo = "Subtarget requires even aligned vector registers "
5684 "for DS_GWS instructions";
5685 return false;
5686 }
5687 }
5688
5689 if (isMIMG(MI)) {
5690 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5691 ErrInfo = "Subtarget requires even aligned vector registers "
5692 "for vaddr operand of image instructions";
5693 return false;
5694 }
5695 }
5696 }
5697
5698 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5699 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5700 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5701 ErrInfo = "Invalid register class: "
5702 "v_accvgpr_write with an SGPR is not supported on this GPU";
5703 return false;
5704 }
5705 }
5706
5707 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5708 const MachineOperand &SrcOp = MI.getOperand(1);
5709 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5710 ErrInfo = "pseudo expects only physical SGPRs";
5711 return false;
5712 }
5713 }
5714
5715 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5716 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5717 if (!ST.hasScaleOffset()) {
5718 ErrInfo = "Subtarget does not support offset scaling";
5719 return false;
5720 }
5721 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5722 ErrInfo = "Instruction does not support offset scaling";
5723 return false;
5724 }
5725 }
5726 }
5727
5728 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5729 // information.
5730 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5731 for (unsigned I = 0; I < 3; ++I) {
5732 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5733 return false;
5734 }
5735 }
5736
5737 return true;
5738}
5739
5740// It is more readable to list mapped opcodes on the same line.
5741// clang-format off
5742
5743 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5744 switch (MI.getOpcode()) {
5745 default: return AMDGPU::INSTRUCTION_LIST_END;
5746 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5747 case AMDGPU::COPY: return AMDGPU::COPY;
5748 case AMDGPU::PHI: return AMDGPU::PHI;
5749 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5750 case AMDGPU::WQM: return AMDGPU::WQM;
5751 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5752 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5753 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5754 case AMDGPU::S_MOV_B32: {
5755 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5756 return MI.getOperand(1).isReg() ||
5757 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5758 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5759 }
5760 case AMDGPU::S_ADD_I32:
5761 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5762 case AMDGPU::S_ADDC_U32:
5763 return AMDGPU::V_ADDC_U32_e32;
5764 case AMDGPU::S_SUB_I32:
5765 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5766 // FIXME: These are not consistently handled, and selected when the carry is
5767 // used.
5768 case AMDGPU::S_ADD_U32:
5769 return AMDGPU::V_ADD_CO_U32_e32;
5770 case AMDGPU::S_SUB_U32:
5771 return AMDGPU::V_SUB_CO_U32_e32;
5772 case AMDGPU::S_ADD_U64_PSEUDO:
5773 return AMDGPU::V_ADD_U64_PSEUDO;
5774 case AMDGPU::S_SUB_U64_PSEUDO:
5775 return AMDGPU::V_SUB_U64_PSEUDO;
5776 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5777 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5778 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5779 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5780 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5781 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5782 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5783 case AMDGPU::S_XNOR_B32:
5784 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5785 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5786 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5787 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5788 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5789 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5790 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5791 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5792 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5793 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5794 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5795 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5796 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5797 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5798 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5799 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5800 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5801 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5802 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5803 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5804 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5805 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5806 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5807 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5808 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5809 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5810 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5811 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5812 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5813 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5814 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5815 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5816 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5817 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5818 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5819 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5820 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5821 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5822 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5823 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5824 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5825 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5826 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5827 case AMDGPU::S_CVT_F32_F16:
5828 case AMDGPU::S_CVT_HI_F32_F16:
5829 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5830 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5831 case AMDGPU::S_CVT_F16_F32:
5832 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5833 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5834 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5835 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5836 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5837 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5838 case AMDGPU::S_CEIL_F16:
5839 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5840 : AMDGPU::V_CEIL_F16_fake16_e64;
5841 case AMDGPU::S_FLOOR_F16:
5842 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5843 : AMDGPU::V_FLOOR_F16_fake16_e64;
5844 case AMDGPU::S_TRUNC_F16:
5845 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5846 : AMDGPU::V_TRUNC_F16_fake16_e64;
5847 case AMDGPU::S_RNDNE_F16:
5848 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5849 : AMDGPU::V_RNDNE_F16_fake16_e64;
5850 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5851 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5852 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5853 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5854 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5855 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5856 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5857 case AMDGPU::S_ADD_F16:
5858 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5859 : AMDGPU::V_ADD_F16_fake16_e64;
5860 case AMDGPU::S_SUB_F16:
5861 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5862 : AMDGPU::V_SUB_F16_fake16_e64;
5863 case AMDGPU::S_MIN_F16:
5864 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5865 : AMDGPU::V_MIN_F16_fake16_e64;
5866 case AMDGPU::S_MAX_F16:
5867 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5868 : AMDGPU::V_MAX_F16_fake16_e64;
5869 case AMDGPU::S_MINIMUM_F16:
5870 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5871 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5872 case AMDGPU::S_MAXIMUM_F16:
5873 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5874 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5875 case AMDGPU::S_MUL_F16:
5876 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5877 : AMDGPU::V_MUL_F16_fake16_e64;
5878 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5879 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5880 case AMDGPU::S_FMAC_F16:
5881 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5882 : AMDGPU::V_FMAC_F16_fake16_e64;
5883 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5884 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5885 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5886 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5887 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5888 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5889 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5890 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5891 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5892 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5893 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5894 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5895 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5896 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5897 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5898 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5899 case AMDGPU::S_CMP_LT_F16:
5900 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5901 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5902 case AMDGPU::S_CMP_EQ_F16:
5903 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5904 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5905 case AMDGPU::S_CMP_LE_F16:
5906 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5907 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5908 case AMDGPU::S_CMP_GT_F16:
5909 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5910 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5911 case AMDGPU::S_CMP_LG_F16:
5912 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5913 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5914 case AMDGPU::S_CMP_GE_F16:
5915 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5916 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5917 case AMDGPU::S_CMP_O_F16:
5918 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5919 : AMDGPU::V_CMP_O_F16_fake16_e64;
5920 case AMDGPU::S_CMP_U_F16:
5921 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5922 : AMDGPU::V_CMP_U_F16_fake16_e64;
5923 case AMDGPU::S_CMP_NGE_F16:
5924 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5925 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5926 case AMDGPU::S_CMP_NLG_F16:
5927 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5928 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5929 case AMDGPU::S_CMP_NGT_F16:
5930 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5931 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5932 case AMDGPU::S_CMP_NLE_F16:
5933 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5934 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5935 case AMDGPU::S_CMP_NEQ_F16:
5936 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5937 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5938 case AMDGPU::S_CMP_NLT_F16:
5939 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5940 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5941 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5942 case AMDGPU::V_S_EXP_F16_e64:
5943 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5944 : AMDGPU::V_EXP_F16_fake16_e64;
5945 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5946 case AMDGPU::V_S_LOG_F16_e64:
5947 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5948 : AMDGPU::V_LOG_F16_fake16_e64;
5949 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5950 case AMDGPU::V_S_RCP_F16_e64:
5951 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5952 : AMDGPU::V_RCP_F16_fake16_e64;
5953 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5954 case AMDGPU::V_S_RSQ_F16_e64:
5955 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5956 : AMDGPU::V_RSQ_F16_fake16_e64;
5957 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5958 case AMDGPU::V_S_SQRT_F16_e64:
5959 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5960 : AMDGPU::V_SQRT_F16_fake16_e64;
5961 }
5963 "Unexpected scalar opcode without corresponding vector one!");
5964}
5965
5966// clang-format on
5967
5968 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5969 MachineBasicBlock &MBB,
5970 MachineBasicBlock::iterator MBBI,
5971 const DebugLoc &DL, Register Reg,
5972 bool IsSCCLive,
5973 SlotIndexes *Indexes) const {
5974 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5975 const SIInstrInfo *TII = ST.getInstrInfo();
5977 if (IsSCCLive) {
5978 // Insert two move instructions, one to save the original value of EXEC and
5979 // the other to turn on all bits in EXEC. This is required as we can't use
5980 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5981 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5982 .addReg(LMC.ExecReg);
5983 auto FlipExecMI =
5984 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5985 if (Indexes) {
5986 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5987 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5988 }
5989 } else {
5990 auto SaveExec =
5991 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
5992 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5993 if (Indexes)
5994 Indexes->insertMachineInstrInMaps(*SaveExec);
5995 }
5996}
5997
5998 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5999 MachineBasicBlock::iterator MBBI,
6000 const DebugLoc &DL, Register Reg,
6001 SlotIndexes *Indexes) const {
6003 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6004 .addReg(Reg, RegState::Kill);
6005 if (Indexes)
6006 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6007}
6008
6012 "Not a whole wave func");
6013 MachineBasicBlock &MBB = *MF.begin();
6014 for (MachineInstr &MI : MBB)
6015 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6016 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6017 return &MI;
6018
6019 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6020}
6021
6022// FIXME: This should not be an overridable function. All subtarget dependent
6023// operand modifications should go through isLookupRegClassByHwMode in the
6024// generic handling.
6025const TargetRegisterClass *
6026SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6027 const TargetRegisterInfo *TRI) const {
6028 if (OpNum >= TID.getNumOperands())
6029 return nullptr;
6030 const MCOperandInfo &OpInfo = TID.operands()[OpNum];
6031 int16_t RegClass = getOpRegClassID(OpInfo);
6032 return RI.getRegClass(RegClass);
6033}
6034
6035 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6036 unsigned OpNo) const {
6037 const MCInstrDesc &Desc = get(MI.getOpcode());
6038 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6039 Desc.operands()[OpNo].RegClass == -1) {
6040 Register Reg = MI.getOperand(OpNo).getReg();
6041
6042 if (Reg.isVirtual()) {
6043 const MachineRegisterInfo &MRI =
6044 MI.getParent()->getParent()->getRegInfo();
6045 return MRI.getRegClass(Reg);
6046 }
6047 return RI.getPhysRegBaseClass(Reg);
6048 }
6049
6050 return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
6051}
6052
6053 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6054 MachineBasicBlock::iterator I = MI;
6055 MachineBasicBlock *MBB = MI.getParent();
6056 MachineOperand &MO = MI.getOperand(OpIdx);
6057 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6058 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6059 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6060 unsigned Size = RI.getRegSizeInBits(*RC);
6061 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6062 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6063 : AMDGPU::V_MOV_B32_e32;
6064 if (MO.isReg())
6065 Opcode = AMDGPU::COPY;
6066 else if (RI.isSGPRClass(RC))
6067 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6068
6069 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6070 Register Reg = MRI.createVirtualRegister(VRC);
6071 DebugLoc DL = MBB->findDebugLoc(I);
6072 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6073 MO.ChangeToRegister(Reg, false);
6074}
6075
6076 Register SIInstrInfo::buildExtractSubReg(
6077 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6078 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6079 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6080 if (!SuperReg.getReg().isVirtual())
6081 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6082
6083 MachineBasicBlock *MBB = MI->getParent();
6084 const DebugLoc &DL = MI->getDebugLoc();
6085 Register SubReg = MRI.createVirtualRegister(SubRC);
6086
6087 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6088 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6089 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6090 return SubReg;
6091}
6092
6093 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6094 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6095 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6096 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6097 if (Op.isImm()) {
6098 if (SubIdx == AMDGPU::sub0)
6099 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6100 if (SubIdx == AMDGPU::sub1)
6101 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6102
6103 llvm_unreachable("Unhandled register index for immediate");
6104 }
6105
6106 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6107 SubIdx, SubRC);
6108 return MachineOperand::CreateReg(SubReg, false);
6109}
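// For example (illustrative values): extracting sub1 from the 64-bit
// immediate 0x0000000100000002 yields the immediate 0x1, while extracting
// from a register super-operand instead emits a COPY of the matching 32-bit
// subregister and returns that new virtual register as an operand.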
6110
6111// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6112void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6113 assert(Inst.getNumExplicitOperands() == 3);
6114 MachineOperand Op1 = Inst.getOperand(1);
6115 Inst.removeOperand(1);
6116 Inst.addOperand(Op1);
6117}
6118
6119 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6120 const MCOperandInfo &OpInfo,
6121 const MachineOperand &MO) const {
6122 if (!MO.isReg())
6123 return false;
6124
6125 Register Reg = MO.getReg();
6126
6127 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6128 if (Reg.isPhysical())
6129 return DRC->contains(Reg);
6130
6131 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6132
6133 if (MO.getSubReg()) {
6134 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6135 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6136 if (!SuperRC)
6137 return false;
6138 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6139 }
6140
6141 return RI.getCommonSubClass(DRC, RC) != nullptr;
6142}
6143
6144 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6145 const MachineOperand &MO) const {
6146 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6147 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6148 unsigned Opc = MI.getOpcode();
6149
6150 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6151 // information.
6152 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6153 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6154 constexpr const AMDGPU::OpName OpNames[] = {
6155 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6156
6157 for (auto [I, OpName] : enumerate(OpNames)) {
6158 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6159 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6160 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6161 return false;
6162 }
6163 }
6164
6165 if (!isLegalRegOperand(MRI, OpInfo, MO))
6166 return false;
6167
6168 // check Accumulate GPR operand
6169 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6170 if (IsAGPR && !ST.hasMAIInsts())
6171 return false;
6172 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6173 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6174 return false;
6175 // Atomics should have both vdst and vdata either vgpr or agpr.
6176 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6177 const int DataIdx = AMDGPU::getNamedOperandIdx(
6178 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6179 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6180 MI.getOperand(DataIdx).isReg() &&
6181 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6182 return false;
6183 if ((int)OpIdx == DataIdx) {
6184 if (VDstIdx != -1 &&
6185 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6186 return false;
6187 // DS instructions with 2 src operands also must have tied RC.
6188 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6189 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6190 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6191 return false;
6192 }
6193
6194 // Check V_ACCVGPR_WRITE_B32_e64
6195 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6196 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6197 RI.isSGPRReg(MRI, MO.getReg()))
6198 return false;
6199 return true;
6200}
6201
6202 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6203 const MCOperandInfo &OpInfo,
6204 const MachineOperand &MO) const {
6205 if (MO.isReg())
6206 return isLegalRegOperand(MRI, OpInfo, MO);
6207
6208 // Handle non-register types that are treated like immediates.
6209 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6210 return true;
6211}
6212
6213 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6214 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6215 const MachineOperand *MO) const {
6216 constexpr const unsigned NumOps = 3;
6217 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6218 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6219 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6220 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6221
6222 assert(SrcN < NumOps);
6223
6224 if (!MO) {
6225 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6226 if (SrcIdx == -1)
6227 return true;
6228 MO = &MI.getOperand(SrcIdx);
6229 }
6230
6231 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6232 return true;
6233
6234 int ModsIdx =
6235 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6236 if (ModsIdx == -1)
6237 return true;
6238
6239 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6240 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6241 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6242
6243 return !OpSel && !OpSelHi;
6244}
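// Net effect of the check above (summary of the code, not additional
// policy): on gfx12+ an SGPR source of a packed FP32 instruction is only
// considered legal when both op_sel bits for that source are clear;
// isOperandLegal and legalizeOperandsVOP3 otherwise reject the operand or
// move the value into a VGPR first.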
6245
6246 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6247 const MachineOperand *MO) const {
6248 const MachineFunction &MF = *MI.getParent()->getParent();
6249 const MachineRegisterInfo &MRI = MF.getRegInfo();
6250 const MCInstrDesc &InstDesc = MI.getDesc();
6251 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6252 int64_t RegClass = getOpRegClassID(OpInfo);
6253 const TargetRegisterClass *DefinedRC =
6254 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6255 if (!MO)
6256 MO = &MI.getOperand(OpIdx);
6257
6258 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6259
6260 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6261 const MachineOperand *UsedLiteral = nullptr;
6262
6263 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6264 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6265
6266 // TODO: Be more permissive with frame indexes.
6267 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6268 if (!LiteralLimit--)
6269 return false;
6270
6271 UsedLiteral = MO;
6272 }
6273
6274 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6275 if (MO->isReg())
6276 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6277
6278 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6279 if (i == OpIdx)
6280 continue;
6281 const MachineOperand &Op = MI.getOperand(i);
6282 if (Op.isReg()) {
6283 if (Op.isUse()) {
6284 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6285 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6286 if (--ConstantBusLimit <= 0)
6287 return false;
6288 }
6289 }
6290 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6291 !isInlineConstant(Op, InstDesc.operands()[i])) {
6292 // The same literal may be used multiple times.
6293 if (!UsedLiteral)
6294 UsedLiteral = &Op;
6295 else if (UsedLiteral->isIdenticalTo(Op))
6296 continue;
6297
6298 if (!LiteralLimit--)
6299 return false;
6300 if (--ConstantBusLimit <= 0)
6301 return false;
6302 }
6303 }
6304 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6305 // There can be at most one literal operand, but it can be repeated.
6306 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6307 if (i == OpIdx)
6308 continue;
6309 const MachineOperand &Op = MI.getOperand(i);
6310 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6311 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6312 !Op.isIdenticalTo(*MO))
6313 return false;
6314
6315 // Do not fold a non-inlineable and non-register operand into an
6316 // instruction that already has a frame index. The frame index handling
6317 // code does not cope well when a frame index co-exists with another
6318 // non-register operand, unless that operand is an inlineable immediate.
6319 if (Op.isFI())
6320 return false;
6321 }
6322 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6323 isF16PseudoScalarTrans(MI.getOpcode())) {
6324 return false;
6325 }
6326
6327 if (MO->isReg()) {
6328 if (!DefinedRC)
6329 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6330 return isLegalRegOperand(MI, OpIdx, *MO);
6331 }
6332
6333 if (MO->isImm()) {
6334 uint64_t Imm = MO->getImm();
6335 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6336 bool Is64BitOp = Is64BitFPOp ||
6337 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6338 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6339 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6340 if (Is64BitOp &&
6341 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6342 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6343 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6344 return false;
6345
6346 // FIXME: We can use sign extended 64-bit literals, but only for signed
6347 // operands. At the moment we do not know if an operand is signed.
6348 // Such an operand will be encoded as its low 32 bits and then either
6349 // correctly sign extended or incorrectly zero extended by HW.
6350 // If 64-bit literals are supported and the literal will be encoded
6351 // as a full 64-bit value, we can still use it.
6352 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6353 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6354 return false;
6355 }
6356 }
6357
6358 // Handle non-register types that are treated like immediates.
6359 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6360
6361 if (!DefinedRC) {
6362 // This operand expects an immediate.
6363 return true;
6364 }
6365
6366 return isImmOperandLegal(MI, OpIdx, *MO);
6367}
6368
6370 bool IsGFX950Only = ST.hasGFX950Insts();
6371 bool IsGFX940Only = ST.hasGFX940Insts();
6372
6373 if (!IsGFX950Only && !IsGFX940Only)
6374 return false;
6375
6376 if (!isVALU(MI))
6377 return false;
6378
6379 // V_COS, V_EXP, V_RCP, etc.
6380 if (isTRANS(MI))
6381 return true;
6382
6383 // DOT2, DOT2C, DOT4, etc.
6384 if (isDOT(MI))
6385 return true;
6386
6387 // MFMA, SMFMA
6388 if (isMFMA(MI))
6389 return true;
6390
6391 unsigned Opcode = MI.getOpcode();
6392 switch (Opcode) {
6393 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6394 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6395 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6396 case AMDGPU::V_MQSAD_U32_U8_e64:
6397 case AMDGPU::V_PK_ADD_F16:
6398 case AMDGPU::V_PK_ADD_F32:
6399 case AMDGPU::V_PK_ADD_I16:
6400 case AMDGPU::V_PK_ADD_U16:
6401 case AMDGPU::V_PK_ASHRREV_I16:
6402 case AMDGPU::V_PK_FMA_F16:
6403 case AMDGPU::V_PK_FMA_F32:
6404 case AMDGPU::V_PK_FMAC_F16_e32:
6405 case AMDGPU::V_PK_FMAC_F16_e64:
6406 case AMDGPU::V_PK_LSHLREV_B16:
6407 case AMDGPU::V_PK_LSHRREV_B16:
6408 case AMDGPU::V_PK_MAD_I16:
6409 case AMDGPU::V_PK_MAD_U16:
6410 case AMDGPU::V_PK_MAX_F16:
6411 case AMDGPU::V_PK_MAX_I16:
6412 case AMDGPU::V_PK_MAX_U16:
6413 case AMDGPU::V_PK_MIN_F16:
6414 case AMDGPU::V_PK_MIN_I16:
6415 case AMDGPU::V_PK_MIN_U16:
6416 case AMDGPU::V_PK_MOV_B32:
6417 case AMDGPU::V_PK_MUL_F16:
6418 case AMDGPU::V_PK_MUL_F32:
6419 case AMDGPU::V_PK_MUL_LO_U16:
6420 case AMDGPU::V_PK_SUB_I16:
6421 case AMDGPU::V_PK_SUB_U16:
6422 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6423 return true;
6424 default:
6425 return false;
6426 }
6427}
6428
6429 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6430 MachineInstr &MI) const {
6431 unsigned Opc = MI.getOpcode();
6432 const MCInstrDesc &InstrDesc = get(Opc);
6433
6434 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6435 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6436
6437 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6438 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6439
6440 // If there is an implicit SGPR use, such as the VCC use of v_addc_u32/v_subb_u32,
6441 // we may only have one constant bus use before GFX10.
6442 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6443 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6444 RI.isSGPRReg(MRI, Src0.getReg()))
6445 legalizeOpWithMove(MI, Src0Idx);
6446
6447 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6448 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6449 // src0/src1 with V_READFIRSTLANE.
6450 if (Opc == AMDGPU::V_WRITELANE_B32) {
6451 const DebugLoc &DL = MI.getDebugLoc();
6452 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6453 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6454 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6455 .add(Src0);
6456 Src0.ChangeToRegister(Reg, false);
6457 }
6458 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6459 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6460 const DebugLoc &DL = MI.getDebugLoc();
6461 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6462 .add(Src1);
6463 Src1.ChangeToRegister(Reg, false);
6464 }
6465 return;
6466 }
6467
6468 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6469 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6470 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6471 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6472 legalizeOpWithMove(MI, Src2Idx);
6473 }
6474
6475 // VOP2 src0 operands support all operand types, so we don't need to check
6476 // their legality. If src1 is already legal, we don't need to do anything.
6477 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6478 return;
6479
6480 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6481 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6482 // select is uniform.
6483 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6484 RI.isVGPR(MRI, Src1.getReg())) {
6485 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6486 const DebugLoc &DL = MI.getDebugLoc();
6487 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6488 .add(Src1);
6489 Src1.ChangeToRegister(Reg, false);
6490 return;
6491 }
6492
6493 // We do not use commuteInstruction here because it is too aggressive and will
6494 // commute if it is possible. We only want to commute here if it improves
6495 // legality. This can be called a fairly large number of times so don't waste
6496 // compile time pointlessly swapping and checking legality again.
6497 if (HasImplicitSGPR || !MI.isCommutable()) {
6498 legalizeOpWithMove(MI, Src1Idx);
6499 return;
6500 }
6501
6502 // If src0 can be used as src1, commuting will make the operands legal.
6503 // Otherwise we have to give up and insert a move.
6504 //
6505 // TODO: Other immediate-like operand kinds could be commuted if there was a
6506 // MachineOperand::ChangeTo* for them.
6507 if ((!Src1.isImm() && !Src1.isReg()) ||
6508 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6509 legalizeOpWithMove(MI, Src1Idx);
6510 return;
6511 }
6512
6513 int CommutedOpc = commuteOpcode(MI);
6514 if (CommutedOpc == -1) {
6515 legalizeOpWithMove(MI, Src1Idx);
6516 return;
6517 }
6518
6519 MI.setDesc(get(CommutedOpc));
6520
6521 Register Src0Reg = Src0.getReg();
6522 unsigned Src0SubReg = Src0.getSubReg();
6523 bool Src0Kill = Src0.isKill();
6524
6525 if (Src1.isImm())
6526 Src0.ChangeToImmediate(Src1.getImm());
6527 else if (Src1.isReg()) {
6528 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6529 Src0.setSubReg(Src1.getSubReg());
6530 } else
6531 llvm_unreachable("Should only have register or immediate operands");
6532
6533 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6534 Src1.setSubReg(Src0SubReg);
6535 fixImplicitOperands(MI);
6536}
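// Typical outcome of the logic above (illustrative): a VOP2 add with an SGPR
// in src1 on a single-constant-bus target gets its operands swapped so the
// SGPR lands in src0, which accepts any operand kind, avoiding an extra copy;
// only when commuting cannot make the operands legal does legalizeOpWithMove
// insert a move.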
6537
6538 // Legalize VOP3 operands. All operand types are supported for any operand,
6539 // but only one literal constant is allowed, and only starting from GFX10.
6540 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6541 MachineInstr &MI) const {
6542 unsigned Opc = MI.getOpcode();
6543
6544 int VOP3Idx[3] = {
6545 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6546 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6548 };
6549
6550 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6551 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6552 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6553 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6554 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6555 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6556 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6557 // src1 and src2 must be scalar
6558 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6559 const DebugLoc &DL = MI.getDebugLoc();
6560 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6561 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6562 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6563 .add(Src1);
6564 Src1.ChangeToRegister(Reg, false);
6565 }
6566 if (VOP3Idx[2] != -1) {
6567 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6568 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6569 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6570 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6571 .add(Src2);
6572 Src2.ChangeToRegister(Reg, false);
6573 }
6574 }
6575 }
6576
6577 // Find the one SGPR operand we are allowed to use.
6578 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6579 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6580 SmallDenseSet<unsigned> SGPRsUsed;
6581 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6582 if (SGPRReg) {
6583 SGPRsUsed.insert(SGPRReg);
6584 --ConstantBusLimit;
6585 }
6586
6587 for (int Idx : VOP3Idx) {
6588 if (Idx == -1)
6589 break;
6590 MachineOperand &MO = MI.getOperand(Idx);
6591
6592 if (!MO.isReg()) {
6593 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6594 continue;
6595
6596 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6597 --LiteralLimit;
6598 --ConstantBusLimit;
6599 continue;
6600 }
6601
6602 --LiteralLimit;
6603 --ConstantBusLimit;
6604 legalizeOpWithMove(MI, Idx);
6605 continue;
6606 }
6607
6608 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6609 continue; // VGPRs are legal
6610
6611 // We can use one SGPR in each VOP3 instruction prior to GFX10
6612 // and two starting from GFX10.
6613 if (SGPRsUsed.count(MO.getReg()))
6614 continue;
6615 if (ConstantBusLimit > 0) {
6616 SGPRsUsed.insert(MO.getReg());
6617 --ConstantBusLimit;
6618 continue;
6619 }
6620
6621 // If we make it this far, then the operand is not legal and we must
6622 // legalize it.
6623 legalizeOpWithMove(MI, Idx);
6624 }
6625
6626 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6627 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6628 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6629 legalizeOpWithMove(MI, VOP3Idx[2]);
6630
6631 // Fix the register class of packed FP32 instructions on gfx12+. See
6632 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6633 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6634 for (unsigned I = 0; I < 3; ++I) {
6635 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6636 legalizeOpWithMove(MI, VOP3Idx[I]);
6637 }
6638 }
6639}
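// Example of the limits enforced above (illustrative): pre-GFX10 a VOP3 may
// read at most one unique SGPR and no literal, so an FMA with two distinct
// SGPR sources gets one of them moved to a VGPR; on GFX10+ two SGPR reads and
// one literal are tolerated, as reflected by getConstantBusLimit and
// hasVOP3Literal.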
6640
6641 Register SIInstrInfo::readlaneVGPRToSGPR(
6642 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6643 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6644 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6645 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6646 if (DstRC)
6647 SRC = RI.getCommonSubClass(SRC, DstRC);
6648
6649 Register DstReg = MRI.createVirtualRegister(SRC);
6650 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6651
6652 if (RI.hasAGPRs(VRC)) {
6653 VRC = RI.getEquivalentVGPRClass(VRC);
6654 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6655 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6656 get(TargetOpcode::COPY), NewSrcReg)
6657 .addReg(SrcReg);
6658 SrcReg = NewSrcReg;
6659 }
6660
6661 if (SubRegs == 1) {
6662 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6663 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6664 .addReg(SrcReg);
6665 return DstReg;
6666 }
6667
6668 SmallVector<Register, 8> SRegs;
6669 for (unsigned i = 0; i < SubRegs; ++i) {
6670 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6671 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6672 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6673 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6674 SRegs.push_back(SGPR);
6675 }
6676
6677 MachineInstrBuilder MIB =
6678 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6679 get(AMDGPU::REG_SEQUENCE), DstReg);
6680 for (unsigned i = 0; i < SubRegs; ++i) {
6681 MIB.addReg(SRegs[i]);
6682 MIB.addImm(RI.getSubRegFromChannel(i));
6683 }
6684 return DstReg;
6685}
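// For a 64-bit pointer held in a VGPR pair this expands (roughly) into one
// V_READFIRSTLANE_B32 per 32-bit channel, glued back together with a
// REG_SEQUENCE into the requested SGPR class; a 32-bit source needs just the
// single readfirstlane. (Descriptive summary of the code above.)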
6686
6687 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6688 MachineInstr &MI) const {
6689
6690 // If the pointer is stored in VGPRs, then we need to move it to
6691 // SGPRs using v_readfirstlane. This is safe because we only select
6692 // loads with uniform pointers to SMRD instructions, so we know the
6693 // pointer value is uniform.
6694 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6695 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6696 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6697 SBase->setReg(SGPR);
6698 }
6699 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6700 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6701 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6702 SOff->setReg(SGPR);
6703 }
6704}
6705
6706 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6707 unsigned Opc = Inst.getOpcode();
6708 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6709 if (OldSAddrIdx < 0)
6710 return false;
6711
6712 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6713
6714 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6715 if (NewOpc < 0)
6716 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6717 if (NewOpc < 0)
6718 return false;
6719
6720 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6721 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6722 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6723 return false;
6724
6725 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6726 if (NewVAddrIdx < 0)
6727 return false;
6728
6729 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6730
6731 // Check vaddr; it must be zero or absent.
6732 MachineInstr *VAddrDef = nullptr;
6733 if (OldVAddrIdx >= 0) {
6734 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6735 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6736 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6737 !VAddrDef->getOperand(1).isImm() ||
6738 VAddrDef->getOperand(1).getImm() != 0)
6739 return false;
6740 }
6741
6742 const MCInstrDesc &NewDesc = get(NewOpc);
6743 Inst.setDesc(NewDesc);
6744
6745 // Callers expect iterator to be valid after this call, so modify the
6746 // instruction in place.
6747 if (OldVAddrIdx == NewVAddrIdx) {
6748 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6749 // Clear use list from the old vaddr holding a zero register.
6750 MRI.removeRegOperandFromUseList(&NewVAddr);
6751 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6752 Inst.removeOperand(OldSAddrIdx);
6753 // Update the use list with the pointer we have just moved from vaddr to
6754 // saddr position. Otherwise new vaddr will be missing from the use list.
6755 MRI.removeRegOperandFromUseList(&NewVAddr);
6756 MRI.addRegOperandToUseList(&NewVAddr);
6757 } else {
6758 assert(OldSAddrIdx == NewVAddrIdx);
6759
6760 if (OldVAddrIdx >= 0) {
6761 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6762 AMDGPU::OpName::vdst_in);
6763
6764 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6765 // it asserts. Untie the operands for now and retie them afterwards.
6766 if (NewVDstIn != -1) {
6767 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6768 Inst.untieRegOperand(OldVDstIn);
6769 }
6770
6771 Inst.removeOperand(OldVAddrIdx);
6772
6773 if (NewVDstIn != -1) {
6774 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6775 Inst.tieOperands(NewVDst, NewVDstIn);
6776 }
6777 }
6778 }
6779
6780 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6781 VAddrDef->eraseFromParent();
6782
6783 return true;
6784}
6785
6786// FIXME: Remove this when SelectionDAG is obsoleted.
6787 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6788 MachineInstr &MI) const {
6789 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6790 return;
6791
6792 // Fix up SGPR operands held in VGPRs. We only select these when the DAG
6793 // divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6794 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6795 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6796 return;
6797
6798 if (moveFlatAddrToVGPR(MI))
6799 return;
6800
6801 const TargetRegisterClass *DeclaredRC =
6802 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6803
6804 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6805 SAddr->setReg(ToSGPR);
6806}
6807
6808 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6809 MachineBasicBlock::iterator I,
6810 const TargetRegisterClass *DstRC,
6811 MachineOperand &Op,
6812 MachineRegisterInfo &MRI,
6813 const DebugLoc &DL) const {
6814 Register OpReg = Op.getReg();
6815 unsigned OpSubReg = Op.getSubReg();
6816
6817 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6818 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6819
6820 // Check if operand is already the correct register class.
6821 if (DstRC == OpRC)
6822 return;
6823
6824 Register DstReg = MRI.createVirtualRegister(DstRC);
6825 auto Copy =
6826 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6827 Op.setReg(DstReg);
6828
6829 MachineInstr *Def = MRI.getVRegDef(OpReg);
6830 if (!Def)
6831 return;
6832
6833 // Try to eliminate the copy if it is copying an immediate value.
6834 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6835 foldImmediate(*Copy, *Def, OpReg, &MRI);
6836
6837 bool ImpDef = Def->isImplicitDef();
6838 while (!ImpDef && Def && Def->isCopy()) {
6839 if (Def->getOperand(1).getReg().isPhysical())
6840 break;
6841 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6842 ImpDef = Def && Def->isImplicitDef();
6843 }
6844 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6845 !ImpDef)
6846 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6847}
6848
6849// Emit the actual waterfall loop, executing the wrapped instruction for each
6850// unique value of \p ScalarOps across all lanes. In the best case we execute 1
 6851 // iteration; in the worst case we execute once per lane (64 for wave64).
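// For a single 32-bit scalar operand, one iteration looks roughly like this
// (illustrative sketch, wave64 opcodes shown; not exact MIR):
//   %cur  = V_READFIRSTLANE_B32 %vscalarop            ; in LoopBB
//   %cond = V_CMP_EQ_U32_e64 %cur, %vscalarop
//   %save = S_AND_SAVEEXEC_B64 %cond
//   <the rewritten instruction, now using %cur>       ; in BodyBB
//   $exec = S_XOR_B64_term $exec, %save
//   SI_WATERFALL_LOOP %LoopBB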
6852static void
6855 MachineBasicBlock &LoopBB,
6856 MachineBasicBlock &BodyBB,
6857 const DebugLoc &DL,
6858 ArrayRef<MachineOperand *> ScalarOps) {
6859 MachineFunction &MF = *LoopBB.getParent();
6860 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6861 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6863 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6864
6866 Register CondReg;
6867
6868 for (MachineOperand *ScalarOp : ScalarOps) {
6869 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6870 unsigned NumSubRegs = RegSize / 32;
6871 Register VScalarOp = ScalarOp->getReg();
6872
6873 if (NumSubRegs == 1) {
6874 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6875
6876 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6877 .addReg(VScalarOp);
6878
6879 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6880
6881 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6882 .addReg(CurReg)
6883 .addReg(VScalarOp);
6884
6885 // Combine the comparison results with AND.
6886 if (!CondReg) // First.
6887 CondReg = NewCondReg;
6888 else { // If not the first, we create an AND.
6889 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6890 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6891 .addReg(CondReg)
6892 .addReg(NewCondReg);
6893 CondReg = AndReg;
6894 }
6895
6896 // Update ScalarOp operand to use the SGPR ScalarOp.
6897 ScalarOp->setReg(CurReg);
6898 ScalarOp->setIsKill();
6899 } else {
6900 SmallVector<Register, 8> ReadlanePieces;
6901 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6902 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6903 "Unhandled register size");
6904
6905 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6906 Register CurRegLo =
6907 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6908 Register CurRegHi =
6909 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6910
6911 // Read the next variant <- also loop target.
6912 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6913 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6914
6915 // Read the next variant <- also loop target.
6916 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6917 .addReg(VScalarOp, VScalarOpUndef,
6918 TRI->getSubRegFromChannel(Idx + 1));
6919
6920 ReadlanePieces.push_back(CurRegLo);
6921 ReadlanePieces.push_back(CurRegHi);
6922
6923 // Comparison is to be done as 64-bit.
6924 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6925 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6926 .addReg(CurRegLo)
6927 .addImm(AMDGPU::sub0)
6928 .addReg(CurRegHi)
6929 .addImm(AMDGPU::sub1);
6930
6931 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6932 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6933 NewCondReg)
6934 .addReg(CurReg);
6935 if (NumSubRegs <= 2)
6936 Cmp.addReg(VScalarOp);
6937 else
6938 Cmp.addReg(VScalarOp, VScalarOpUndef,
6939 TRI->getSubRegFromChannel(Idx, 2));
6940
6941 // Combine the comparison results with AND.
6942 if (!CondReg) // First.
6943 CondReg = NewCondReg;
6944 else { // If not the first, we create an AND.
6945 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6946 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6947 .addReg(CondReg)
6948 .addReg(NewCondReg);
6949 CondReg = AndReg;
6950 }
6951 } // End for loop.
6952
6953 const auto *SScalarOpRC =
6954 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6955 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6956
6957 // Build scalar ScalarOp.
6958 auto Merge =
6959 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6960 unsigned Channel = 0;
6961 for (Register Piece : ReadlanePieces) {
6962 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6963 }
6964
6965 // Update ScalarOp operand to use the SGPR ScalarOp.
6966 ScalarOp->setReg(SScalarOp);
6967 ScalarOp->setIsKill();
6968 }
6969 }
6970
6971 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6972 MRI.setSimpleHint(SaveExec, CondReg);
6973
6974 // Update EXEC to matching lanes, saving original to SaveExec.
6975 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6976 .addReg(CondReg, RegState::Kill);
6977
6978 // The original instruction is here; we insert the terminators after it.
6979 I = BodyBB.end();
6980
6981 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6982 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
6983 .addReg(LMC.ExecReg)
6984 .addReg(SaveExec);
6985
6986 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6987}
6988
6989// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6990// with SGPRs by iterating over all unique values across all lanes.
6991// Returns the loop basic block that now contains \p MI.
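// The containing block is split roughly as follows (illustrative):
//
//   MBB:         save EXEC (and SCC if it is live)
//   LoopBB:      readfirstlane the scalar operands, compare, and_saveexec  <-+
//   BodyBB:      \p MI itself, xor_term EXEC, SI_WATERFALL_LOOP back --------+
//   RemainderBB: restore SCC and EXEC, then the rest of the original block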
6992static MachineBasicBlock *
6996 MachineBasicBlock::iterator Begin = nullptr,
6997 MachineBasicBlock::iterator End = nullptr) {
6998 MachineBasicBlock &MBB = *MI.getParent();
6999 MachineFunction &MF = *MBB.getParent();
7000 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7001 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7003 if (!Begin.isValid())
7004 Begin = &MI;
7005 if (!End.isValid()) {
7006 End = &MI;
7007 ++End;
7008 }
7009 const DebugLoc &DL = MI.getDebugLoc();
7011 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7012
 7013 // Save SCC. The waterfall loop may overwrite SCC.
7014 Register SaveSCCReg;
7015
 7016 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
 7017 // rather than doing an unbounded scan everywhere.
7018 bool SCCNotDead =
7019 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7020 std::numeric_limits<unsigned>::max()) !=
7022 if (SCCNotDead) {
7023 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7024 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7025 .addImm(1)
7026 .addImm(0);
7027 }
7028
7029 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7030
7031 // Save the EXEC mask
7032 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7033
7034 // Killed uses in the instruction we are waterfalling around will be
7035 // incorrect due to the added control-flow.
7037 ++AfterMI;
7038 for (auto I = Begin; I != AfterMI; I++) {
7039 for (auto &MO : I->all_uses())
7040 MRI.clearKillFlags(MO.getReg());
7041 }
7042
7043 // To insert the loop we need to split the block. Move everything after this
7044 // point to a new block, and insert a new empty block between the two.
7047 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7049 ++MBBI;
7050
7051 MF.insert(MBBI, LoopBB);
7052 MF.insert(MBBI, BodyBB);
7053 MF.insert(MBBI, RemainderBB);
7054
7055 LoopBB->addSuccessor(BodyBB);
7056 BodyBB->addSuccessor(LoopBB);
7057 BodyBB->addSuccessor(RemainderBB);
7058
 7059 // Move the range from Begin to MI into BodyBB, and the remainder of the
 7060 // block into RemainderBB.
7061 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7062 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7063 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7064
7065 MBB.addSuccessor(LoopBB);
7066
7067 // Update dominators. We know that MBB immediately dominates LoopBB, that
7068 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7069 // RemainderBB. RemainderBB immediately dominates all of the successors
7070 // transferred to it from MBB that MBB used to properly dominate.
7071 if (MDT) {
7072 MDT->addNewBlock(LoopBB, &MBB);
7073 MDT->addNewBlock(BodyBB, LoopBB);
7074 MDT->addNewBlock(RemainderBB, BodyBB);
7075 for (auto &Succ : RemainderBB->successors()) {
7076 if (MDT->properlyDominates(&MBB, Succ)) {
7077 MDT->changeImmediateDominator(Succ, RemainderBB);
7078 }
7079 }
7080 }
7081
7082 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7083
7084 MachineBasicBlock::iterator First = RemainderBB->begin();
7085 // Restore SCC
7086 if (SCCNotDead) {
7087 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7088 .addReg(SaveSCCReg, RegState::Kill)
7089 .addImm(0);
7090 }
7091
7092 // Restore the EXEC mask
7093 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7094 .addReg(SaveExec);
7095 return BodyBB;
7096}
7097
7098// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
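// The 128-bit descriptor is split roughly as (illustrative):
//   RsrcPtr  = Rsrc.sub0_sub1                          ; the 64-bit base pointer
//   NewSRsrc = { 0 (64-bit), RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
//              i.e. a zeroed base with the default data format in dwords 2-3.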
7099static std::tuple<unsigned, unsigned>
7101 MachineBasicBlock &MBB = *MI.getParent();
7102 MachineFunction &MF = *MBB.getParent();
7104
7105 // Extract the ptr from the resource descriptor.
7106 unsigned RsrcPtr =
7107 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7108 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7109
7110 // Create an empty resource descriptor
7111 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7112 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7113 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7114 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7115 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7116
7117 // Zero64 = 0
7118 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7119 .addImm(0);
7120
7121 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7122 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7123 .addImm(Lo_32(RsrcDataFormat));
7124
7125 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7126 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7127 .addImm(Hi_32(RsrcDataFormat));
7128
7129 // NewSRsrc = {Zero64, SRsrcFormat}
7130 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7131 .addReg(Zero64)
7132 .addImm(AMDGPU::sub0_sub1)
7133 .addReg(SRsrcFormatLo)
7134 .addImm(AMDGPU::sub2)
7135 .addReg(SRsrcFormatHi)
7136 .addImm(AMDGPU::sub3);
7137
7138 return std::tuple(RsrcPtr, NewSRsrc);
7139}
7140
7143 MachineDominatorTree *MDT) const {
7144 MachineFunction &MF = *MI.getParent()->getParent();
7146 MachineBasicBlock *CreatedBB = nullptr;
7147
7148 // Legalize VOP2
7149 if (isVOP2(MI) || isVOPC(MI)) {
7151 return CreatedBB;
7152 }
7153
7154 // Legalize VOP3
7155 if (isVOP3(MI)) {
7157 return CreatedBB;
7158 }
7159
7160 // Legalize SMRD
7161 if (isSMRD(MI)) {
7163 return CreatedBB;
7164 }
7165
7166 // Legalize FLAT
7167 if (isFLAT(MI)) {
7169 return CreatedBB;
7170 }
7171
7172 // Legalize REG_SEQUENCE and PHI
 7173 // The register class of the operands must be the same type as the register
 7174 // class of the output.
7175 if (MI.getOpcode() == AMDGPU::PHI) {
7176 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7177 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7178 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7179 continue;
7180 const TargetRegisterClass *OpRC =
7181 MRI.getRegClass(MI.getOperand(i).getReg());
7182 if (RI.hasVectorRegisters(OpRC)) {
7183 VRC = OpRC;
7184 } else {
7185 SRC = OpRC;
7186 }
7187 }
7188
 7189 // If any of the operands are VGPR registers, then they all must be VGPRs;
 7190 // otherwise we will create illegal VGPR->SGPR copies when legalizing
 7191 // them.
7192 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7193 if (!VRC) {
7194 assert(SRC);
7195 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7196 VRC = &AMDGPU::VReg_1RegClass;
7197 } else
7198 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7199 ? RI.getEquivalentAGPRClass(SRC)
7200 : RI.getEquivalentVGPRClass(SRC);
7201 } else {
7202 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7203 ? RI.getEquivalentAGPRClass(VRC)
7204 : RI.getEquivalentVGPRClass(VRC);
7205 }
7206 RC = VRC;
7207 } else {
7208 RC = SRC;
7209 }
7210
7211 // Update all the operands so they have the same type.
7212 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7213 MachineOperand &Op = MI.getOperand(I);
7214 if (!Op.isReg() || !Op.getReg().isVirtual())
7215 continue;
7216
7217 // MI is a PHI instruction.
7218 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7220
7221 // Avoid creating no-op copies with the same src and dst reg class. These
7222 // confuse some of the machine passes.
7223 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7224 }
7225 }
7226
7227 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7228 // VGPR dest type and SGPR sources, insert copies so all operands are
7229 // VGPRs. This seems to help operand folding / the register coalescer.
7230 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7231 MachineBasicBlock *MBB = MI.getParent();
7232 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7233 if (RI.hasVGPRs(DstRC)) {
7234 // Update all the operands so they are VGPR register classes. These may
7235 // not be the same register class because REG_SEQUENCE supports mixing
7236 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7237 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7238 MachineOperand &Op = MI.getOperand(I);
7239 if (!Op.isReg() || !Op.getReg().isVirtual())
7240 continue;
7241
7242 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7243 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7244 if (VRC == OpRC)
7245 continue;
7246
7247 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7248 Op.setIsKill();
7249 }
7250 }
7251
7252 return CreatedBB;
7253 }
7254
7255 // Legalize INSERT_SUBREG
7256 // src0 must have the same register class as dst
7257 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7258 Register Dst = MI.getOperand(0).getReg();
7259 Register Src0 = MI.getOperand(1).getReg();
7260 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7261 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7262 if (DstRC != Src0RC) {
7263 MachineBasicBlock *MBB = MI.getParent();
7264 MachineOperand &Op = MI.getOperand(1);
7265 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7266 }
7267 return CreatedBB;
7268 }
7269
7270 // Legalize SI_INIT_M0
7271 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7272 MachineOperand &Src = MI.getOperand(0);
7273 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7274 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7275 return CreatedBB;
7276 }
7277
7278 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7279 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7280 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7281 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7282 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7283 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7284 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7285 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7286 MachineOperand &Src = MI.getOperand(1);
7287 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7288 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7289 return CreatedBB;
7290 }
7291
7292 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7293 //
7294 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7295 // scratch memory access. In both cases, the legalization never involves
7296 // conversion to the addr64 form.
7298 (isMUBUF(MI) || isMTBUF(MI)))) {
7299 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7300 ? AMDGPU::OpName::rsrc
7301 : AMDGPU::OpName::srsrc;
7302 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7303 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7304 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7305
7306 AMDGPU::OpName SampOpName =
7307 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7308 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7309 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7310 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7311
7312 return CreatedBB;
7313 }
7314
7315 // Legalize SI_CALL
7316 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7317 MachineOperand *Dest = &MI.getOperand(0);
7318 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
 7319 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN plus the
 7320 // following copies into the loop block; copies from and to physical
 7321 // registers need to be moved as well.
7322 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7323 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7324
7325 // Also move the copies to physical registers into the loop block
7326 MachineBasicBlock &MBB = *MI.getParent();
7328 while (Start->getOpcode() != FrameSetupOpcode)
7329 --Start;
7331 while (End->getOpcode() != FrameDestroyOpcode)
7332 ++End;
7333 // Also include following copies of the return value
7334 ++End;
7335 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7336 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7337 ++End;
7338 CreatedBB =
7339 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7340 }
7341 }
7342
7343 // Legalize s_sleep_var.
7344 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7345 const DebugLoc &DL = MI.getDebugLoc();
7346 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7347 int Src0Idx =
7348 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7349 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7350 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7351 .add(Src0);
7352 Src0.ChangeToRegister(Reg, false);
7353 return nullptr;
7354 }
7355
7356 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7357 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7358 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7359 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7360 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7361 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7362 for (MachineOperand &Src : MI.explicit_operands()) {
7363 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7364 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7365 }
7366 return CreatedBB;
7367 }
7368
7369 // Legalize MUBUF instructions.
7370 bool isSoffsetLegal = true;
7371 int SoffsetIdx =
7372 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7373 if (SoffsetIdx != -1) {
7374 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7375 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7376 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7377 isSoffsetLegal = false;
7378 }
7379 }
7380
7381 bool isRsrcLegal = true;
7382 int RsrcIdx =
7383 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7384 if (RsrcIdx != -1) {
7385 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7386 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7387 isRsrcLegal = false;
7388 }
7389
7390 // The operands are legal.
7391 if (isRsrcLegal && isSoffsetLegal)
7392 return CreatedBB;
7393
7394 if (!isRsrcLegal) {
7395 // Legalize a VGPR Rsrc
7396 //
7397 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7398 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7399 // a zero-value SRsrc.
7400 //
7401 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7402 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7403 // above.
7404 //
7405 // Otherwise we are on non-ADDR64 hardware, and/or we have
7406 // idxen/offen/bothen and we fall back to a waterfall loop.
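    //
    // As a rough sketch of the ADDR64 rewrite (illustrative, not exact MIR):
    //   BUFFER_LOAD_* ..., %vaddr, %vgpr_rsrc, %soffset, ...
    //     =>
    //   %ptr       = %vgpr_rsrc.sub0_sub1
    //   %newvaddr  = %vaddr + %ptr               ; 64-bit add via two VALU adds
    //   %zero_rsrc = { 0, 0, RSRC_DATA_FORMAT }  ; from extractRsrcPtr()
    //   BUFFER_LOAD_*_ADDR64 ..., %newvaddr, %zero_rsrc, %soffset, ...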
7407
7408 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7409 MachineBasicBlock &MBB = *MI.getParent();
7410
7411 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7412 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7413 // This is already an ADDR64 instruction so we need to add the pointer
7414 // extracted from the resource descriptor to the current value of VAddr.
7415 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7416 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7417 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7418
7419 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7420 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7421 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7422
7423 unsigned RsrcPtr, NewSRsrc;
7424 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7425
7426 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7427 const DebugLoc &DL = MI.getDebugLoc();
7428 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7429 .addDef(CondReg0)
7430 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7431 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7432 .addImm(0);
7433
7434 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7435 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7436 .addDef(CondReg1, RegState::Dead)
7437 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7438 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7439 .addReg(CondReg0, RegState::Kill)
7440 .addImm(0);
7441
7442 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7443 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7444 .addReg(NewVAddrLo)
7445 .addImm(AMDGPU::sub0)
7446 .addReg(NewVAddrHi)
7447 .addImm(AMDGPU::sub1);
7448
7449 VAddr->setReg(NewVAddr);
7450 Rsrc->setReg(NewSRsrc);
7451 } else if (!VAddr && ST.hasAddr64()) {
 7452 // This instruction is the _OFFSET variant, so we need to convert it to
 7453 // ADDR64.
7454 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7455 "FIXME: Need to emit flat atomics here");
7456
7457 unsigned RsrcPtr, NewSRsrc;
7458 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7459
7460 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7461 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7462 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7463 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7464 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7465
7466 // Atomics with return have an additional tied operand and are
7467 // missing some of the special bits.
7468 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7469 MachineInstr *Addr64;
7470
7471 if (!VDataIn) {
7472 // Regular buffer load / store.
7474 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7475 .add(*VData)
7476 .addReg(NewVAddr)
7477 .addReg(NewSRsrc)
7478 .add(*SOffset)
7479 .add(*Offset);
7480
7481 if (const MachineOperand *CPol =
7482 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7483 MIB.addImm(CPol->getImm());
7484 }
7485
7486 if (const MachineOperand *TFE =
7487 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7488 MIB.addImm(TFE->getImm());
7489 }
7490
7491 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7492
7493 MIB.cloneMemRefs(MI);
7494 Addr64 = MIB;
7495 } else {
7496 // Atomics with return.
7497 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7498 .add(*VData)
7499 .add(*VDataIn)
7500 .addReg(NewVAddr)
7501 .addReg(NewSRsrc)
7502 .add(*SOffset)
7503 .add(*Offset)
7504 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7505 .cloneMemRefs(MI);
7506 }
7507
7508 MI.removeFromParent();
7509
7510 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7511 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7512 NewVAddr)
7513 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7514 .addImm(AMDGPU::sub0)
7515 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7516 .addImm(AMDGPU::sub1);
7517 } else {
7518 // Legalize a VGPR Rsrc and soffset together.
7519 if (!isSoffsetLegal) {
7520 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7521 CreatedBB =
7522 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7523 return CreatedBB;
7524 }
7525 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7526 return CreatedBB;
7527 }
7528 }
7529
7530 // Legalize a VGPR soffset.
7531 if (!isSoffsetLegal) {
7532 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7533 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7534 return CreatedBB;
7535 }
7536 return CreatedBB;
7537}
7538
7540 InstrList.insert(MI);
 7541 // Add MBUF instructions to the deferred list.
7542 int RsrcIdx =
7543 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7544 if (RsrcIdx != -1) {
7545 DeferredList.insert(MI);
7546 }
7547}
7548
7550 return DeferredList.contains(MI);
7551}
7552
 7553 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
 7554 // lowering (changing sgpr to vgpr).
 7555 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
 7556 // different sizes. We need to legalize the size of the operands during the
 7557 // vgpr lowering chain. This can be removed after we have sgpr16 in place.
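// Two directions are handled below (illustrative): if the operand is a 32-bit
// VGPR but the instruction expects a 16-bit register, we narrow it to its lo16
// subregister; if it is a 16-bit VGPR but a 32-bit register is expected, we
// widen it with a REG_SEQUENCE whose hi16 half is an IMPLICIT_DEF.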
7559 MachineRegisterInfo &MRI) const {
7560 if (!ST.useRealTrue16Insts())
7561 return;
7562
7563 unsigned Opcode = MI.getOpcode();
7564 MachineBasicBlock *MBB = MI.getParent();
7565 // Legalize operands and check for size mismatch
7566 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7567 OpIdx >= get(Opcode).getNumOperands() ||
7568 get(Opcode).operands()[OpIdx].RegClass == -1)
7569 return;
7570
7571 MachineOperand &Op = MI.getOperand(OpIdx);
7572 if (!Op.isReg() || !Op.getReg().isVirtual())
7573 return;
7574
7575 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7576 if (!RI.isVGPRClass(CurrRC))
7577 return;
7578
7579 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7580 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7581 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7582 Op.setSubReg(AMDGPU::lo16);
7583 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7584 const DebugLoc &DL = MI.getDebugLoc();
7585 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7586 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7587 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7588 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7589 .addReg(Op.getReg())
7590 .addImm(AMDGPU::lo16)
7591 .addReg(Undef)
7592 .addImm(AMDGPU::hi16);
7593 Op.setReg(NewDstReg);
7594 }
7595}
7597 MachineRegisterInfo &MRI) const {
7598 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7600}
7601
7603 MachineDominatorTree *MDT) const {
7604
7605 while (!Worklist.empty()) {
7606 MachineInstr &Inst = *Worklist.top();
7607 Worklist.erase_top();
7608 // Skip MachineInstr in the deferred list.
7609 if (Worklist.isDeferred(&Inst))
7610 continue;
7611 moveToVALUImpl(Worklist, MDT, Inst);
7612 }
7613
 7614 // The deferred list of instructions is processed once all the MachineInstrs
 7615 // in the worklist are done.
7616 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7617 moveToVALUImpl(Worklist, MDT, *Inst);
7618 assert(Worklist.empty() &&
7619 "Deferred MachineInstr are not supposed to re-populate worklist");
7620 }
7621}
7622
7625 MachineInstr &Inst) const {
7626
7628 if (!MBB)
7629 return;
7630 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7631 unsigned Opcode = Inst.getOpcode();
7632 unsigned NewOpcode = getVALUOp(Inst);
7633 // Handle some special cases
7634 switch (Opcode) {
7635 default:
7636 break;
7637 case AMDGPU::S_ADD_I32:
7638 case AMDGPU::S_SUB_I32: {
7639 // FIXME: The u32 versions currently selected use the carry.
7640 bool Changed;
7641 MachineBasicBlock *CreatedBBTmp = nullptr;
7642 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7643 if (Changed)
7644 return;
7645
7646 // Default handling
7647 break;
7648 }
7649
7650 case AMDGPU::S_MUL_U64:
7651 if (ST.hasVectorMulU64()) {
7652 NewOpcode = AMDGPU::V_MUL_U64_e64;
7653 break;
7654 }
 7655 // Split s_mul_u64 into 32-bit vector multiplications.
7656 splitScalarSMulU64(Worklist, Inst, MDT);
7657 Inst.eraseFromParent();
7658 return;
7659
7660 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7661 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7662 // This is a special case of s_mul_u64 where all the operands are either
7663 // zero extended or sign extended.
7664 splitScalarSMulPseudo(Worklist, Inst, MDT);
7665 Inst.eraseFromParent();
7666 return;
7667
7668 case AMDGPU::S_AND_B64:
7669 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7670 Inst.eraseFromParent();
7671 return;
7672
7673 case AMDGPU::S_OR_B64:
7674 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7675 Inst.eraseFromParent();
7676 return;
7677
7678 case AMDGPU::S_XOR_B64:
7679 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7680 Inst.eraseFromParent();
7681 return;
7682
7683 case AMDGPU::S_NAND_B64:
7684 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7685 Inst.eraseFromParent();
7686 return;
7687
7688 case AMDGPU::S_NOR_B64:
7689 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7690 Inst.eraseFromParent();
7691 return;
7692
7693 case AMDGPU::S_XNOR_B64:
7694 if (ST.hasDLInsts())
7695 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7696 else
7697 splitScalar64BitXnor(Worklist, Inst, MDT);
7698 Inst.eraseFromParent();
7699 return;
7700
7701 case AMDGPU::S_ANDN2_B64:
7702 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7703 Inst.eraseFromParent();
7704 return;
7705
7706 case AMDGPU::S_ORN2_B64:
7707 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7708 Inst.eraseFromParent();
7709 return;
7710
7711 case AMDGPU::S_BREV_B64:
7712 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7713 Inst.eraseFromParent();
7714 return;
7715
7716 case AMDGPU::S_NOT_B64:
7717 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7718 Inst.eraseFromParent();
7719 return;
7720
7721 case AMDGPU::S_BCNT1_I32_B64:
7722 splitScalar64BitBCNT(Worklist, Inst);
7723 Inst.eraseFromParent();
7724 return;
7725
7726 case AMDGPU::S_BFE_I64:
7727 splitScalar64BitBFE(Worklist, Inst);
7728 Inst.eraseFromParent();
7729 return;
7730
7731 case AMDGPU::S_FLBIT_I32_B64:
7732 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7733 Inst.eraseFromParent();
7734 return;
7735 case AMDGPU::S_FF1_I32_B64:
7736 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7737 Inst.eraseFromParent();
7738 return;
7739
7740 case AMDGPU::S_LSHL_B32:
7741 if (ST.hasOnlyRevVALUShifts()) {
7742 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7743 swapOperands(Inst);
7744 }
7745 break;
7746 case AMDGPU::S_ASHR_I32:
7747 if (ST.hasOnlyRevVALUShifts()) {
7748 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7749 swapOperands(Inst);
7750 }
7751 break;
7752 case AMDGPU::S_LSHR_B32:
7753 if (ST.hasOnlyRevVALUShifts()) {
7754 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7755 swapOperands(Inst);
7756 }
7757 break;
7758 case AMDGPU::S_LSHL_B64:
7759 if (ST.hasOnlyRevVALUShifts()) {
7760 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7761 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7762 : AMDGPU::V_LSHLREV_B64_e64;
7763 swapOperands(Inst);
7764 }
7765 break;
7766 case AMDGPU::S_ASHR_I64:
7767 if (ST.hasOnlyRevVALUShifts()) {
7768 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7769 swapOperands(Inst);
7770 }
7771 break;
7772 case AMDGPU::S_LSHR_B64:
7773 if (ST.hasOnlyRevVALUShifts()) {
7774 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7775 swapOperands(Inst);
7776 }
7777 break;
7778
7779 case AMDGPU::S_ABS_I32:
7780 lowerScalarAbs(Worklist, Inst);
7781 Inst.eraseFromParent();
7782 return;
7783
7784 case AMDGPU::S_CBRANCH_SCC0:
7785 case AMDGPU::S_CBRANCH_SCC1: {
7786 // Clear unused bits of vcc
7787 Register CondReg = Inst.getOperand(1).getReg();
7788 bool IsSCC = CondReg == AMDGPU::SCC;
7790 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7791 .addReg(LMC.ExecReg)
7792 .addReg(IsSCC ? LMC.VccReg : CondReg);
7793 Inst.removeOperand(1);
7794 } break;
7795
7796 case AMDGPU::S_BFE_U64:
7797 case AMDGPU::S_BFM_B64:
7798 llvm_unreachable("Moving this op to VALU not implemented");
7799
7800 case AMDGPU::S_PACK_LL_B32_B16:
7801 case AMDGPU::S_PACK_LH_B32_B16:
7802 case AMDGPU::S_PACK_HL_B32_B16:
7803 case AMDGPU::S_PACK_HH_B32_B16:
7804 movePackToVALU(Worklist, MRI, Inst);
7805 Inst.eraseFromParent();
7806 return;
7807
7808 case AMDGPU::S_XNOR_B32:
7809 lowerScalarXnor(Worklist, Inst);
7810 Inst.eraseFromParent();
7811 return;
7812
7813 case AMDGPU::S_NAND_B32:
7814 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7815 Inst.eraseFromParent();
7816 return;
7817
7818 case AMDGPU::S_NOR_B32:
7819 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7820 Inst.eraseFromParent();
7821 return;
7822
7823 case AMDGPU::S_ANDN2_B32:
7824 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7825 Inst.eraseFromParent();
7826 return;
7827
7828 case AMDGPU::S_ORN2_B32:
7829 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7830 Inst.eraseFromParent();
7831 return;
7832
7833 // TODO: remove as soon as everything is ready
7834 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7835 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7836 // can only be selected from the uniform SDNode.
7837 case AMDGPU::S_ADD_CO_PSEUDO:
7838 case AMDGPU::S_SUB_CO_PSEUDO: {
7839 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7840 ? AMDGPU::V_ADDC_U32_e64
7841 : AMDGPU::V_SUBB_U32_e64;
7842 const auto *CarryRC = RI.getWaveMaskRegClass();
7843
7844 Register CarryInReg = Inst.getOperand(4).getReg();
7845 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7846 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7847 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7848 .addReg(CarryInReg);
7849 }
7850
7851 Register CarryOutReg = Inst.getOperand(1).getReg();
7852
7853 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7854 MRI.getRegClass(Inst.getOperand(0).getReg())));
7855 MachineInstr *CarryOp =
7856 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7857 .addReg(CarryOutReg, RegState::Define)
7858 .add(Inst.getOperand(2))
7859 .add(Inst.getOperand(3))
7860 .addReg(CarryInReg)
7861 .addImm(0);
7862 legalizeOperands(*CarryOp);
7863 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7864 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7865 Inst.eraseFromParent();
7866 }
7867 return;
7868 case AMDGPU::S_UADDO_PSEUDO:
7869 case AMDGPU::S_USUBO_PSEUDO: {
7870 const DebugLoc &DL = Inst.getDebugLoc();
7871 MachineOperand &Dest0 = Inst.getOperand(0);
7872 MachineOperand &Dest1 = Inst.getOperand(1);
7873 MachineOperand &Src0 = Inst.getOperand(2);
7874 MachineOperand &Src1 = Inst.getOperand(3);
7875
7876 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7877 ? AMDGPU::V_ADD_CO_U32_e64
7878 : AMDGPU::V_SUB_CO_U32_e64;
7879 const TargetRegisterClass *NewRC =
7880 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7881 Register DestReg = MRI.createVirtualRegister(NewRC);
7882 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7883 .addReg(Dest1.getReg(), RegState::Define)
7884 .add(Src0)
7885 .add(Src1)
7886 .addImm(0); // clamp bit
7887
7888 legalizeOperands(*NewInstr, MDT);
7889 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7890 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7891 Worklist);
7892 Inst.eraseFromParent();
7893 }
7894 return;
7895
7896 case AMDGPU::S_CSELECT_B32:
7897 case AMDGPU::S_CSELECT_B64:
7898 lowerSelect(Worklist, Inst, MDT);
7899 Inst.eraseFromParent();
7900 return;
7901 case AMDGPU::S_CMP_EQ_I32:
7902 case AMDGPU::S_CMP_LG_I32:
7903 case AMDGPU::S_CMP_GT_I32:
7904 case AMDGPU::S_CMP_GE_I32:
7905 case AMDGPU::S_CMP_LT_I32:
7906 case AMDGPU::S_CMP_LE_I32:
7907 case AMDGPU::S_CMP_EQ_U32:
7908 case AMDGPU::S_CMP_LG_U32:
7909 case AMDGPU::S_CMP_GT_U32:
7910 case AMDGPU::S_CMP_GE_U32:
7911 case AMDGPU::S_CMP_LT_U32:
7912 case AMDGPU::S_CMP_LE_U32:
7913 case AMDGPU::S_CMP_EQ_U64:
7914 case AMDGPU::S_CMP_LG_U64:
7915 case AMDGPU::S_CMP_LT_F32:
7916 case AMDGPU::S_CMP_EQ_F32:
7917 case AMDGPU::S_CMP_LE_F32:
7918 case AMDGPU::S_CMP_GT_F32:
7919 case AMDGPU::S_CMP_LG_F32:
7920 case AMDGPU::S_CMP_GE_F32:
7921 case AMDGPU::S_CMP_O_F32:
7922 case AMDGPU::S_CMP_U_F32:
7923 case AMDGPU::S_CMP_NGE_F32:
7924 case AMDGPU::S_CMP_NLG_F32:
7925 case AMDGPU::S_CMP_NGT_F32:
7926 case AMDGPU::S_CMP_NLE_F32:
7927 case AMDGPU::S_CMP_NEQ_F32:
7928 case AMDGPU::S_CMP_NLT_F32: {
7929 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7930 auto NewInstr =
7931 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7932 .setMIFlags(Inst.getFlags());
7933 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7934 0) {
7935 NewInstr
7936 .addImm(0) // src0_modifiers
7937 .add(Inst.getOperand(0)) // src0
7938 .addImm(0) // src1_modifiers
7939 .add(Inst.getOperand(1)) // src1
7940 .addImm(0); // clamp
7941 } else {
7942 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7943 }
7944 legalizeOperands(*NewInstr, MDT);
7945 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7946 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7947 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7948 Inst.eraseFromParent();
7949 return;
7950 }
7951 case AMDGPU::S_CMP_LT_F16:
7952 case AMDGPU::S_CMP_EQ_F16:
7953 case AMDGPU::S_CMP_LE_F16:
7954 case AMDGPU::S_CMP_GT_F16:
7955 case AMDGPU::S_CMP_LG_F16:
7956 case AMDGPU::S_CMP_GE_F16:
7957 case AMDGPU::S_CMP_O_F16:
7958 case AMDGPU::S_CMP_U_F16:
7959 case AMDGPU::S_CMP_NGE_F16:
7960 case AMDGPU::S_CMP_NLG_F16:
7961 case AMDGPU::S_CMP_NGT_F16:
7962 case AMDGPU::S_CMP_NLE_F16:
7963 case AMDGPU::S_CMP_NEQ_F16:
7964 case AMDGPU::S_CMP_NLT_F16: {
7965 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7966 auto NewInstr =
7967 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7968 .setMIFlags(Inst.getFlags());
7969 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7970 NewInstr
7971 .addImm(0) // src0_modifiers
7972 .add(Inst.getOperand(0)) // src0
7973 .addImm(0) // src1_modifiers
7974 .add(Inst.getOperand(1)) // src1
7975 .addImm(0); // clamp
7976 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7977 NewInstr.addImm(0); // op_sel0
7978 } else {
7979 NewInstr
7980 .add(Inst.getOperand(0))
7981 .add(Inst.getOperand(1));
7982 }
7983 legalizeOperandsVALUt16(*NewInstr, MRI);
7984 legalizeOperands(*NewInstr, MDT);
7985 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7986 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7987 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7988 Inst.eraseFromParent();
7989 return;
7990 }
7991 case AMDGPU::S_CVT_HI_F32_F16: {
7992 const DebugLoc &DL = Inst.getDebugLoc();
7993 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7994 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7995 if (ST.useRealTrue16Insts()) {
7996 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7997 .add(Inst.getOperand(1));
7998 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7999 .addImm(0) // src0_modifiers
8000 .addReg(TmpReg, 0, AMDGPU::hi16)
8001 .addImm(0) // clamp
8002 .addImm(0) // omod
8003 .addImm(0); // op_sel0
8004 } else {
8005 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8006 .addImm(16)
8007 .add(Inst.getOperand(1));
8008 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8009 .addImm(0) // src0_modifiers
8010 .addReg(TmpReg)
8011 .addImm(0) // clamp
8012 .addImm(0); // omod
8013 }
8014
8015 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8016 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8017 Inst.eraseFromParent();
8018 return;
8019 }
8020 case AMDGPU::S_MINIMUM_F32:
8021 case AMDGPU::S_MAXIMUM_F32: {
8022 const DebugLoc &DL = Inst.getDebugLoc();
8023 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8024 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8025 .addImm(0) // src0_modifiers
8026 .add(Inst.getOperand(1))
8027 .addImm(0) // src1_modifiers
8028 .add(Inst.getOperand(2))
8029 .addImm(0) // clamp
8030 .addImm(0); // omod
8031 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8032
8033 legalizeOperands(*NewInstr, MDT);
8034 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8035 Inst.eraseFromParent();
8036 return;
8037 }
8038 case AMDGPU::S_MINIMUM_F16:
8039 case AMDGPU::S_MAXIMUM_F16: {
8040 const DebugLoc &DL = Inst.getDebugLoc();
8041 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8042 ? &AMDGPU::VGPR_16RegClass
8043 : &AMDGPU::VGPR_32RegClass);
8044 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8045 .addImm(0) // src0_modifiers
8046 .add(Inst.getOperand(1))
8047 .addImm(0) // src1_modifiers
8048 .add(Inst.getOperand(2))
8049 .addImm(0) // clamp
8050 .addImm(0) // omod
8051 .addImm(0); // opsel0
8052 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8053 legalizeOperandsVALUt16(*NewInstr, MRI);
8054 legalizeOperands(*NewInstr, MDT);
8055 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8056 Inst.eraseFromParent();
8057 return;
8058 }
8059 case AMDGPU::V_S_EXP_F16_e64:
8060 case AMDGPU::V_S_LOG_F16_e64:
8061 case AMDGPU::V_S_RCP_F16_e64:
8062 case AMDGPU::V_S_RSQ_F16_e64:
8063 case AMDGPU::V_S_SQRT_F16_e64: {
8064 const DebugLoc &DL = Inst.getDebugLoc();
8065 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8066 ? &AMDGPU::VGPR_16RegClass
8067 : &AMDGPU::VGPR_32RegClass);
8068 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8069 .add(Inst.getOperand(1)) // src0_modifiers
8070 .add(Inst.getOperand(2))
8071 .add(Inst.getOperand(3)) // clamp
8072 .add(Inst.getOperand(4)) // omod
8073 .setMIFlags(Inst.getFlags());
8074 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8075 NewInstr.addImm(0); // opsel0
8076 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8077 legalizeOperandsVALUt16(*NewInstr, MRI);
8078 legalizeOperands(*NewInstr, MDT);
8079 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8080 Inst.eraseFromParent();
8081 return;
8082 }
8083 }
8084
8085 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8086 // We cannot move this instruction to the VALU, so we should try to
8087 // legalize its operands instead.
8088 legalizeOperands(Inst, MDT);
8089 return;
8090 }
8091 // Handle converting generic instructions like COPY-to-SGPR into
8092 // COPY-to-VGPR.
8093 if (NewOpcode == Opcode) {
8094 Register DstReg = Inst.getOperand(0).getReg();
8095 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8096
8097 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8098 // hope for the best.
8099 if (Inst.isCopy() && DstReg.isPhysical() &&
8100 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8101 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8102 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8103 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8104 .add(Inst.getOperand(1));
8105 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8106 DstReg)
8107 .addReg(NewDst);
8108
8109 Inst.eraseFromParent();
8110 return;
8111 }
8112
8113 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8114 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
 8115 // Instead of creating a copy where src and dst are the same register
 8116 // class, we just replace all uses of dst with src. These kinds of
 8117 // copies interfere with the heuristics MachineSink uses to decide
 8118 // whether or not to split a critical edge, since the pass assumes
 8119 // that copies will end up as machine instructions and not be
 8120 // eliminated.
8121 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8122 Register NewDstReg = Inst.getOperand(1).getReg();
8123 MRI.replaceRegWith(DstReg, NewDstReg);
8124 MRI.clearKillFlags(NewDstReg);
8125 Inst.getOperand(0).setReg(DstReg);
8126 Inst.eraseFromParent();
 8127 // Legalize t16 operands, since replaceRegWith is called after the users were added to the VALU worklist.
8128 for (MachineOperand &MO :
8129 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8130 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8131 }
8132 return;
8133 }
8134
 8135 // If this is a v2s copy between 16-bit and 32-bit registers, replace the
 8136 // vgpr copy with a reg_sequence/extract_subreg.
 8137 // This can be removed after we have sgpr16 in place.
8138 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8139 Inst.getOperand(1).getReg().isVirtual() &&
8140 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8141 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8142 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8143 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8144 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8145 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8146 get(AMDGPU::IMPLICIT_DEF), Undef);
8147 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8148 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8149 .addReg(Inst.getOperand(1).getReg())
8150 .addImm(AMDGPU::lo16)
8151 .addReg(Undef)
8152 .addImm(AMDGPU::hi16);
8153 Inst.eraseFromParent();
8154 MRI.replaceRegWith(DstReg, NewDstReg);
8155 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8156 return;
8157 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8158 AMDGPU::lo16)) {
8159 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8160 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8161 MRI.replaceRegWith(DstReg, NewDstReg);
8162 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8163 return;
8164 }
8165 }
8166
8167 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8168 MRI.replaceRegWith(DstReg, NewDstReg);
8169 legalizeOperands(Inst, MDT);
8170 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8171 return;
8172 }
8173
8174 // Use the new VALU Opcode.
8175 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8176 .setMIFlags(Inst.getFlags());
8177 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8178 // Intersperse VOP3 modifiers among the SALU operands.
8179 NewInstr->addOperand(Inst.getOperand(0));
8180 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8181 AMDGPU::OpName::src0_modifiers) >= 0)
8182 NewInstr.addImm(0);
8183 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8184 MachineOperand Src = Inst.getOperand(1);
8185 NewInstr->addOperand(Src);
8186 }
8187
8188 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8189 // We are converting these to a BFE, so we need to add the missing
8190 // operands for the size and offset.
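      // E.g. (illustrative) S_SEXT_I32_I8 %x becomes a V_BFE_I32 %x, 0, 8, i.e.
      // a signed extract of bits [7:0] of %x.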
8191 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8192 NewInstr.addImm(0);
8193 NewInstr.addImm(Size);
8194 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8195 // The VALU version adds the second operand to the result, so insert an
8196 // extra 0 operand.
8197 NewInstr.addImm(0);
8198 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8199 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8200 // If we need to move this to VGPRs, we need to unpack the second
8201 // operand back into the 2 separate ones for bit offset and width.
8202 assert(OffsetWidthOp.isImm() &&
8203 "Scalar BFE is only implemented for constant width and offset");
8204 uint32_t Imm = OffsetWidthOp.getImm();
8205
8206 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8207 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
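      // For example (illustrative): Imm = 0x00100008 encodes Offset = 8 and
      // BitWidth = 16, i.e. an extract of bits [23:8] of the source.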
8208 NewInstr.addImm(Offset);
8209 NewInstr.addImm(BitWidth);
8210 } else {
8211 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8212 AMDGPU::OpName::src1_modifiers) >= 0)
8213 NewInstr.addImm(0);
8214 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8215 NewInstr->addOperand(Inst.getOperand(2));
8216 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8217 AMDGPU::OpName::src2_modifiers) >= 0)
8218 NewInstr.addImm(0);
8219 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8220 NewInstr->addOperand(Inst.getOperand(3));
8221 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8222 NewInstr.addImm(0);
8223 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8224 NewInstr.addImm(0);
8225 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8226 NewInstr.addImm(0);
8227 }
8228 } else {
8229 // Just copy the SALU operands.
8230 for (const MachineOperand &Op : Inst.explicit_operands())
8231 NewInstr->addOperand(Op);
8232 }
8233
 8234 // Remove any references to SCC. Vector instructions can't read from it, and
 8235 // we're just about to add the implicit use / defs of VCC, and we don't want
 8236 // both.
8237 for (MachineOperand &Op : Inst.implicit_operands()) {
8238 if (Op.getReg() == AMDGPU::SCC) {
8239 // Only propagate through live-def of SCC.
8240 if (Op.isDef() && !Op.isDead())
8241 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8242 if (Op.isUse())
8243 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8244 }
8245 }
8246 Inst.eraseFromParent();
8247 Register NewDstReg;
8248 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8249 Register DstReg = NewInstr->getOperand(0).getReg();
8250 assert(DstReg.isVirtual());
8251 // Update the destination register class.
8252 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8253 assert(NewDstRC);
8254 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8255 MRI.replaceRegWith(DstReg, NewDstReg);
8256 }
8257 fixImplicitOperands(*NewInstr);
8258
8259 legalizeOperandsVALUt16(*NewInstr, MRI);
8260
8261 // Legalize the operands
8262 legalizeOperands(*NewInstr, MDT);
8263 if (NewDstReg)
8264 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8265}
8266
8267// Add/sub require special handling to deal with carry outs.
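// On subtargets with add-no-carry VALU ops we can rewrite S_ADD_I32/S_SUB_I32
// in place to V_ADD_U32_e64/V_SUB_U32_e64 and drop the SCC def; otherwise we
// return {false, nullptr} and the caller falls back to the default handling.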
8268std::pair<bool, MachineBasicBlock *>
8269SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8270 MachineDominatorTree *MDT) const {
8271 if (ST.hasAddNoCarry()) {
8272 // Assume there is no user of scc since we don't select this in that case.
8273 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8274 // is used.
8275
8276 MachineBasicBlock &MBB = *Inst.getParent();
8277 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8278
8279 Register OldDstReg = Inst.getOperand(0).getReg();
8280 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8281
8282 unsigned Opc = Inst.getOpcode();
8283 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8284
8285 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8286 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8287
8288 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8289 Inst.removeOperand(3);
8290
8291 Inst.setDesc(get(NewOpc));
8292 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8293 Inst.addImplicitDefUseOperands(*MBB.getParent());
8294 MRI.replaceRegWith(OldDstReg, ResultReg);
8295 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8296
8297 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8298 return std::pair(true, NewBB);
8299 }
8300
8301 return std::pair(false, nullptr);
8302}
8303
8304void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8305 MachineDominatorTree *MDT) const {
8306
8307 MachineBasicBlock &MBB = *Inst.getParent();
8308 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8309 MachineBasicBlock::iterator MII = Inst;
8310 DebugLoc DL = Inst.getDebugLoc();
8311
8312 MachineOperand &Dest = Inst.getOperand(0);
8313 MachineOperand &Src0 = Inst.getOperand(1);
8314 MachineOperand &Src1 = Inst.getOperand(2);
8315 MachineOperand &Cond = Inst.getOperand(3);
8316
8317 Register CondReg = Cond.getReg();
8318 bool IsSCC = (CondReg == AMDGPU::SCC);
8319
8320 // If this is a trivial select where the condition is effectively not SCC
8321 // (CondReg is a source of copy to SCC), then the select is semantically
8322 // equivalent to copying CondReg. Hence, there is no need to create
8323 // V_CNDMASK, we can just use that and bail out.
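  // E.g. (illustrative) %dst = S_CSELECT_B32 -1, 0 where the condition register
  // is itself a lane mask that was only copied into SCC: the select just
  // reproduces that mask, so we forward CondReg and let the caller drop the
  // instruction.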
8324 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8325 (Src1.getImm() == 0)) {
8326 MRI.replaceRegWith(Dest.getReg(), CondReg);
8327 return;
8328 }
8329
8330 Register NewCondReg = CondReg;
8331 if (IsSCC) {
8332 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8333 NewCondReg = MRI.createVirtualRegister(TC);
8334
 8335 // Now look for the closest SCC def; if it is a copy, replace CondReg
 8336 // with the COPY's source register.
8337 bool CopyFound = false;
8338 for (MachineInstr &CandI :
8340 Inst.getParent()->rend())) {
8341 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8342 -1) {
8343 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8344 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8345 .addReg(CandI.getOperand(1).getReg());
8346 CopyFound = true;
8347 }
8348 break;
8349 }
8350 }
8351 if (!CopyFound) {
8352 // SCC def is not a copy
8353 // Insert a trivial select instead of creating a copy, because a copy from
8354 // SCC would semantically mean just copying a single bit, but we may need
8355 // the result to be a vector condition mask that needs preserving.
8356 unsigned Opcode =
8357 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8358 auto NewSelect =
8359 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8360 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8361 }
8362 }
8363
8364 Register NewDestReg = MRI.createVirtualRegister(
8365 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8366 MachineInstr *NewInst;
8367 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8368 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8369 .addImm(0)
8370 .add(Src1) // False
8371 .addImm(0)
8372 .add(Src0) // True
8373 .addReg(NewCondReg);
8374 } else {
8375 NewInst =
8376 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8377 .add(Src1) // False
8378 .add(Src0) // True
8379 .addReg(NewCondReg);
8380 }
8381 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8382 legalizeOperands(*NewInst, MDT);
8383 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8384}
8385
8386void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8387 MachineInstr &Inst) const {
8388 MachineBasicBlock &MBB = *Inst.getParent();
8389 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8390 MachineBasicBlock::iterator MII = Inst;
8391 DebugLoc DL = Inst.getDebugLoc();
8392
8393 MachineOperand &Dest = Inst.getOperand(0);
8394 MachineOperand &Src = Inst.getOperand(1);
8395 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8396 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8397
8398 unsigned SubOp = ST.hasAddNoCarry() ?
8399 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8400
8401 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8402 .addImm(0)
8403 .addReg(Src.getReg());
8404
8405 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8406 .addReg(Src.getReg())
8407 .addReg(TmpReg);
8408
8409 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8410 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8411}
8412
8413void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8414 MachineInstr &Inst) const {
8415 MachineBasicBlock &MBB = *Inst.getParent();
8416 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8417 MachineBasicBlock::iterator MII = Inst;
8418 const DebugLoc &DL = Inst.getDebugLoc();
8419
8420 MachineOperand &Dest = Inst.getOperand(0);
8421 MachineOperand &Src0 = Inst.getOperand(1);
8422 MachineOperand &Src1 = Inst.getOperand(2);
8423
8424 if (ST.hasDLInsts()) {
8425 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8426 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8427 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8428
8429 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8430 .add(Src0)
8431 .add(Src1);
8432
8433 MRI.replaceRegWith(Dest.getReg(), NewDest);
8434 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8435 } else {
8436 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8437 // invert either source and then perform the XOR. If either source is a
8438 // scalar register, then we can leave the inversion on the scalar unit to
8439 // achieve a better distribution of scalar and vector instructions.
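 // A quick check of that identity (illustrative, not from the original
 // source): for (x, y) in {0,1}^2, !(x ^ y) gives 1, 0, 0, 1 and
 // (!x ^ y) gives (1^0, 1^1, 0^0, 0^1) = 1, 0, 0, 1, so inverting either
 // operand before the XOR preserves the XNOR result.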
8440 bool Src0IsSGPR = Src0.isReg() &&
8441 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8442 bool Src1IsSGPR = Src1.isReg() &&
8443 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8444 MachineInstr *Xor;
8445 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8446 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8447
8448 // Build a pair of scalar instructions and add them to the work list.
8449 // The next iteration over the work list will lower these to the vector
8450 // unit as necessary.
8451 if (Src0IsSGPR) {
8452 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8453 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8454 .addReg(Temp)
8455 .add(Src1);
8456 } else if (Src1IsSGPR) {
8457 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8458 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8459 .add(Src0)
8460 .addReg(Temp);
8461 } else {
8462 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8463 .add(Src0)
8464 .add(Src1);
8465 MachineInstr *Not =
8466 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8467 Worklist.insert(Not);
8468 }
8469
8470 MRI.replaceRegWith(Dest.getReg(), NewDest);
8471
8472 Worklist.insert(Xor);
8473
8474 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8475 }
8476}
8477
8478void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8479 MachineInstr &Inst,
8480 unsigned Opcode) const {
8481 MachineBasicBlock &MBB = *Inst.getParent();
8482 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8483 MachineBasicBlock::iterator MII = Inst;
8484 const DebugLoc &DL = Inst.getDebugLoc();
8485
8486 MachineOperand &Dest = Inst.getOperand(0);
8487 MachineOperand &Src0 = Inst.getOperand(1);
8488 MachineOperand &Src1 = Inst.getOperand(2);
8489
8490 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8491 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8492
8493 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8494 .add(Src0)
8495 .add(Src1);
8496
8497 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8498 .addReg(Interm);
8499
8500 Worklist.insert(&Op);
8501 Worklist.insert(&Not);
8502
8503 MRI.replaceRegWith(Dest.getReg(), NewDest);
8504 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8505}
8506
8507void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8508 MachineInstr &Inst,
8509 unsigned Opcode) const {
8510 MachineBasicBlock &MBB = *Inst.getParent();
8511 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8512 MachineBasicBlock::iterator MII = Inst;
8513 const DebugLoc &DL = Inst.getDebugLoc();
8514
8515 MachineOperand &Dest = Inst.getOperand(0);
8516 MachineOperand &Src0 = Inst.getOperand(1);
8517 MachineOperand &Src1 = Inst.getOperand(2);
8518
8519 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8520 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8521
8522 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8523 .add(Src1);
8524
8525 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8526 .add(Src0)
8527 .addReg(Interm);
8528
8529 Worklist.insert(&Not);
8530 Worklist.insert(&Op);
8531
8532 MRI.replaceRegWith(Dest.getReg(), NewDest);
8533 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8534}
8535
8536void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8537 MachineInstr &Inst, unsigned Opcode,
8538 bool Swap) const {
8539 MachineBasicBlock &MBB = *Inst.getParent();
8540 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8541
8542 MachineOperand &Dest = Inst.getOperand(0);
8543 MachineOperand &Src0 = Inst.getOperand(1);
8544 DebugLoc DL = Inst.getDebugLoc();
8545
8546 MachineBasicBlock::iterator MII = Inst;
8547
8548 const MCInstrDesc &InstDesc = get(Opcode);
8549 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8550 MRI.getRegClass(Src0.getReg()) :
8551 &AMDGPU::SGPR_32RegClass;
8552
8553 const TargetRegisterClass *Src0SubRC =
8554 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8555
8556 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8557 AMDGPU::sub0, Src0SubRC);
8558
8559 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8560 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8561 const TargetRegisterClass *NewDestSubRC =
8562 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8563
8564 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8565 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8566
8567 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8568 AMDGPU::sub1, Src0SubRC);
8569
8570 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8571 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8572
8573 if (Swap)
8574 std::swap(DestSub0, DestSub1);
8575
8576 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8577 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8578 .addReg(DestSub0)
8579 .addImm(AMDGPU::sub0)
8580 .addReg(DestSub1)
8581 .addImm(AMDGPU::sub1);
8582
8583 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8584
8585 Worklist.insert(&LoHalf);
8586 Worklist.insert(&HiHalf);
8587
8588 // We don't need to legalizeOperands here because for a single operand, src0
8589 // will support any kind of input.
8590
8591 // Move all users of this moved value.
8592 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8593}
8594
8595 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8596 // split the s_mul_u64 into 32-bit vector multiplications.
8597void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8598 MachineInstr &Inst,
8599 MachineDominatorTree *MDT) const {
8600 MachineBasicBlock &MBB = *Inst.getParent();
8601 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8602
8603 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8604 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8605 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8606
8607 MachineOperand &Dest = Inst.getOperand(0);
8608 MachineOperand &Src0 = Inst.getOperand(1);
8609 MachineOperand &Src1 = Inst.getOperand(2);
8610 const DebugLoc &DL = Inst.getDebugLoc();
8611 MachineBasicBlock::iterator MII = Inst;
8612
8613 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8614 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8615 const TargetRegisterClass *Src0SubRC =
8616 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8617 if (RI.isSGPRClass(Src0SubRC))
8618 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8619 const TargetRegisterClass *Src1SubRC =
8620 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8621 if (RI.isSGPRClass(Src1SubRC))
8622 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8623
8624 // First, we extract the low 32-bit and high 32-bit values from each of the
8625 // operands.
8626 MachineOperand Op0L =
8627 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8628 MachineOperand Op1L =
8629 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8630 MachineOperand Op0H =
8631 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8632 MachineOperand Op1H =
8633 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8634
8635 // The multiplication is done as follows:
8636 //
8637 // Op1H Op1L
8638 // * Op0H Op0L
8639 // --------------------
8640 // Op1H*Op0L Op1L*Op0L
8641 // + Op1H*Op0H Op1L*Op0H
8642 // -----------------------------------------
8643 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8644 //
8645 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8646 // value and that would overflow.
8647 // The low 32-bit value is Op1L*Op0L.
8648 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
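 // A concrete illustration (added example): with Op0 = 0x0000000200000003 and
 // Op1 = 0x0000000400000005, i.e. Op0L = 3, Op0H = 2, Op1L = 5, Op1H = 4,
 // the low half is Op1L*Op0L = 15 and the high half is
 // Op1H*Op0L + Op1L*Op0H + mulhi(Op1L, Op0L) = 12 + 10 + 0 = 22,
 // giving 0x000000160000000F, the truncated 64-bit product.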
8649
8650 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8651 MachineInstr *Op1L_Op0H =
8652 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8653 .add(Op1L)
8654 .add(Op0H);
8655
8656 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8657 MachineInstr *Op1H_Op0L =
8658 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8659 .add(Op1H)
8660 .add(Op0L);
8661
8662 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8663 MachineInstr *Carry =
8664 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8665 .add(Op1L)
8666 .add(Op0L);
8667
8668 MachineInstr *LoHalf =
8669 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8670 .add(Op1L)
8671 .add(Op0L);
8672
8673 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8674 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8675 .addReg(Op1L_Op0H_Reg)
8676 .addReg(Op1H_Op0L_Reg);
8677
8678 MachineInstr *HiHalf =
8679 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8680 .addReg(AddReg)
8681 .addReg(CarryReg);
8682
8683 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8684 .addReg(DestSub0)
8685 .addImm(AMDGPU::sub0)
8686 .addReg(DestSub1)
8687 .addImm(AMDGPU::sub1);
8688
8689 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8690
8691 // Try to legalize the operands in case we need to swap the order to keep it
8692 // valid.
8693 legalizeOperands(*Op1L_Op0H, MDT);
8694 legalizeOperands(*Op1H_Op0L, MDT);
8695 legalizeOperands(*Carry, MDT);
8696 legalizeOperands(*LoHalf, MDT);
8697 legalizeOperands(*Add, MDT);
8698 legalizeOperands(*HiHalf, MDT);
8699
8700 // Move all users of this moved value.
8701 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8702}
8703
8704 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8705 // multiplications.
8706void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8707 MachineInstr &Inst,
8708 MachineDominatorTree *MDT) const {
8709 MachineBasicBlock &MBB = *Inst.getParent();
8710 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8711
8712 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8713 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8714 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8715
8716 MachineOperand &Dest = Inst.getOperand(0);
8717 MachineOperand &Src0 = Inst.getOperand(1);
8718 MachineOperand &Src1 = Inst.getOperand(2);
8719 const DebugLoc &DL = Inst.getDebugLoc();
8720 MachineBasicBlock::iterator MII = Inst;
8721
8722 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8723 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8724 const TargetRegisterClass *Src0SubRC =
8725 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8726 if (RI.isSGPRClass(Src0SubRC))
8727 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8728 const TargetRegisterClass *Src1SubRC =
8729 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8730 if (RI.isSGPRClass(Src1SubRC))
8731 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8732
8733 // First, we extract the low 32-bit and high 32-bit values from each of the
8734 // operands.
8735 MachineOperand Op0L =
8736 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8737 MachineOperand Op1L =
8738 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8739
8740 unsigned Opc = Inst.getOpcode();
8741 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8742 ? AMDGPU::V_MUL_HI_U32_e64
8743 : AMDGPU::V_MUL_HI_I32_e64;
8744 MachineInstr *HiHalf =
8745 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8746
8747 MachineInstr *LoHalf =
8748 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8749 .add(Op1L)
8750 .add(Op0L);
8751
8752 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8753 .addReg(DestSub0)
8754 .addImm(AMDGPU::sub0)
8755 .addReg(DestSub1)
8756 .addImm(AMDGPU::sub1);
8757
8758 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8759
8760 // Try to legalize the operands in case we need to swap the order to keep it
8761 // valid.
8762 legalizeOperands(*HiHalf, MDT);
8763 legalizeOperands(*LoHalf, MDT);
8764
8765 // Move all users of this moved value.
8766 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8767}
8768
8769void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8770 MachineInstr &Inst, unsigned Opcode,
8771 MachineDominatorTree *MDT) const {
8772 MachineBasicBlock &MBB = *Inst.getParent();
8773 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8774
8775 MachineOperand &Dest = Inst.getOperand(0);
8776 MachineOperand &Src0 = Inst.getOperand(1);
8777 MachineOperand &Src1 = Inst.getOperand(2);
8778 DebugLoc DL = Inst.getDebugLoc();
8779
8780 MachineBasicBlock::iterator MII = Inst;
8781
8782 const MCInstrDesc &InstDesc = get(Opcode);
8783 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8784 MRI.getRegClass(Src0.getReg()) :
8785 &AMDGPU::SGPR_32RegClass;
8786
8787 const TargetRegisterClass *Src0SubRC =
8788 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8789 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8790 MRI.getRegClass(Src1.getReg()) :
8791 &AMDGPU::SGPR_32RegClass;
8792
8793 const TargetRegisterClass *Src1SubRC =
8794 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8795
8796 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8797 AMDGPU::sub0, Src0SubRC);
8798 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8799 AMDGPU::sub0, Src1SubRC);
8800 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8801 AMDGPU::sub1, Src0SubRC);
8802 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8803 AMDGPU::sub1, Src1SubRC);
8804
8805 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8806 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8807 const TargetRegisterClass *NewDestSubRC =
8808 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8809
8810 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8811 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8812 .add(SrcReg0Sub0)
8813 .add(SrcReg1Sub0);
8814
8815 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8816 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8817 .add(SrcReg0Sub1)
8818 .add(SrcReg1Sub1);
8819
8820 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8821 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8822 .addReg(DestSub0)
8823 .addImm(AMDGPU::sub0)
8824 .addReg(DestSub1)
8825 .addImm(AMDGPU::sub1);
8826
8827 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8828
8829 Worklist.insert(&LoHalf);
8830 Worklist.insert(&HiHalf);
8831
8832 // Move all users of this moved value.
8833 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8834}
8835
8836void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8837 MachineInstr &Inst,
8838 MachineDominatorTree *MDT) const {
8839 MachineBasicBlock &MBB = *Inst.getParent();
8840 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8841
8842 MachineOperand &Dest = Inst.getOperand(0);
8843 MachineOperand &Src0 = Inst.getOperand(1);
8844 MachineOperand &Src1 = Inst.getOperand(2);
8845 const DebugLoc &DL = Inst.getDebugLoc();
8846
8847 MachineBasicBlock::iterator MII = Inst;
8848
8849 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8850
8851 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8852
8853 MachineOperand* Op0;
8854 MachineOperand* Op1;
8855
8856 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8857 Op0 = &Src0;
8858 Op1 = &Src1;
8859 } else {
8860 Op0 = &Src1;
8861 Op1 = &Src0;
8862 }
8863
8864 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8865 .add(*Op0);
8866
8867 Register NewDest = MRI.createVirtualRegister(DestRC);
8868
8869 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8870 .addReg(Interm)
8871 .add(*Op1);
8872
8873 MRI.replaceRegWith(Dest.getReg(), NewDest);
8874
8875 Worklist.insert(&Xor);
8876}
8877
8878void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8879 MachineInstr &Inst) const {
8880 MachineBasicBlock &MBB = *Inst.getParent();
8881 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8882
8883 MachineBasicBlock::iterator MII = Inst;
8884 const DebugLoc &DL = Inst.getDebugLoc();
8885
8886 MachineOperand &Dest = Inst.getOperand(0);
8887 MachineOperand &Src = Inst.getOperand(1);
8888
8889 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8890 const TargetRegisterClass *SrcRC = Src.isReg() ?
8891 MRI.getRegClass(Src.getReg()) :
8892 &AMDGPU::SGPR_32RegClass;
8893
8894 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8895 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8896
8897 const TargetRegisterClass *SrcSubRC =
8898 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8899
8900 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8901 AMDGPU::sub0, SrcSubRC);
8902 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8903 AMDGPU::sub1, SrcSubRC);
8904
8905 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8906
8907 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8908
8909 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8910
8911 // We don't need to legalize operands here. src0 for either instruction can be
8912 // an SGPR, and the second input is unused or determined here.
8913 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8914}
8915
8916void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8917 MachineInstr &Inst) const {
8918 MachineBasicBlock &MBB = *Inst.getParent();
8919 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8920 MachineBasicBlock::iterator MII = Inst;
8921 const DebugLoc &DL = Inst.getDebugLoc();
8922
8923 MachineOperand &Dest = Inst.getOperand(0);
8924 uint32_t Imm = Inst.getOperand(2).getImm();
8925 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8926 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8927
8928 (void) Offset;
8929
8930 // Only sext_inreg cases handled.
8931 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8932 Offset == 0 && "Not implemented");
8933
8934 if (BitWidth < 32) {
8935 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8936 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8937 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8938
8939 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8940 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8941 .addImm(0)
8942 .addImm(BitWidth);
8943
8944 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8945 .addImm(31)
8946 .addReg(MidRegLo);
8947
8948 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8949 .addReg(MidRegLo)
8950 .addImm(AMDGPU::sub0)
8951 .addReg(MidRegHi)
8952 .addImm(AMDGPU::sub1);
8953
8954 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8955 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8956 return;
8957 }
8958
8959 MachineOperand &Src = Inst.getOperand(1);
8960 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8961 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8962
8963 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8964 .addImm(31)
8965 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8966
8967 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8968 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8969 .addImm(AMDGPU::sub0)
8970 .addReg(TmpReg)
8971 .addImm(AMDGPU::sub1);
8972
8973 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8974 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8975}
8976
8977void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8978 MachineInstr &Inst, unsigned Opcode,
8979 MachineDominatorTree *MDT) const {
8980 // (S_FLBIT_I32_B64 hi:lo) ->
8981 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8982 // (S_FF1_I32_B64 hi:lo) ->
8983 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
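 // Worked example for the ctlz form (added for illustration): with
 // hi:lo = 0x00000000:0x00010000, V_FFBH_U32(hi) = 0xffffffff (no bit set),
 // V_FFBH_U32(lo) = 15, uaddsat(15, 32) = 47, and umin(0xffffffff, 47) = 47,
 // which is the number of leading zeros in the 64-bit value.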
8984
8985 MachineBasicBlock &MBB = *Inst.getParent();
8986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8987 MachineBasicBlock::iterator MII = Inst;
8988 const DebugLoc &DL = Inst.getDebugLoc();
8989
8990 MachineOperand &Dest = Inst.getOperand(0);
8991 MachineOperand &Src = Inst.getOperand(1);
8992
8993 const MCInstrDesc &InstDesc = get(Opcode);
8994
8995 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8996 unsigned OpcodeAdd =
8997 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8998
8999 const TargetRegisterClass *SrcRC =
9000 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9001 const TargetRegisterClass *SrcSubRC =
9002 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9003
9004 MachineOperand SrcRegSub0 =
9005 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9006 MachineOperand SrcRegSub1 =
9007 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9008
9009 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9010 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9011 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9012 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9013
9014 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9015
9016 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9017
9018 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9019 .addReg(IsCtlz ? MidReg1 : MidReg2)
9020 .addImm(32)
9021 .addImm(1); // enable clamp
9022
9023 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9024 .addReg(MidReg3)
9025 .addReg(IsCtlz ? MidReg2 : MidReg1);
9026
9027 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9028
9029 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9030}
9031
9032void SIInstrInfo::addUsersToMoveToVALUWorklist(
9033 Register DstReg, MachineRegisterInfo &MRI,
9034 SIInstrWorklist &Worklist) const {
9035 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9036 MachineInstr &UseMI = *MO.getParent();
9037
9038 unsigned OpNo = 0;
9039
9040 switch (UseMI.getOpcode()) {
9041 case AMDGPU::COPY:
9042 case AMDGPU::WQM:
9043 case AMDGPU::SOFT_WQM:
9044 case AMDGPU::STRICT_WWM:
9045 case AMDGPU::STRICT_WQM:
9046 case AMDGPU::REG_SEQUENCE:
9047 case AMDGPU::PHI:
9048 case AMDGPU::INSERT_SUBREG:
9049 break;
9050 default:
9051 OpNo = MO.getOperandNo();
9052 break;
9053 }
9054
9055 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9056 MRI.constrainRegClass(DstReg, OpRC);
9057
9058 if (!RI.hasVectorRegisters(OpRC))
9059 Worklist.insert(&UseMI);
9060 else
9061 // Legalization could change user list.
9062 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9063 }
9064}
9065
9066void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9067 MachineRegisterInfo &MRI,
9068 MachineInstr &Inst) const {
9069 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9070 MachineBasicBlock *MBB = Inst.getParent();
9071 MachineOperand &Src0 = Inst.getOperand(1);
9072 MachineOperand &Src1 = Inst.getOperand(2);
9073 const DebugLoc &DL = Inst.getDebugLoc();
9074
9075 switch (Inst.getOpcode()) {
9076 case AMDGPU::S_PACK_LL_B32_B16: {
9077 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9078 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9079
9080 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9081 // 0.
9082 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9083 .addImm(0xffff);
9084
9085 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9086 .addReg(ImmReg, RegState::Kill)
9087 .add(Src0);
9088
9089 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9090 .add(Src1)
9091 .addImm(16)
9092 .addReg(TmpReg, RegState::Kill);
9093 break;
9094 }
9095 case AMDGPU::S_PACK_LH_B32_B16: {
9096 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9097 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9098 .addImm(0xffff);
9099 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9100 .addReg(ImmReg, RegState::Kill)
9101 .add(Src0)
9102 .add(Src1);
9103 break;
9104 }
9105 case AMDGPU::S_PACK_HL_B32_B16: {
9106 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9107 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9108 .addImm(16)
9109 .add(Src0);
9110 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9111 .add(Src1)
9112 .addImm(16)
9113 .addReg(TmpReg, RegState::Kill);
9114 break;
9115 }
9116 case AMDGPU::S_PACK_HH_B32_B16: {
9117 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9118 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9119 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9120 .addImm(16)
9121 .add(Src0);
9122 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9123 .addImm(0xffff0000);
9124 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9125 .add(Src1)
9126 .addReg(ImmReg, RegState::Kill)
9127 .addReg(TmpReg, RegState::Kill);
9128 break;
9129 }
9130 default:
9131 llvm_unreachable("unhandled s_pack_* instruction");
9132 }
9133
9134 MachineOperand &Dest = Inst.getOperand(0);
9135 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9136 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9137}
9138
9139void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9140 MachineInstr &SCCDefInst,
9141 SIInstrWorklist &Worklist,
9142 Register NewCond) const {
9143
9144 // Ensure that def inst defines SCC, which is still live.
9145 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9146 !Op.isDead() && Op.getParent() == &SCCDefInst);
9147 SmallVector<MachineInstr *, 4> CopyToDelete;
9148 // This assumes that all the users of SCC are in the same block
9149 // as the SCC def.
9150 for (MachineInstr &MI : // Skip the def inst itself.
9151 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9152 SCCDefInst.getParent()->end())) {
9153 // Check if SCC is used first.
9154 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9155 if (SCCIdx != -1) {
9156 if (MI.isCopy()) {
9157 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9158 Register DestReg = MI.getOperand(0).getReg();
9159
9160 MRI.replaceRegWith(DestReg, NewCond);
9161 CopyToDelete.push_back(&MI);
9162 } else {
9163
9164 if (NewCond.isValid())
9165 MI.getOperand(SCCIdx).setReg(NewCond);
9166
9167 Worklist.insert(&MI);
9168 }
9169 }
9170 // Exit if we find another SCC def.
9171 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9172 break;
9173 }
9174 for (auto &Copy : CopyToDelete)
9175 Copy->eraseFromParent();
9176}
9177
9178// Instructions that use SCC may be converted to VALU instructions. When that
9179// happens, the SCC register is changed to VCC_LO. The instruction that defines
9180// SCC must be changed to an instruction that defines VCC. This function makes
9181// sure that the instruction that defines SCC is added to the moveToVALU
9182// worklist.
9183void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9184 SIInstrWorklist &Worklist) const {
9185 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9186 // then there is nothing to do because the defining instruction has been
9187 // converted to a VALU already. If SCC then that instruction needs to be
9188 // converted to a VALU.
9189 for (MachineInstr &MI :
9190 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9191 SCCUseInst->getParent()->rend())) {
9192 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9193 break;
9194 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9195 Worklist.insert(&MI);
9196 break;
9197 }
9198 }
9199}
9200
9201const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9202 const MachineInstr &Inst) const {
9203 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9204
9205 switch (Inst.getOpcode()) {
9206 // For target instructions, getOpRegClass just returns the virtual register
9207 // class associated with the operand, so we need to find an equivalent VGPR
9208 // register class in order to move the instruction to the VALU.
9209 case AMDGPU::COPY:
9210 case AMDGPU::PHI:
9211 case AMDGPU::REG_SEQUENCE:
9212 case AMDGPU::INSERT_SUBREG:
9213 case AMDGPU::WQM:
9214 case AMDGPU::SOFT_WQM:
9215 case AMDGPU::STRICT_WWM:
9216 case AMDGPU::STRICT_WQM: {
9217 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9218 if (RI.isAGPRClass(SrcRC)) {
9219 if (RI.isAGPRClass(NewDstRC))
9220 return nullptr;
9221
9222 switch (Inst.getOpcode()) {
9223 case AMDGPU::PHI:
9224 case AMDGPU::REG_SEQUENCE:
9225 case AMDGPU::INSERT_SUBREG:
9226 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9227 break;
9228 default:
9229 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9230 }
9231
9232 if (!NewDstRC)
9233 return nullptr;
9234 } else {
9235 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9236 return nullptr;
9237
9238 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9239 if (!NewDstRC)
9240 return nullptr;
9241 }
9242
9243 return NewDstRC;
9244 }
9245 default:
9246 return NewDstRC;
9247 }
9248}
9249
9250// Find the one SGPR operand we are allowed to use.
9251Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9252 int OpIndices[3]) const {
9253 const MCInstrDesc &Desc = MI.getDesc();
9254
9255 // Find the one SGPR operand we are allowed to use.
9256 //
9257 // First we need to consider the instruction's operand requirements before
9258 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9259 // of VCC, but we are still bound by the constant bus requirement to only use
9260 // one.
9261 //
9262 // If the operand's class is an SGPR, we can never move it.
9263
9264 Register SGPRReg = findImplicitSGPRRead(MI);
9265 if (SGPRReg)
9266 return SGPRReg;
9267
9268 Register UsedSGPRs[3] = {Register()};
9269 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9270
9271 for (unsigned i = 0; i < 3; ++i) {
9272 int Idx = OpIndices[i];
9273 if (Idx == -1)
9274 break;
9275
9276 const MachineOperand &MO = MI.getOperand(Idx);
9277 if (!MO.isReg())
9278 continue;
9279
9280 // Is this operand statically required to be an SGPR based on the operand
9281 // constraints?
9282 const TargetRegisterClass *OpRC =
9283 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9284 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9285 if (IsRequiredSGPR)
9286 return MO.getReg();
9287
9288 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9289 Register Reg = MO.getReg();
9290 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9291 if (RI.isSGPRClass(RegRC))
9292 UsedSGPRs[i] = Reg;
9293 }
9294
9295 // We don't have a required SGPR operand, so we have a bit more freedom in
9296 // selecting operands to move.
9297
9298 // Try to select the most used SGPR. If an SGPR is equal to one of the
9299 // others, we choose that.
9300 //
9301 // e.g.
9302 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9303 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9304
9305 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9306 // prefer those.
9307
9308 if (UsedSGPRs[0]) {
9309 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9310 SGPRReg = UsedSGPRs[0];
9311 }
9312
9313 if (!SGPRReg && UsedSGPRs[1]) {
9314 if (UsedSGPRs[1] == UsedSGPRs[2])
9315 SGPRReg = UsedSGPRs[1];
9316 }
9317
9318 return SGPRReg;
9319}
9320
9321MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9322 AMDGPU::OpName OperandName) const {
9323 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9324 return nullptr;
9325
9326 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9327 if (Idx == -1)
9328 return nullptr;
9329
9330 return &MI.getOperand(Idx);
9331}
9332
9333uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9334 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9335 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9336 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9337 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9338 return (Format << 44) |
9339 (1ULL << 56) | // RESOURCE_LEVEL = 1
9340 (3ULL << 60); // OOB_SELECT = 3
9341 }
9342
9343 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9344 if (ST.isAmdHsaOS()) {
9345 // Set ATC = 1. GFX9 doesn't have this bit.
9346 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9347 RsrcDataFormat |= (1ULL << 56);
9348
9349 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9350 // BTW, it disables TC L2 and therefore decreases performance.
9351 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9352 RsrcDataFormat |= (2ULL << 59);
9353 }
9354
9355 return RsrcDataFormat;
9356}
9357
9358uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9359 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9360 AMDGPU::RSRC_TID_ENABLE |
9361 0xffffffff; // Size;
9362
9363 // GFX9 doesn't have ELEMENT_SIZE.
9364 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9365 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9366 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9367 }
9368
9369 // IndexStride = 64 / 32.
9370 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9371 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9372
9373 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9374 // Clear them unless we want a huge stride.
9375 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9376 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9377 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9378
9379 return Rsrc23;
9380}
9381
9382bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9383 unsigned Opc = MI.getOpcode();
9384
9385 return isSMRD(Opc);
9386}
9387
9388bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9389 return get(Opc).mayLoad() &&
9390 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9391}
9392
9393Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9394 int &FrameIndex) const {
9395 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9396 if (!Addr || !Addr->isFI())
9397 return Register();
9398
9399 assert(!MI.memoperands_empty() &&
9400 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9401
9402 FrameIndex = Addr->getIndex();
9403 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9404}
9405
9406Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9407 int &FrameIndex) const {
9408 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9409 assert(Addr && Addr->isFI());
9410 FrameIndex = Addr->getIndex();
9411 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9412}
9413
9414Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9415 int &FrameIndex) const {
9416 if (!MI.mayLoad())
9417 return Register();
9418
9419 if (isMUBUF(MI) || isVGPRSpill(MI))
9420 return isStackAccess(MI, FrameIndex);
9421
9422 if (isSGPRSpill(MI))
9423 return isSGPRStackAccess(MI, FrameIndex);
9424
9425 return Register();
9426}
9427
9428Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9429 int &FrameIndex) const {
9430 if (!MI.mayStore())
9431 return Register();
9432
9433 if (isMUBUF(MI) || isVGPRSpill(MI))
9434 return isStackAccess(MI, FrameIndex);
9435
9436 if (isSGPRSpill(MI))
9437 return isSGPRStackAccess(MI, FrameIndex);
9438
9439 return Register();
9440}
9441
9442unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9443 unsigned Size = 0;
9444 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9445 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9446 while (++I != E && I->isInsideBundle()) {
9447 assert(!I->isBundle() && "No nested bundle!");
9448 Size += getInstSizeInBytes(*I);
9449 }
9450
9451 return Size;
9452}
9453
9454unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9455 unsigned Opc = MI.getOpcode();
9456 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9457 unsigned DescSize = Desc.getSize();
9458
9459 // If we have a definitive size, we can use it. Otherwise we need to inspect
9460 // the operands to know the size.
9461 if (isFixedSize(MI)) {
9462 unsigned Size = DescSize;
9463
9464 // If we hit the buggy offset, an extra nop will be inserted in MC so
9465 // estimate the worst case.
9466 if (MI.isBranch() && ST.hasOffset3fBug())
9467 Size += 4;
9468
9469 return Size;
9470 }
9471
9472 // Instructions may have a 32-bit literal encoded after them. Check
9473 // operands that could ever be literals.
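 // For example, a VOP2 add with a non-inline constant such as
 // v_add_f32 v0, 0x40490fdb, v1 is a 4-byte encoding followed by a 4-byte
 // literal, so it is reported as 8 bytes rather than DescSize.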
9474 if (isVALU(MI) || isSALU(MI)) {
9475 if (isDPP(MI))
9476 return DescSize;
9477 bool HasLiteral = false;
9478 unsigned LiteralSize = 4;
9479 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9480 const MachineOperand &Op = MI.getOperand(I);
9481 const MCOperandInfo &OpInfo = Desc.operands()[I];
9482 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9483 HasLiteral = true;
9484 if (ST.has64BitLiterals()) {
9485 switch (OpInfo.OperandType) {
9486 default:
9487 break;
9488 case AMDGPU::OPERAND_REG_IMM_FP64:
9489 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9490 LiteralSize = 8;
9491 break;
9492 case AMDGPU::OPERAND_REG_IMM_INT64:
9493 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9494 LiteralSize = 8;
9495 break;
9496 }
9497 }
9498 break;
9499 }
9500 }
9501 return HasLiteral ? DescSize + LiteralSize : DescSize;
9502 }
9503
9504 // Check whether we have extra NSA words.
9505 if (isMIMG(MI)) {
9506 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9507 if (VAddr0Idx < 0)
9508 return 8;
9509
9510 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9511 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9512 }
9513
9514 switch (Opc) {
9515 case TargetOpcode::BUNDLE:
9516 return getInstBundleSize(MI);
9517 case TargetOpcode::INLINEASM:
9518 case TargetOpcode::INLINEASM_BR: {
9519 const MachineFunction *MF = MI.getParent()->getParent();
9520 const char *AsmStr = MI.getOperand(0).getSymbolName();
9521 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9522 }
9523 default:
9524 if (MI.isMetaInstruction())
9525 return 0;
9526
9527 // If D16 Pseudo inst, get correct MC code size
9528 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9529 if (D16Info) {
9530 // Assume d16_lo/hi insts are always the same size
9531 unsigned LoInstOpcode = D16Info->LoOp;
9532 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9533 DescSize = Desc.getSize();
9534 }
9535
9536 // If FMA Pseudo inst, get correct MC code size
9537 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9538 // All potential lowerings are the same size; arbitrarily pick one.
9539 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9540 DescSize = Desc.getSize();
9541 }
9542
9543 return DescSize;
9544 }
9545}
9546
9547bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9548 if (!isFLAT(MI))
9549 return false;
9550
9551 if (MI.memoperands_empty())
9552 return true;
9553
9554 for (const MachineMemOperand *MMO : MI.memoperands()) {
9555 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9556 return true;
9557 }
9558 return false;
9559}
9560
9561ArrayRef<std::pair<int, const char *>>
9562SIInstrInfo::getSerializableTargetIndices() const {
9563 static const std::pair<int, const char *> TargetIndices[] = {
9564 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9565 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9566 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9567 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9568 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9569 return ArrayRef(TargetIndices);
9570}
9571
9572/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9573/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9574ScheduleHazardRecognizer *
9575SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9576 const ScheduleDAG *DAG) const {
9577 return new GCNHazardRecognizer(DAG->MF);
9578}
9579
9580/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9581/// pass.
9582ScheduleHazardRecognizer *SIInstrInfo::CreateTargetPostRAHazardRecognizer(
9583 const MachineFunction &MF) const {
9584 return new GCNHazardRecognizer(MF);
9585}
9586
9587// Called during:
9588// - pre-RA scheduling and post-RA scheduling
9589ScheduleHazardRecognizer *
9590SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9591 const ScheduleDAGMI *DAG) const {
9592 // Borrowed from Arm Target
9593 // We would like to restrict this hazard recognizer to only
9594 // post-RA scheduling; we can tell that we're post-RA because we don't
9595 // track VRegLiveness.
9596 if (!DAG->hasVRegLiveness())
9597 return new GCNHazardRecognizer(DAG->MF);
9598 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9599}
9600
9601std::pair<unsigned, unsigned>
9602SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9603 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9604}
9605
9606ArrayRef<std::pair<unsigned, const char *>>
9607SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9608 static const std::pair<unsigned, const char *> TargetFlags[] = {
9609 {MO_GOTPCREL, "amdgpu-gotprel"},
9610 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9611 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9612 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9613 {MO_REL32_LO, "amdgpu-rel32-lo"},
9614 {MO_REL32_HI, "amdgpu-rel32-hi"},
9615 {MO_REL64, "amdgpu-rel64"},
9616 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9617 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9618 {MO_ABS64, "amdgpu-abs64"},
9619 };
9620
9621 return ArrayRef(TargetFlags);
9622}
9623
9624ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9625SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9626 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9627 {
9628 {MONoClobber, "amdgpu-noclobber"},
9629 {MOLastUse, "amdgpu-last-use"},
9630 {MOCooperative, "amdgpu-cooperative"},
9631 };
9632
9633 return ArrayRef(TargetFlags);
9634}
9635
9636unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9637 const MachineFunction &MF) const {
9638 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9639 assert(SrcReg.isVirtual());
9640 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9641 return AMDGPU::WWM_COPY;
9642
9643 return AMDGPU::COPY;
9644}
9645
9646bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9647 Register Reg) const {
9648 // We need to handle instructions which may be inserted during register
9649 // allocation to handle the prolog. The initial prolog instruction may have
9650 // been separated from the start of the block by spills and copies inserted
9651 // needed by the prolog. However, the insertions for scalar registers can
9652 // always be placed at the BB top as they are independent of the exec mask
9653 // value.
9654 const MachineFunction *MF = MI.getParent()->getParent();
9655 bool IsNullOrVectorRegister = true;
9656 if (Reg) {
9657 const MachineRegisterInfo &MRI = MF->getRegInfo();
9658 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9659 }
9660
9661 uint16_t Opcode = MI.getOpcode();
9662 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9663 return IsNullOrVectorRegister &&
9664 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9665 (Opcode == AMDGPU::IMPLICIT_DEF &&
9666 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9667 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9668 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9669}
9670
9671MachineInstrBuilder
9672SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9673 MachineBasicBlock::iterator I,
9674 const DebugLoc &DL,
9675 Register DestReg) const {
9676 if (ST.hasAddNoCarry())
9677 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9678
9679 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9680 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9681 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9682
9683 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9684 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9685}
9686
9687MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9688 MachineBasicBlock::iterator I,
9689 const DebugLoc &DL,
9690 Register DestReg,
9691 RegScavenger &RS) const {
9692 if (ST.hasAddNoCarry())
9693 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9694
9695 // If available, prefer to use vcc.
9696 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9697 ? Register(RI.getVCC())
9698 : RS.scavengeRegisterBackwards(
9699 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9700 0, /* AllowSpill */ false);
9701
9702 // TODO: Users need to deal with this.
9703 if (!UnusedCarry.isValid())
9704 return MachineInstrBuilder();
9705
9706 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9707 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9708}
9709
9710bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9711 switch (Opcode) {
9712 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9713 case AMDGPU::SI_KILL_I1_TERMINATOR:
9714 return true;
9715 default:
9716 return false;
9717 }
9718}
9719
9720const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9721 switch (Opcode) {
9722 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9723 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9724 case AMDGPU::SI_KILL_I1_PSEUDO:
9725 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9726 default:
9727 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9728 }
9729}
9730
9731bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9732 return Imm <= getMaxMUBUFImmOffset(ST);
9733}
9734
9735unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9736 // GFX12 field is non-negative 24-bit signed byte offset.
9737 const unsigned OffsetBits =
9738 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9739 return (1 << OffsetBits) - 1;
9740}
9741
9742void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9743 if (!ST.isWave32())
9744 return;
9745
9746 if (MI.isInlineAsm())
9747 return;
9748
9749 for (auto &Op : MI.implicit_operands()) {
9750 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9751 Op.setReg(AMDGPU::VCC_LO);
9752 }
9753}
9754
9755bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9756 if (!isSMRD(MI))
9757 return false;
9758
9759 // Check that it is using a buffer resource.
9760 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9761 if (Idx == -1) // e.g. s_memtime
9762 return false;
9763
9764 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9765 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9766}
9767
9768// Given Imm, split it into the values to put into the SOffset and ImmOffset
9769// fields in an MUBUF instruction. Return false if it is not possible (due to a
9770// hardware bug needing a workaround).
9771//
9772// The required alignment ensures that individual address components remain
9773// aligned if they are aligned to begin with. It also ensures that additional
9774// offsets within the given alignment can be added to the resulting ImmOffset.
9775bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9776 uint32_t &ImmOffset, Align Alignment) const {
9777 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9778 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9779 uint32_t Overflow = 0;
9780
9781 if (Imm > MaxImm) {
9782 if (Imm <= MaxImm + 64) {
9783 // Use an SOffset inline constant for 4..64
9784 Overflow = Imm - MaxImm;
9785 Imm = MaxImm;
9786 } else {
9787 // Try to keep the same value in SOffset for adjacent loads, so that
9788 // the corresponding register contents can be re-used.
9789 //
9790 // Load values with all low-bits (except for alignment bits) set into
9791 // SOffset, so that a larger range of values can be covered using
9792 // s_movk_i32.
9793 //
9794 // Atomic operations fail to work correctly when individual address
9795 // components are unaligned, even if their sum is aligned.
9796 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9797 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9798 Imm = Low;
9799 Overflow = High - Alignment.value();
9800 }
9801 }
9802
9803 if (Overflow > 0) {
9804 // There is a hardware bug in SI and CI which prevents address clamping in
9805 // MUBUF instructions from working correctly with SOffsets. The immediate
9806 // offset is unaffected.
9807 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9808 return false;
9809
9810 // It is not possible to set immediate in SOffset field on some targets.
9811 if (ST.hasRestrictedSOffset())
9812 return false;
9813 }
9814
9815 ImmOffset = Imm;
9816 SOffset = Overflow;
9817 return true;
9818}
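// Worked example of the split above (illustrative, assuming the pre-GFX12
// 4095 maximum immediate and Align(4)): Imm = 4100 gives MaxImm = 4092 and
// falls in the SOffset inline-constant range, so ImmOffset = 4092 and
// SOffset = 8. Imm = 8000 takes the other branch: High = (8004 & ~4095) =
// 4096 and Low = (8004 & 4095) = 3908, so ImmOffset = 3908, SOffset = 4092,
// and 3908 + 4092 = 8000 as required.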
9819
9820// Depending on the used address space and instructions, some immediate offsets
9821// are allowed and some are not.
9822// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9823// scratch instruction offsets can also be negative. On GFX12, offsets can be
9824// negative for all variants.
9825//
9826// There are several bugs related to these offsets:
9827// On gfx10.1, flat instructions that go into the global address space cannot
9828// use an offset.
9829//
9830// For scratch instructions, the address can be either an SGPR or a VGPR.
9831// The following offsets can be used, depending on the architecture (x means
9832// cannot be used):
9833// +----------------------------+------+------+
9834// | Address-Mode | SGPR | VGPR |
9835// +----------------------------+------+------+
9836// | gfx9 | | |
9837// | negative, 4-aligned offset | x | ok |
9838// | negative, unaligned offset | x | ok |
9839// +----------------------------+------+------+
9840// | gfx10 | | |
9841// | negative, 4-aligned offset | ok | ok |
9842// | negative, unaligned offset | ok | x |
9843// +----------------------------+------+------+
9844// | gfx10.3 | | |
9845// | negative, 4-aligned offset | ok | ok |
9846// | negative, unaligned offset | ok | ok |
9847// +----------------------------+------+------+
9848//
9849// This function ignores the addressing mode, so if an offset cannot be used in
9850// one addressing mode, it is considered illegal.
9851bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9852 uint64_t FlatVariant) const {
9853 // TODO: Should 0 be special cased?
9854 if (!ST.hasFlatInstOffsets())
9855 return false;
9856
9857 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9858 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9859 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9860 return false;
9861
9862 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9863 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9864 (Offset % 4) != 0) {
9865 return false;
9866 }
9867
9868 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9869 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9870 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9871}
9872
9873// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9874std::pair<int64_t, int64_t>
9875SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9876 uint64_t FlatVariant) const {
9877 int64_t RemainderOffset = COffsetVal;
9878 int64_t ImmField = 0;
9879
9880 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9881 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9882
9883 if (AllowNegative) {
9884 // Use signed division by a power of two to truncate towards 0.
9885 int64_t D = 1LL << NumBits;
9886 RemainderOffset = (COffsetVal / D) * D;
9887 ImmField = COffsetVal - RemainderOffset;
9888
9889 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9890 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9891 (ImmField % 4) != 0) {
9892 // Make ImmField a multiple of 4
9893 RemainderOffset += ImmField % 4;
9894 ImmField -= ImmField % 4;
9895 }
9896 } else if (COffsetVal >= 0) {
9897 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9898 RemainderOffset = COffsetVal - ImmField;
9899 }
9900
9901 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9902 assert(RemainderOffset + ImmField == COffsetVal);
9903 return {ImmField, RemainderOffset};
9904}
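// Illustrative numbers for the split above, assuming a 13-bit signed offset
// field so NumBits = 12 and D = 4096: COffsetVal = 9000 splits into
// RemainderOffset = 8192 and ImmField = 808; COffsetVal = -5000 splits into
// RemainderOffset = -4096 and ImmField = -904. In both cases
// ImmField + RemainderOffset reassembles the original offset.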
9905
9906bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9907 if (ST.hasNegativeScratchOffsetBug() &&
9908 FlatVariant == SIInstrFlags::FlatScratch)
9909 return false;
9910
9911 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9912}
9913
9914static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9915 switch (ST.getGeneration()) {
9916 default:
9917 break;
9918 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9919 case AMDGPUSubtarget::SEA_ISLANDS:
9920 return SIEncodingFamily::SI;
9921 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9922 case AMDGPUSubtarget::GFX9:
9923 return SIEncodingFamily::VI;
9924 case AMDGPUSubtarget::GFX10:
9925 return SIEncodingFamily::GFX10;
9926 case AMDGPUSubtarget::GFX11:
9927 return SIEncodingFamily::GFX11;
9928 case AMDGPUSubtarget::GFX12:
9929 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9930 : SIEncodingFamily::GFX12;
9931 }
9932 llvm_unreachable("Unknown subtarget generation!");
9933}
9934
9935bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9936 switch(MCOp) {
9937 // These opcodes use indirect register addressing so
9938 // they need special handling by codegen (currently missing).
9939 // Therefore it is too risky to allow these opcodes
9940 // to be selected by dpp combiner or sdwa peepholer.
9941 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9942 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9943 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9944 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9945 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9946 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9947 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9948 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9949 return true;
9950 default:
9951 return false;
9952 }
9953}
9954
9955#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9956 case OPCODE##_dpp: \
9957 case OPCODE##_e32: \
9958 case OPCODE##_e64: \
9959 case OPCODE##_e64_dpp: \
9960 case OPCODE##_sdwa:
9961
9962static bool isRenamedInGFX9(int Opcode) {
9963 switch (Opcode) {
9964 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9965 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9966 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9967 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9968 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9969 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9970 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9971 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9972 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9973 //
9974 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9975 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9976 case AMDGPU::V_FMA_F16_gfx9_e64:
9977 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9978 case AMDGPU::V_INTERP_P2_F16:
9979 case AMDGPU::V_MAD_F16_e64:
9980 case AMDGPU::V_MAD_U16_e64:
9981 case AMDGPU::V_MAD_I16_e64:
9982 return true;
9983 default:
9984 return false;
9985 }
9986}
9987
9988int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9989 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9990
9991 unsigned Gen = subtargetEncodingFamily(ST);
9992
9993 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9994 Gen = SIEncodingFamily::GFX9;
9995
9996 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9997 // subtarget has UnpackedD16VMem feature.
9998 // TODO: remove this when we discard GFX80 encoding.
9999 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10000 Gen = SIEncodingFamily::GFX80;
10001
10002 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10003 switch (ST.getGeneration()) {
10004 default:
10005 Gen = SIEncodingFamily::SDWA;
10006 break;
10007 case AMDGPUSubtarget::GFX9:
10008 Gen = SIEncodingFamily::SDWA9;
10009 break;
10010 case AMDGPUSubtarget::GFX10:
10011 Gen = SIEncodingFamily::SDWA10;
10012 break;
10013 }
10014 }
10015
10016 if (isMAI(Opcode)) {
10017 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10018 if (MFMAOp != -1)
10019 Opcode = MFMAOp;
10020 }
10021
10022 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10023
10024 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10025 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10026
10027 // -1 means that Opcode is already a native instruction.
10028 if (MCOp == -1)
10029 return Opcode;
10030
10031 if (ST.hasGFX90AInsts()) {
10032 uint16_t NMCOp = (uint16_t)-1;
10033 if (ST.hasGFX940Insts())
10034 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10035 if (NMCOp == (uint16_t)-1)
10036 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10037 if (NMCOp == (uint16_t)-1)
10038 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10039 if (NMCOp != (uint16_t)-1)
10040 MCOp = NMCOp;
10041 }
10042
10043 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10044 // no encoding in the given subtarget generation.
10045 if (MCOp == (uint16_t)-1)
10046 return -1;
10047
10048 if (isAsmOnlyOpcode(MCOp))
10049 return -1;
10050
10051 return MCOp;
10052}
10053
10054static
10055TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10056 assert(RegOpnd.isReg());
10057 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10058 getRegSubRegPair(RegOpnd);
10059}
10060
10061TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10062 unsigned SubReg) {
10063 assert(MI.isRegSequence());
10064 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10065 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10066 auto &RegOp = MI.getOperand(1 + 2 * I);
10067 return getRegOrUndef(RegOp);
10068 }
10069 return TargetInstrInfo::RegSubRegPair();
10070}
10071
10072// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10073// Following a subreg of reg:subreg isn't supported
10074static bool followSubRegDef(MachineInstr &MI,
10075 TargetInstrInfo::RegSubRegPair &RSR) {
10076 if (!RSR.SubReg)
10077 return false;
10078 switch (MI.getOpcode()) {
10079 default: break;
10080 case AMDGPU::REG_SEQUENCE:
10081 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10082 return true;
10083 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10084 case AMDGPU::INSERT_SUBREG:
10085 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10086 // inserted the subreg we're looking for
10087 RSR = getRegOrUndef(MI.getOperand(2));
10088 else { // the subreg in the rest of the reg
10089 auto R1 = getRegOrUndef(MI.getOperand(1));
10090 if (R1.SubReg) // subreg of subreg isn't supported
10091 return false;
10092 RSR.Reg = R1.Reg;
10093 }
10094 return true;
10095 }
10096 return false;
10097}
10098
10099MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10100 MachineRegisterInfo &MRI) {
10101 assert(MRI.isSSA());
10102 if (!P.Reg.isVirtual())
10103 return nullptr;
10104
10105 auto RSR = P;
10106 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10107 while (auto *MI = DefInst) {
10108 DefInst = nullptr;
10109 switch (MI->getOpcode()) {
10110 case AMDGPU::COPY:
10111 case AMDGPU::V_MOV_B32_e32: {
10112 auto &Op1 = MI->getOperand(1);
10113 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10114 if (Op1.isUndef())
10115 return nullptr;
10116 RSR = getRegSubRegPair(Op1);
10117 DefInst = MRI.getVRegDef(RSR.Reg);
10118 }
10119 break;
10120 }
10121 default:
10122 if (followSubRegDef(*MI, RSR)) {
10123 if (!RSR.Reg)
10124 return nullptr;
10125 DefInst = MRI.getVRegDef(RSR.Reg);
10126 }
10127 }
10128 if (!DefInst)
10129 return MI;
10130 }
10131 return nullptr;
10132}
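// Editorial sketch, not part of SIInstrInfo.cpp: resolving the instruction
// that defines a vreg:subreg pair in SSA MIR. The helper below is an assumed
// example; only llvm::getVRegSubRegDef() above is real.
static const MachineInstr *exampleFindSubRegDef(Register Reg, unsigned SubReg,
                                                MachineRegisterInfo &MRI) {
  TargetInstrInfo::RegSubRegPair P(Reg, SubReg);
  // Follows COPY/V_MOV_B32_e32, REG_SEQUENCE and INSERT_SUBREG as implemented
  // above; returns nullptr when the definition cannot be followed.
  return llvm::getVRegSubRegDef(P, MRI);
}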
10133
10134bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10135 Register VReg,
10136 const MachineInstr &DefMI,
10137 const MachineInstr &UseMI) {
10138 assert(MRI.isSSA() && "Must be run on SSA");
10139
10140 auto *TRI = MRI.getTargetRegisterInfo();
10141 auto *DefBB = DefMI.getParent();
10142
10143 // Don't bother searching between blocks, although it is possible this block
10144 // doesn't modify exec.
10145 if (UseMI.getParent() != DefBB)
10146 return true;
10147
10148 const int MaxInstScan = 20;
10149 int NumInst = 0;
10150
10151 // Stop scan at the use.
10152 auto E = UseMI.getIterator();
10153 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10154 if (I->isDebugInstr())
10155 continue;
10156
10157 if (++NumInst > MaxInstScan)
10158 return true;
10159
10160 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10161 return true;
10162 }
10163
10164 return false;
10165}
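// Editorial sketch, not part of SIInstrInfo.cpp: the typical way a fold would
// be guarded with the helper above. The wrapper is an assumed example.
static bool exampleCanFoldAcrossExec(const MachineRegisterInfo &MRI,
                                     Register Reg, const MachineInstr &DefMI,
                                     const MachineInstr &UseMI) {
  // If EXEC may change between the def and the use, the set of active lanes
  // can differ at the use, so propagating the value would not be sound.
  return !llvm::execMayBeModifiedBeforeUse(MRI, Reg, DefMI, UseMI);
}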
10166
10167bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10168 Register VReg,
10169 const MachineInstr &DefMI) {
10170 assert(MRI.isSSA() && "Must be run on SSA");
10171
10172 auto *TRI = MRI.getTargetRegisterInfo();
10173 auto *DefBB = DefMI.getParent();
10174
10175 const int MaxUseScan = 10;
10176 int NumUse = 0;
10177
10178 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10179 auto &UseInst = *Use.getParent();
10180 // Don't bother searching between blocks, although it is possible this block
10181 // doesn't modify exec.
10182 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10183 return true;
10184
10185 if (++NumUse > MaxUseScan)
10186 return true;
10187 }
10188
10189 if (NumUse == 0)
10190 return false;
10191
10192 const int MaxInstScan = 20;
10193 int NumInst = 0;
10194
10195 // Stop scan when we have seen all the uses.
10196 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10197 assert(I != DefBB->end());
10198
10199 if (I->isDebugInstr())
10200 continue;
10201
10202 if (++NumInst > MaxInstScan)
10203 return true;
10204
10205 for (const MachineOperand &Op : I->operands()) {
10206 // We don't check reg masks here as they're used only on calls:
10207 // 1. EXEC is only considered const within one BB
10208 // 2. Call should be a terminator instruction if present in a BB
10209
10210 if (!Op.isReg())
10211 continue;
10212
10213 Register Reg = Op.getReg();
10214 if (Op.isUse()) {
10215 if (Reg == VReg && --NumUse == 0)
10216 return false;
10217 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10218 return true;
10219 }
10220 }
10221}
10222
10223MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10224 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10225 const DebugLoc &DL, Register Src, Register Dst) const {
10226 auto Cur = MBB.begin();
10227 if (Cur != MBB.end())
10228 do {
10229 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10230 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10231 ++Cur;
10232 } while (Cur != MBB.end() && Cur != LastPHIIt);
10233
10234 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10235 Dst);
10236}
10237
10238MachineInstr *SIInstrInfo::createPHISourceCopy(
10239 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10240 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10241 if (InsPt != MBB.end() &&
10242 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10243 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10244 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10245 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10246 InsPt++;
10247 return BuildMI(MBB, InsPt, DL,
10248 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10249 .addReg(Src, 0, SrcSubReg)
10250 .addReg(AMDGPU::EXEC, RegState::Implicit);
10251 }
10252 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10253 Dst);
10254}
10255
10256bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10257
10258MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10259 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10260 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10261 VirtRegMap *VRM) const {
10262 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10263 //
10264 // %0:sreg_32 = COPY $m0
10265 //
10266 // We explicitly chose SReg_32 for the virtual register so such a copy might
10267 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10268 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10269 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10270 // TargetInstrInfo::foldMemoryOperand() is going to try.
10271 // A similar issue also exists with spilling and reloading $exec registers.
10272 //
10273 // To prevent that, constrain the %0 register class here.
10274 if (isFullCopyInstr(MI)) {
10275 Register DstReg = MI.getOperand(0).getReg();
10276 Register SrcReg = MI.getOperand(1).getReg();
10277 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10278 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10279 MachineRegisterInfo &MRI = MF.getRegInfo();
10280 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10281 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10282 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10283 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10284 return nullptr;
10285 }
10286 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10287 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10288 return nullptr;
10289 }
10290 }
10291 }
10292
10293 return nullptr;
10294}
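// Editorial note, not part of SIInstrInfo.cpp: the effect of the constraint
// above on the MIR from the comment, shown for illustration (assumed MIR):
//   before: %0:sreg_32 = COPY $m0
//   after:  %0:sreg_32_xm0_xexec = COPY $m0
// The fold itself is rejected (nullptr is returned), and the narrowed class
// excludes M0 and EXEC, so the allocator will not try to spill through them.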
10295
10296unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10297 const MachineInstr &MI,
10298 unsigned *PredCost) const {
10299 if (MI.isBundle()) {
10300 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10301 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10302 unsigned Lat = 0, Count = 0;
10303 for (++I; I != E && I->isBundledWithPred(); ++I) {
10304 ++Count;
10305 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10306 }
10307 return Lat + Count - 1;
10308 }
10309
10310 return SchedModel.computeInstrLatency(&MI);
10311}
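// Editorial note, not part of SIInstrInfo.cpp: a worked example of the bundle
// formula above (latencies are assumed for illustration). For a bundle whose
// three member instructions have latencies {4, 2, 1}: Lat = max = 4 and
// Count = 3, so the reported latency is Lat + Count - 1 = 6.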
10312
10313InstructionUniformity
10314SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10315 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10316 unsigned Opcode = MI.getOpcode();
10317
10318 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10319 Register Dst = MI.getOperand(0).getReg();
10320 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10321 : MI.getOperand(1).getReg();
10322 LLT DstTy = MRI.getType(Dst);
10323 LLT SrcTy = MRI.getType(Src);
10324 unsigned DstAS = DstTy.getAddressSpace();
10325 unsigned SrcAS = SrcTy.getAddressSpace();
10326 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10327 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10328 ST.hasGloballyAddressableScratch()
10329 ? InstructionUniformity::NeverUniform
10330 : InstructionUniformity::Default;
10331 };
10332
10333 // If the target supports globally addressable scratch, the mapping from
10334 // scratch memory to the flat aperture changes, so an address space cast
10335 // is no longer uniform.
10336 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10337 return HandleAddrSpaceCast(MI);
10338
10339 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10340 auto IID = GI->getIntrinsicID();
10341 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10342 return InstructionUniformity::NeverUniform;
10343 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10344 return InstructionUniformity::AlwaysUniform;
10345
10346 switch (IID) {
10347 case Intrinsic::amdgcn_addrspacecast_nonnull:
10348 return HandleAddrSpaceCast(MI);
10349 case Intrinsic::amdgcn_if:
10350 case Intrinsic::amdgcn_else:
10351 // FIXME: Uniform if second result
10352 break;
10353 }
10354
10355 return InstructionUniformity::NeverUniform;
10356 }
10357
10358 // Loads from the private and flat address spaces are divergent, because
10359 // threads can execute the load instruction with the same inputs and get
10360 // different results.
10361 //
10362 // All other loads are not divergent, because if threads issue loads with the
10363 // same arguments, they will always get the same result.
10364 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10365 Opcode == AMDGPU::G_SEXTLOAD) {
10366 if (MI.memoperands_empty())
10367 return InstructionUniformity::NeverUniform; // conservative assumption
10368
10369 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10370 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10371 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10372 })) {
10373 // At least one MMO in a non-global address space.
10374 return InstructionUniformity::NeverUniform;
10375 }
10376 return InstructionUniformity::Default;
10377 }
10378
10379 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10380 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10381 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10382 AMDGPU::isGenericAtomic(Opcode)) {
10383 return InstructionUniformity::NeverUniform;
10384 }
10385 return InstructionUniformity::Default;
10386}
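// Editorial sketch, not part of SIInstrInfo.cpp: the memory-operand test used
// for G_LOAD/G_ZEXTLOAD/G_SEXTLOAD above, restated as a standalone predicate.
// The helper name is an assumed example.
static bool exampleLoadMayBeDivergent(const MachineInstr &MI) {
  if (MI.memoperands_empty())
    return true; // Conservative, matching the code above.
  return llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
    return MMO->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
  });
}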
10387
10388InstructionUniformity
10389SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10390
10391 if (isNeverUniform(MI))
10392 return InstructionUniformity::NeverUniform;
10393
10394 unsigned opcode = MI.getOpcode();
10395 if (opcode == AMDGPU::V_READLANE_B32 ||
10396 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10397 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10398 return InstructionUniformity::AlwaysUniform;
10399
10400 if (isCopyInstr(MI)) {
10401 const MachineOperand &srcOp = MI.getOperand(1);
10402 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10403 const TargetRegisterClass *regClass =
10404 RI.getPhysRegBaseClass(srcOp.getReg());
10405 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10406 : InstructionUniformity::NeverUniform;
10407 }
10408 return InstructionUniformity::Default;
10409 }
10410
10411 // GMIR handling
10412 if (MI.isPreISelOpcode())
10413 return getGenericInstructionUniformity(MI);
10414
10415 // Atomics are divergent because they are executed sequentially: when an
10416 // atomic operation refers to the same address in each thread, then each
10417 // thread after the first sees the value written by the previous thread as
10418 // its original value.
10419
10420 if (isAtomic(MI))
10421 return InstructionUniformity::NeverUniform;
10422
10423 // Loads from the private and flat address spaces are divergent, because
10424 // threads can execute the load instruction with the same inputs and get
10425 // different results.
10426 if (isFLAT(MI) && MI.mayLoad()) {
10427 if (MI.memoperands_empty())
10428 return InstructionUniformity::NeverUniform; // conservative assumption
10429
10430 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10431 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10432 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10433 })) {
10434 // At least one MMO in a non-global address space.
10435 return InstructionUniformity::NeverUniform;
10436 }
10437
10438 return InstructionUniformity::Default;
10439 }
10440
10441 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10442 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10443
10444 // FIXME: It's conceptually broken to report this for an instruction, and not
10445 // a specific def operand. For inline asm in particular, there could be mixed
10446 // uniform and divergent results.
10447 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10448 const MachineOperand &SrcOp = MI.getOperand(I);
10449 if (!SrcOp.isReg())
10450 continue;
10451
10452 Register Reg = SrcOp.getReg();
10453 if (!Reg || !SrcOp.readsReg())
10454 continue;
10455
10456 // If RegBank is null, this is unassigned or an unallocatable special
10457 // register, which are all scalars.
10458 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10459 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10460 return InstructionUniformity::NeverUniform;
10461
10462
10463 // TODO: Uniformity check conditions above can be rearranged for more
10464 // readability
10465
10466 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10467 // currently turned into no-op COPYs by SelectionDAG ISel and are
10468 // therefore no longer recognizable.
10469
10470 return InstructionUniformity::Default;
10471}
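// Editorial note, not part of SIInstrInfo.cpp: two illustrative cases for the
// physical-register COPY rule above (assumed MIR):
//   %0:sreg_32 = COPY $sgpr4   ; SGPR source -> AlwaysUniform
//   %1:vgpr_32 = COPY $vgpr0   ; VGPR source -> NeverUniform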
10472
10473unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10474 switch (MF.getFunction().getCallingConv()) {
10475 case CallingConv::AMDGPU_PS:
10476 return 1;
10477 case CallingConv::AMDGPU_VS:
10478 return 2;
10479 case CallingConv::AMDGPU_GS:
10480 return 3;
10481 case CallingConv::AMDGPU_HS:
10482 case CallingConv::AMDGPU_LS:
10483 case CallingConv::AMDGPU_ES: {
10484 const Function &F = MF.getFunction();
10485 F.getContext().diagnose(DiagnosticInfoUnsupported(
10486 F, "ds_ordered_count unsupported for this calling conv"));
10487 [[fallthrough]];
10488 }
10489 case CallingConv::AMDGPU_CS:
10490 case CallingConv::AMDGPU_KERNEL:
10491 case CallingConv::C:
10492 case CallingConv::Fast:
10493 default:
10494 // Assume other calling conventions are various compute callable functions
10495 return 0;
10496 }
10497}
10498
10499bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10500 Register &SrcReg2, int64_t &CmpMask,
10501 int64_t &CmpValue) const {
10502 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10503 return false;
10504
10505 switch (MI.getOpcode()) {
10506 default:
10507 break;
10508 case AMDGPU::S_CMP_EQ_U32:
10509 case AMDGPU::S_CMP_EQ_I32:
10510 case AMDGPU::S_CMP_LG_U32:
10511 case AMDGPU::S_CMP_LG_I32:
10512 case AMDGPU::S_CMP_LT_U32:
10513 case AMDGPU::S_CMP_LT_I32:
10514 case AMDGPU::S_CMP_GT_U32:
10515 case AMDGPU::S_CMP_GT_I32:
10516 case AMDGPU::S_CMP_LE_U32:
10517 case AMDGPU::S_CMP_LE_I32:
10518 case AMDGPU::S_CMP_GE_U32:
10519 case AMDGPU::S_CMP_GE_I32:
10520 case AMDGPU::S_CMP_EQ_U64:
10521 case AMDGPU::S_CMP_LG_U64:
10522 SrcReg = MI.getOperand(0).getReg();
10523 if (MI.getOperand(1).isReg()) {
10524 if (MI.getOperand(1).getSubReg())
10525 return false;
10526 SrcReg2 = MI.getOperand(1).getReg();
10527 CmpValue = 0;
10528 } else if (MI.getOperand(1).isImm()) {
10529 SrcReg2 = Register();
10530 CmpValue = MI.getOperand(1).getImm();
10531 } else {
10532 return false;
10533 }
10534 CmpMask = ~0;
10535 return true;
10536 case AMDGPU::S_CMPK_EQ_U32:
10537 case AMDGPU::S_CMPK_EQ_I32:
10538 case AMDGPU::S_CMPK_LG_U32:
10539 case AMDGPU::S_CMPK_LG_I32:
10540 case AMDGPU::S_CMPK_LT_U32:
10541 case AMDGPU::S_CMPK_LT_I32:
10542 case AMDGPU::S_CMPK_GT_U32:
10543 case AMDGPU::S_CMPK_GT_I32:
10544 case AMDGPU::S_CMPK_LE_U32:
10545 case AMDGPU::S_CMPK_LE_I32:
10546 case AMDGPU::S_CMPK_GE_U32:
10547 case AMDGPU::S_CMPK_GE_I32:
10548 SrcReg = MI.getOperand(0).getReg();
10549 SrcReg2 = Register();
10550 CmpValue = MI.getOperand(1).getImm();
10551 CmpMask = ~0;
10552 return true;
10553 }
10554
10555 return false;
10556}
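// Editorial note, not part of SIInstrInfo.cpp: what analyzeCompare() reports
// for an illustrative compare (assumed MIR):
//   S_CMP_EQ_U32 %5:sreg_32, 64, implicit-def $scc
// yields SrcReg = %5, SrcReg2 = <no register>, CmpValue = 64, CmpMask = ~0.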
10557
10558bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10559 Register SrcReg2, int64_t CmpMask,
10560 int64_t CmpValue,
10561 const MachineRegisterInfo *MRI) const {
10562 if (!SrcReg || SrcReg.isPhysical())
10563 return false;
10564
10565 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10566 return false;
10567
10568 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10569 this](int64_t ExpectedValue, unsigned SrcSize,
10570 bool IsReversible, bool IsSigned) -> bool {
10571 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10572 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10573 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10574 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10575 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10576 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10577 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10578 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10579 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10580 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10581 //
10582 // Signed ge/gt are not used for the sign bit.
10583 //
10584 // If the result of the AND is unused except in the compare:
10585 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10586 //
10587 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10588 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10589 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10590 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10591 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10592 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10593
10594 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10595 if (!Def || Def->getParent() != CmpInstr.getParent())
10596 return false;
10597
10598 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10599 Def->getOpcode() != AMDGPU::S_AND_B64)
10600 return false;
10601
10602 int64_t Mask;
10603 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10604 if (MO->isImm())
10605 Mask = MO->getImm();
10606 else if (!getFoldableImm(MO, Mask))
10607 return false;
10608 Mask &= maxUIntN(SrcSize);
10609 return isPowerOf2_64(Mask);
10610 };
10611
10612 MachineOperand *SrcOp = &Def->getOperand(1);
10613 if (isMask(SrcOp))
10614 SrcOp = &Def->getOperand(2);
10615 else if (isMask(&Def->getOperand(2)))
10616 SrcOp = &Def->getOperand(1);
10617 else
10618 return false;
10619
10620 // A valid Mask is required to have a single bit set, hence a non-zero and
10621 // power-of-two value. This verifies that we will not do a 64-bit shift below.
10622 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10623 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10624 if (IsSigned && BitNo == SrcSize - 1)
10625 return false;
10626
10627 ExpectedValue <<= BitNo;
10628
10629 bool IsReversedCC = false;
10630 if (CmpValue != ExpectedValue) {
10631 if (!IsReversible)
10632 return false;
10633 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10634 if (!IsReversedCC)
10635 return false;
10636 }
10637
10638 Register DefReg = Def->getOperand(0).getReg();
10639 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10640 return false;
10641
10642 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10643 I != E; ++I) {
10644 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10645 I->killsRegister(AMDGPU::SCC, &RI))
10646 return false;
10647 }
10648
10649 MachineOperand *SccDef =
10650 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10651 SccDef->setIsDead(false);
10652 CmpInstr.eraseFromParent();
10653
10654 if (!MRI->use_nodbg_empty(DefReg)) {
10655 assert(!IsReversedCC);
10656 return true;
10657 }
10658
10659 // Replace AND with unused result with a S_BITCMP.
10660 MachineBasicBlock *MBB = Def->getParent();
10661
10662 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10663 : AMDGPU::S_BITCMP1_B32
10664 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10665 : AMDGPU::S_BITCMP1_B64;
10666
10667 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10668 .add(*SrcOp)
10669 .addImm(BitNo);
10670 Def->eraseFromParent();
10671
10672 return true;
10673 };
10674
10675 switch (CmpInstr.getOpcode()) {
10676 default:
10677 break;
10678 case AMDGPU::S_CMP_EQ_U32:
10679 case AMDGPU::S_CMP_EQ_I32:
10680 case AMDGPU::S_CMPK_EQ_U32:
10681 case AMDGPU::S_CMPK_EQ_I32:
10682 return optimizeCmpAnd(1, 32, true, false);
10683 case AMDGPU::S_CMP_GE_U32:
10684 case AMDGPU::S_CMPK_GE_U32:
10685 return optimizeCmpAnd(1, 32, false, false);
10686 case AMDGPU::S_CMP_GE_I32:
10687 case AMDGPU::S_CMPK_GE_I32:
10688 return optimizeCmpAnd(1, 32, false, true);
10689 case AMDGPU::S_CMP_EQ_U64:
10690 return optimizeCmpAnd(1, 64, true, false);
10691 case AMDGPU::S_CMP_LG_U32:
10692 case AMDGPU::S_CMP_LG_I32:
10693 case AMDGPU::S_CMPK_LG_U32:
10694 case AMDGPU::S_CMPK_LG_I32:
10695 return optimizeCmpAnd(0, 32, true, false);
10696 case AMDGPU::S_CMP_GT_U32:
10697 case AMDGPU::S_CMPK_GT_U32:
10698 return optimizeCmpAnd(0, 32, false, false);
10699 case AMDGPU::S_CMP_GT_I32:
10700 case AMDGPU::S_CMPK_GT_I32:
10701 return optimizeCmpAnd(0, 32, false, true);
10702 case AMDGPU::S_CMP_LG_U64:
10703 return optimizeCmpAnd(0, 64, true, false);
10704 }
10705
10706 return false;
10707}
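// Editorial note, not part of SIInstrInfo.cpp: one concrete instance of the
// optimizeCmpAnd() patterns above, with bit n = 2 so 1 << n = 4 (assumed MIR):
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def dead $scc
//   S_CMP_EQ_U32 %1, 4, implicit-def $scc
// The compare is erased and the AND's SCC def is marked live. If %1 has no
// other uses, the AND is further replaced by:
//   S_BITCMP1_B32 %0, 2, implicit-def $scc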
10708
10709void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10710 AMDGPU::OpName OpName) const {
10711 if (!ST.needsAlignedVGPRs())
10712 return;
10713
10714 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10715 if (OpNo < 0)
10716 return;
10717 MachineOperand &Op = MI.getOperand(OpNo);
10718 if (getOpSize(MI, OpNo) > 4)
10719 return;
10720
10721 // Add implicit aligned super-reg to force alignment on the data operand.
10722 const DebugLoc &DL = MI.getDebugLoc();
10723 MachineBasicBlock *BB = MI.getParent();
10724 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10725 Register DataReg = Op.getReg();
10726 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10727 Register Undef = MRI.createVirtualRegister(
10728 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10729 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10730 Register NewVR =
10731 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10732 : &AMDGPU::VReg_64_Align2RegClass);
10733 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10734 .addReg(DataReg, 0, Op.getSubReg())
10735 .addImm(AMDGPU::sub0)
10736 .addReg(Undef)
10737 .addImm(AMDGPU::sub1);
10738 Op.setReg(NewVR);
10739 Op.setSubReg(AMDGPU::sub0);
10740 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10741}
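// Editorial note, not part of SIInstrInfo.cpp: the rewrite performed above on
// a 32-bit data operand, in assumed MIR form:
//   %u:vgpr_32 = IMPLICIT_DEF
//   %p:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %u, %subreg.sub1
// The instruction then reads %p.sub0 and gains an implicit use of %p, so the
// register allocator must assign %data to an even-aligned register pair.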
10742
10743bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10744 if (isIGLP(*MI))
10745 return false;
10746
10747 return TargetInstrInfo::isGlobalMemoryObject(MI);
10748}
10749
10750bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10751 if (!isWMMA(MI) && !isSWMMAC(MI))
10752 return false;
10753
10754 if (AMDGPU::isGFX1250(ST))
10755 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10756
10757 return true;
10758}
10759
10760bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10761 unsigned Opcode = MI.getOpcode();
10762
10763 if (AMDGPU::isGFX12Plus(ST))
10764 return isDOT(MI) || isXDLWMMA(MI);
10765
10766 if (!isMAI(MI) || isDGEMM(Opcode) ||
10767 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10768 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10769 return false;
10770
10771 if (!ST.hasGFX940Insts())
10772 return true;
10773
10774 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10775}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
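A hedged usage sketch for the two pseudo-opcode helpers above (pseudoToMCOpcode / getMCOpcodeFromPseudo), assuming this file's include context; the wrapper name hasMCEncoding is illustrative, not an in-tree API.
  static bool hasMCEncoding(const SIInstrInfo &TII, const MachineInstr &MI) {
    // pseudoToMCOpcode returns -1 when the pseudo has no real encoding on the
    // current subtarget generation, so emission paths must handle that case.
    return TII.pseudoToMCOpcode(MI.getOpcode()) != -1;
  }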
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
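A minimal sketch of how getNamedOperand / getNamedImmOperand are typically consulted, assuming this file's include context; the helper name readFlatOffset and the choice of the offset operand are illustrative only.
  static int64_t readFlatOffset(const SIInstrInfo &TII, const MachineInstr &MI) {
    // getNamedOperand returns nullptr when the instruction has no such operand.
    if (const MachineOperand *Off =
            TII.getNamedOperand(MI, AMDGPU::OpName::offset))
      return Off->getImm();
    return 0;
  }
  // When the operand is known to exist, getNamedImmOperand(MI,
  // AMDGPU::OpName::offset) returns the immediate directly.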
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the operand at index OpIdx of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
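A minimal sketch of the range check this helper supports, assuming the MathExtras predicates listed later in this index; the real SIInstrInfo::isLegalFLATOffset additionally considers the address space and FLAT variant.
  static bool offsetFitsFlatField(const MCSubtargetInfo &STI, int64_t Offset,
                                  bool AllowNegative) {
    // Width of the immediate offset field for this subtarget.
    unsigned NumBits = AMDGPU::getNumFlatOffsetBits(STI);
    // Signed check when negative offsets are allowed, unsigned otherwise.
    return AllowNegative ? isIntN(NumBits, Offset) : isUIntN(NumBits, Offset);
  }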
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point use.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:574
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:576
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:573
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:575
@ TI_CONSTDATA_START
Definition AMDGPU.h:572
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
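A hedged sketch of how the inline-literal predicates are typically used when classifying an immediate; HasInv2Pi normally comes from the subtarget (e.g. hasInv2PiInlineImm()), and the wrapper name fitsInline is illustrative.
  static bool fitsInline(int64_t Imm, bool Is64Bit, bool HasInv2Pi) {
    // Immediates that are not inlinable consume a literal encoding and the
    // single constant bus slot.
    return Is64Bit ? AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi)
                   : AMDGPU::isInlinableLiteral32(int32_t(Imm), HasInv2Pi);
  }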
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
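A hedged sketch of how this query is used when deciding whether a def/use pair can be folded; the wrapper name and parameters mirror the signature above and are illustrative.
  static bool execUnchangedBetweenDefAndUse(const MachineRegisterInfo &MRI,
                                            Register VReg,
                                            const MachineInstr &DefMI,
                                            const MachineInstr &UseMI) {
    // The helper returns false when EXEC is provably unchanged, so the negated
    // call is the "def and use run under the same exec mask" condition.
    return !execMayBeModifiedBeforeUse(MRI, VReg, DefMI, UseMI);
  }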
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
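A hedged sketch of the common split pattern Lo_32 / Hi_32 support, e.g. when a 64-bit move is expanded into two 32-bit halves; the helper name splitImm64 is illustrative.
  static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
    // {bits [31:0], bits [63:32]}
    return {Lo_32(Imm), Hi_32(Imm)};
  }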
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.