LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
// Registered under the option name "amdgpu-s-branch-bits", marked
// cl::ReallyHidden, with an initial value of 16.
// NOTE(review): the declarator line that precedes the variable name (listing
// line 56) is absent from this listing.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p OpName, or if both nodes do not have this operand.
/// Returns false if only one of the two nodes has the operand.
// NOTE(review): the opening line of the signature (listing line 86) is absent
// from this listing; the callers visible in this file pass the two nodes being
// compared followed by the operand name.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
 // Neither opcode has the named operand: treated as "same".
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
 // Exactly one opcode has the named operand: the nodes differ.
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
/// Returns true if \p MI is of a kind that this file is prepared to consider
/// for rematerialization; the caller applies further checks on top.
112static bool canRemat(const MachineInstr &MI) {
113
 // NOTE(review): the condition guarding the following return (listing lines
 // 114-116) is absent from this listing, so the return only appears to be
 // unconditional here.
117 return true;
118
 // SMRD instructions qualify only if they carry at least one memory operand
 // and every memory operand is an invariant load.
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // There is difference to generic method which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI, /*AllowLDSDMA=*/true));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent(), /*AllowLDSDMA=*/true) &&
179 !resultDependsOnExec(*MO.getParent());
180}
181
183 MachineBasicBlock *SuccToSinkTo,
184 MachineCycleInfo *CI) const {
185 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
186 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
187 return true;
188
189 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
190 // Check if sinking of MI would create temporal divergent use.
191 for (auto Op : MI.uses()) {
192 if (Op.isReg() && Op.getReg().isVirtual() &&
193 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
194 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
195
196 // SgprDef defined inside cycle
197 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
198 if (FromCycle == nullptr)
199 continue;
200
201 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
202 // Check if there is a FromCycle that contains SgprDef's basic block but
203 // does not contain SuccToSinkTo and also has divergent exit condition.
204 while (FromCycle && !FromCycle->contains(ToCycle)) {
206 FromCycle->getExitingBlocks(ExitingBlocks);
207
208 // FromCycle has divergent exit condition.
209 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
210 if (hasDivergentBranch(ExitingBlock))
211 return false;
212 }
213
214 FromCycle = FromCycle->getParentCycle();
215 }
216 }
217 }
218
219 return true;
220}
221
223 int64_t &Offset0,
224 int64_t &Offset1) const {
225 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
226 return false;
227
228 unsigned Opc0 = Load0->getMachineOpcode();
229 unsigned Opc1 = Load1->getMachineOpcode();
230
231 // Make sure both are actually loads.
232 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
233 return false;
234
235 // A mayLoad instruction without a def is not a load. Likely a prefetch.
236 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
237 return false;
238
239 if (isDS(Opc0) && isDS(Opc1)) {
240
241 // FIXME: Handle this case:
242 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
243 return false;
244
245 // Check base reg.
246 if (Load0->getOperand(0) != Load1->getOperand(0))
247 return false;
248
249 // Skip read2 / write2 variants for simplicity.
250 // TODO: We should report true if the used offsets are adjacent (excluded
251 // st64 versions).
252 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
253 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
254 if (Offset0Idx == -1 || Offset1Idx == -1)
255 return false;
256
257 // XXX - be careful of dataless loads
258 // getNamedOperandIdx returns the index for MachineInstrs. Since they
259 // include the output in the operand list, but SDNodes don't, we need to
260 // subtract the index by one.
261 Offset0Idx -= get(Opc0).NumDefs;
262 Offset1Idx -= get(Opc1).NumDefs;
263 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
264 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
265 return true;
266 }
267
268 if (isSMRD(Opc0) && isSMRD(Opc1)) {
269 // Skip time and cache invalidation instructions.
270 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
271 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
272 return false;
273
274 unsigned NumOps = getNumOperandsNoGlue(Load0);
275 if (NumOps != getNumOperandsNoGlue(Load1))
276 return false;
277
278 // Check base reg.
279 if (Load0->getOperand(0) != Load1->getOperand(0))
280 return false;
281
282 // Match register offsets, if both register and immediate offsets present.
283 assert(NumOps == 4 || NumOps == 5);
284 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
285 return false;
286
287 const ConstantSDNode *Load0Offset =
289 const ConstantSDNode *Load1Offset =
291
292 if (!Load0Offset || !Load1Offset)
293 return false;
294
295 Offset0 = Load0Offset->getZExtValue();
296 Offset1 = Load1Offset->getZExtValue();
297 return true;
298 }
299
300 // MUBUF and MTBUF can access the same addresses.
301 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
302
303 // MUBUF and MTBUF have vaddr at different indices.
304 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
306 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
307 return false;
308
309 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
310 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
311
312 if (OffIdx0 == -1 || OffIdx1 == -1)
313 return false;
314
315 // getNamedOperandIdx returns the index for MachineInstrs. Since they
316 // include the output in the operand list, but SDNodes don't, we need to
317 // subtract the index by one.
318 OffIdx0 -= get(Opc0).NumDefs;
319 OffIdx1 -= get(Opc1).NumDefs;
320
321 SDValue Off0 = Load0->getOperand(OffIdx0);
322 SDValue Off1 = Load1->getOperand(OffIdx1);
323
324 // The offset might be a FrameIndexSDNode.
325 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
326 return false;
327
328 Offset0 = Off0->getAsZExtVal();
329 Offset1 = Off1->getAsZExtVal();
330 return true;
331 }
332
333 return false;
334}
335
336static bool isStride64(unsigned Opc) {
337 switch (Opc) {
338 case AMDGPU::DS_READ2ST64_B32:
339 case AMDGPU::DS_READ2ST64_B64:
340 case AMDGPU::DS_WRITE2ST64_B32:
341 case AMDGPU::DS_WRITE2ST64_B64:
342 return true;
343 default:
344 return false;
345 }
346}
347
350 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
351 const TargetRegisterInfo *TRI) const {
352 if (!LdSt.mayLoadOrStore())
353 return false;
354
355 unsigned Opc = LdSt.getOpcode();
356 OffsetIsScalable = false;
357 const MachineOperand *BaseOp, *OffsetOp;
358 int DataOpIdx;
359
360 if (isDS(LdSt)) {
361 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
362 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
363 if (OffsetOp) {
364 // Normal, single offset LDS instruction.
365 if (!BaseOp) {
366 // DS_CONSUME/DS_APPEND use M0 for the base address.
367 // TODO: find the implicit use operand for M0 and use that as BaseOp?
368 return false;
369 }
370 BaseOps.push_back(BaseOp);
371 Offset = OffsetOp->getImm();
372 // Get appropriate operand, and compute width accordingly.
373 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
374 if (DataOpIdx == -1)
375 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
376 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
377 Width = LocationSize::precise(64);
378 else
379 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
380 } else {
381 // The 2 offset instructions use offset0 and offset1 instead. We can treat
382 // these as a load with a single offset if the 2 offsets are consecutive.
383 // We will use this for some partially aligned loads.
384 const MachineOperand *Offset0Op =
385 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
386 const MachineOperand *Offset1Op =
387 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
388
389 unsigned Offset0 = Offset0Op->getImm() & 0xff;
390 unsigned Offset1 = Offset1Op->getImm() & 0xff;
391 if (Offset0 + 1 != Offset1)
392 return false;
393
394 // Each of these offsets is in element sized units, so we need to convert
395 // to bytes of the individual reads.
396
397 unsigned EltSize;
398 if (LdSt.mayLoad())
399 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
400 else {
401 assert(LdSt.mayStore());
402 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
403 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
404 }
405
406 if (isStride64(Opc))
407 EltSize *= 64;
408
409 BaseOps.push_back(BaseOp);
410 Offset = EltSize * Offset0;
411 // Get appropriate operand(s), and compute width accordingly.
412 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
413 if (DataOpIdx == -1) {
414 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
415 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
416 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
417 Width = LocationSize::precise(
418 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
419 } else {
420 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
421 }
422 }
423 return true;
424 }
425
426 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
427 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
428 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
429 return false;
430 BaseOps.push_back(RSrc);
431 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
432 if (BaseOp && !BaseOp->isFI())
433 BaseOps.push_back(BaseOp);
434 const MachineOperand *OffsetImm =
435 getNamedOperand(LdSt, AMDGPU::OpName::offset);
436 Offset = OffsetImm->getImm();
437 const MachineOperand *SOffset =
438 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
439 if (SOffset) {
440 if (SOffset->isReg())
441 BaseOps.push_back(SOffset);
442 else
443 Offset += SOffset->getImm();
444 }
445 // Get appropriate operand, and compute width accordingly.
446 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
447 if (DataOpIdx == -1)
448 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
449 if (DataOpIdx == -1) // LDS DMA
450 return false;
451 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
452 return true;
453 }
454
455 if (isImage(LdSt)) {
456 auto RsrcOpName =
457 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
458 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
459 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
460 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
461 if (VAddr0Idx >= 0) {
462 // GFX10 possible NSA encoding.
463 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
464 BaseOps.push_back(&LdSt.getOperand(I));
465 } else {
466 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
467 }
468 Offset = 0;
469 // Get appropriate operand, and compute width accordingly.
470 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
471 if (DataOpIdx == -1)
472 return false; // no return sampler
473 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
474 return true;
475 }
476
477 if (isSMRD(LdSt)) {
478 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
479 if (!BaseOp) // e.g. S_MEMTIME
480 return false;
481 BaseOps.push_back(BaseOp);
482 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
483 Offset = OffsetOp ? OffsetOp->getImm() : 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
486 if (DataOpIdx == -1)
487 return false;
488 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
489 return true;
490 }
491
492 if (isFLAT(LdSt)) {
493 // Instructions have either vaddr or saddr or both or none.
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
495 if (BaseOp)
496 BaseOps.push_back(BaseOp);
497 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
498 if (BaseOp)
499 BaseOps.push_back(BaseOp);
500 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
503 if (DataOpIdx == -1)
504 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
505 if (DataOpIdx == -1) // LDS DMA
506 return false;
507 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
508 return true;
509 }
510
511 return false;
512}
513
/// Decides whether two memory instructions address off the same base.
///
/// Returns true immediately if the first base operand of each instruction is
/// identical. Otherwise falls back to the IR values attached to the memory
/// operands: both instructions must have exactly one memory operand, in the
/// same address space, each with a non-null IR value, and the underlying
/// objects of those values must be the same non-undef object.
// NOTE(review): the parameter lines declaring BaseOps1 and BaseOps2 (listing
// lines 515 and 517) are absent from this listing.
514static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
516 const MachineInstr &MI2,
518 // Only examine the first "base" operand of each instruction, on the
519 // assumption that it represents the real base address of the memory access.
520 // Other operands are typically offsets or indices from this base address.
521 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
522 return true;
523
 // The fallback below inspects a single memory operand per instruction.
524 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
525 return false;
526
527 auto *MO1 = *MI1.memoperands_begin();
528 auto *MO2 = *MI2.memoperands_begin();
529 if (MO1->getAddrSpace() != MO2->getAddrSpace())
530 return false;
531
 // A memory operand without an IR value gives nothing to compare.
532 const auto *Base1 = MO1->getValue();
533 const auto *Base2 = MO2->getValue();
534 if (!Base1 || !Base2)
535 return false;
536 Base1 = getUnderlyingObject(Base1);
537 Base2 = getUnderlyingObject(Base2);
538
 // Undef underlying objects are never reported as the same base.
539 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
540 return false;
541
542 return Base1 == Base2;
543}
544
546 int64_t Offset1, bool OffsetIsScalable1,
548 int64_t Offset2, bool OffsetIsScalable2,
549 unsigned ClusterSize,
550 unsigned NumBytes) const {
551 // If the mem ops (to be clustered) do not have the same base ptr, then they
552 // should not be clustered
553 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
554 if (!BaseOps1.empty() && !BaseOps2.empty()) {
555 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
556 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
557 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
558 return false;
559
560 const SIMachineFunctionInfo *MFI =
561 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
562 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // In order to avoid register pressure, on an average, the number of DWORDS
569 // loaded together by all clustered mem ops should not exceed
570 // MaxMemoryClusterDWords. This is an empirical value based on certain
571 // observations and performance related experiments.
572 // The good thing about this heuristic is - it avoids clustering of too many
573 // sub-word loads, and also avoids clustering of wide loads. Below is the
574 // brief summary of how the heuristic behaves for various `LoadSize` when
575 // MaxMemoryClusterDWords is 8.
576 //
577 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
578 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
579 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
580 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
581 // (5) LoadSize >= 17: do not cluster
582 const unsigned LoadSize = NumBytes / ClusterSize;
583 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
584 return NumDWords <= MaxMemoryClusterDWords;
585}
586
587// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
588// the first 16 loads will be interleaved with the stores, and the next 16 will
589// be clustered as expected. It should really split into 2 16 store batches.
590//
591// Loads are clustered until this returns false, rather than trying to schedule
592// groups of stores. This also means we have to deal with saying different
593// address space loads should be clustered, and ones which might cause bank
594// conflicts.
595//
596// This might be deprecated so it might not be worth that much effort to fix.
598 int64_t Offset0, int64_t Offset1,
599 unsigned NumLoads) const {
600 assert(Offset1 > Offset0 &&
601 "Second offset should be larger than first offset!");
602 // If we have less than 16 loads in a row, and the offsets are within 64
603 // bytes, then schedule together.
604
605 // A cacheline is 64 bytes (for global memory).
606 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
607}
608
// Reports a copy that cannot be lowered: raises a DiagnosticInfoUnsupported
// (severity DS_Error) carrying \p Msg at \p DL, then inserts an
// SI_ILLEGAL_COPY from \p SrcReg to \p DestReg before \p MI.
// \p Msg defaults to "illegal VGPR to SGPR copy".
// NOTE(review): the opening lines of the signature (listing lines 609-610) and
// the declaration of `C` (listing line 616) are absent from this listing.
611 const DebugLoc &DL, MCRegister DestReg,
612 MCRegister SrcReg, bool KillSrc,
613 const char *Msg = "illegal VGPR to SGPR copy") {
614 MachineFunction *MF = MBB.getParent();
615
617 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
618
 // Emit SI_ILLEGAL_COPY in place of the copy, preserving the kill state of
 // the source.
619 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
620 .addReg(SrcReg, getKillRegState(KillSrc));
621}
622
623/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
624/// possible to have a direct copy in these cases on GFX908, so an intermediate
625/// VGPR copy is required.
///
/// Two strategies are tried in order: first, if \p RegsOverlap is false, reuse
/// the source operand of an earlier V_ACCVGPR_WRITE_B32_e64 that defined
/// \p SrcReg; otherwise copy \p SrcReg into a temporary VGPR and write that
/// VGPR to \p DestReg. \p ImpDefSuperReg and \p ImpUseSuperReg, when set, are
/// attached to the emitted instructions as additional operands.
// NOTE(review): the opening lines of the signature (listing lines 626-628) and
// the second argument of two addReg calls (listing lines 688 and 735) are
// absent from this listing.
629 const DebugLoc &DL, MCRegister DestReg,
630 MCRegister SrcReg, bool KillSrc,
631 RegScavenger &RS, bool RegsOverlap,
632 Register ImpDefSuperReg = Register(),
633 Register ImpUseSuperReg = Register()) {
634 assert((TII.getSubtarget().hasMAIInsts() &&
635 !TII.getSubtarget().hasGFX90AInsts()) &&
636 "Expected GFX908 subtarget.");
637
638 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
639 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
640 "Source register of the copy should be either an SGPR or an AGPR.");
641
642 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
643 "Destination register of the copy should be an AGPR.");
644
645 const SIRegisterInfo &RI = TII.getRegisterInfo();
646
647 // First try to find defining accvgpr_write to avoid temporary registers.
648 // In the case of copies of overlapping AGPRs, we conservatively do not
649 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
650 // an accvgpr_write used for this same copy due to implicit-defs
651 if (!RegsOverlap) {
 // Scan backwards from MI towards the start of the block for the nearest
 // instruction that modifies SrcReg.
652 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
653 --Def;
654
655 if (!Def->modifiesRegister(SrcReg, &RI))
656 continue;
657
 // The nearest modifier must be an accvgpr_write whose destination is
 // exactly SrcReg; anything else abandons this strategy.
658 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
659 Def->getOperand(0).getReg() != SrcReg)
660 break;
661
662 MachineOperand &DefOp = Def->getOperand(1);
663 assert(DefOp.isReg() || DefOp.isImm());
664
665 if (DefOp.isReg()) {
666 bool SafeToPropagate = true;
667 // Check that register source operand is not clobbered before MI.
668 // Immediate operands are always safe to propagate.
669 for (auto I = Def; I != MI && SafeToPropagate; ++I)
670 if (I->modifiesRegister(DefOp.getReg(), &RI))
671 SafeToPropagate = false;
672
673 if (!SafeToPropagate)
674 break;
675
 // The register gains a new use at MI, so drop kill flags on it
 // between Def and MI.
676 for (auto I = Def; I != MI; ++I)
677 I->clearRegisterKills(DefOp.getReg(), &RI);
678 }
679
680 MachineInstrBuilder Builder =
681 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
682 .add(DefOp);
683 if (ImpDefSuperReg)
684 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
685
686 if (ImpUseSuperReg) {
687 Builder.addReg(ImpUseSuperReg,
689 }
690
691 return;
692 }
693 }
694
 // Fallback: go through a temporary VGPR. Position the scavenger just after
 // MI so registers can be scavenged backwards at MI.
695 RS.enterBasicBlockEnd(MBB);
696 RS.backward(std::next(MI));
697
698 // Ideally we want to have three registers for a long reg_sequence copy
699 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
700 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
701 *MBB.getParent());
702
703 // Registers in the sequence are allocated contiguously so we can just
704 // use register number to pick one of three round-robin temps.
705 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
 // Default temporary: the VGPR the function info provides for AGPR copies.
706 Register Tmp =
707 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
708 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
709 "VGPR used for an intermediate copy should have been reserved.");
710
711 // Only loop through if there are any free registers left. We don't want to
712 // spill.
 // Each successful iteration replaces Tmp with a freshly scavenged VGPR and
 // marks it used, so the next iteration yields a different register.
713 while (RegNo--) {
714 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
715 /* RestoreAfter */ false, 0,
716 /* AllowSpill */ false);
717 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
718 break;
719 Tmp = Tmp2;
720 RS.setRegUsed(Tmp);
721 }
722
723 // Insert copy to temporary VGPR.
724 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
725 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
726 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
727 } else {
728 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
729 }
730
731 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
732 .addReg(SrcReg, getKillRegState(KillSrc));
733 if (ImpUseSuperReg) {
734 UseBuilder.addReg(ImpUseSuperReg,
736 }
737
 // Write the temporary into the destination AGPR; the temporary dies here.
738 MachineInstrBuilder DefBuilder
739 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
740 .addReg(Tmp, RegState::Kill);
741
742 if (ImpDefSuperReg)
743 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
744}
745
748 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
749 const TargetRegisterClass *RC, bool Forward) {
750 const SIRegisterInfo &RI = TII.getRegisterInfo();
751 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
753 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
754
755 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
756 int16_t SubIdx = BaseIndices[Idx];
757 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
758 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
759 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
760 unsigned Opcode = AMDGPU::S_MOV_B32;
761
762 // Is SGPR aligned? If so try to combine with next.
763 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
764 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
765 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
766 // Can use SGPR64 copy
767 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
768 SubIdx = RI.getSubRegFromChannel(Channel, 2);
769 DestSubReg = RI.getSubReg(DestReg, SubIdx);
770 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
771 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
772 Opcode = AMDGPU::S_MOV_B64;
773 Idx++;
774 }
775
776 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
777 .addReg(SrcSubReg)
778 .addReg(SrcReg, RegState::Implicit);
779
780 if (!FirstMI)
781 FirstMI = LastMI;
782
783 if (!Forward)
784 I--;
785 }
786
787 assert(FirstMI && LastMI);
788 if (!Forward)
789 std::swap(FirstMI, LastMI);
790
791 FirstMI->addOperand(
792 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
793
794 if (KillSrc)
795 LastMI->addRegisterKilled(SrcReg, &RI);
796}
797
800 const DebugLoc &DL, Register DestReg,
801 Register SrcReg, bool KillSrc, bool RenamableDest,
802 bool RenamableSrc) const {
803 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
804 unsigned Size = RI.getRegSizeInBits(*RC);
805 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
806 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
807
808 // The rest of copyPhysReg assumes Src and Dst size are the same size.
809 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
810 // we remove Fix16BitCopies and this code block?
811 if (Fix16BitCopies) {
812 if (((Size == 16) != (SrcSize == 16))) {
813 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
814 assert(ST.useRealTrue16Insts());
815 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
816 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
817 RegToFix = SubReg;
818
819 if (DestReg == SrcReg) {
820 // Identity copy. Insert empty bundle since ExpandPostRA expects an
821 // instruction here.
822 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
823 return;
824 }
825 RC = RI.getPhysRegBaseClass(DestReg);
826 Size = RI.getRegSizeInBits(*RC);
827 SrcRC = RI.getPhysRegBaseClass(SrcReg);
828 SrcSize = RI.getRegSizeInBits(*SrcRC);
829 }
830 }
831
832 if (RC == &AMDGPU::VGPR_32RegClass) {
833 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
834 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
835 AMDGPU::AGPR_32RegClass.contains(SrcReg));
836 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
837 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
838 BuildMI(MBB, MI, DL, get(Opc), DestReg)
839 .addReg(SrcReg, getKillRegState(KillSrc));
840 return;
841 }
842
843 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
844 RC == &AMDGPU::SReg_32RegClass) {
845 if (SrcReg == AMDGPU::SCC) {
846 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
847 .addImm(1)
848 .addImm(0);
849 return;
850 }
851
852 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
853 if (DestReg == AMDGPU::VCC_LO) {
854 // FIXME: Hack until VReg_1 removed.
855 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
856 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
857 .addImm(0)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 return;
860 }
861
862 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
863 return;
864 }
865
866 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
867 .addReg(SrcReg, getKillRegState(KillSrc));
868 return;
869 }
870
871 if (RC == &AMDGPU::SReg_64RegClass) {
872 if (SrcReg == AMDGPU::SCC) {
873 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
874 .addImm(1)
875 .addImm(0);
876 return;
877 }
878
879 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
880 if (DestReg == AMDGPU::VCC) {
881 // FIXME: Hack until VReg_1 removed.
882 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
883 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
884 .addImm(0)
885 .addReg(SrcReg, getKillRegState(KillSrc));
886 return;
887 }
888
889 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
890 return;
891 }
892
893 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
894 .addReg(SrcReg, getKillRegState(KillSrc));
895 return;
896 }
897
898 if (DestReg == AMDGPU::SCC) {
899 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
900 // but SelectionDAG emits such copies for i1 sources.
901 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
902 // This copy can only be produced by patterns
903 // with explicit SCC, which are known to be enabled
904 // only for subtargets with S_CMP_LG_U64 present.
905 assert(ST.hasScalarCompareEq64());
906 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
907 .addReg(SrcReg, getKillRegState(KillSrc))
908 .addImm(0);
909 } else {
910 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
911 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
912 .addReg(SrcReg, getKillRegState(KillSrc))
913 .addImm(0);
914 }
915
916 return;
917 }
918
919 if (RC == &AMDGPU::AGPR_32RegClass) {
920 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
921 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
922 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
923 .addReg(SrcReg, getKillRegState(KillSrc));
924 return;
925 }
926
927 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
928 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
929 .addReg(SrcReg, getKillRegState(KillSrc));
930 return;
931 }
932
933 // FIXME: Pass should maintain scavenger to avoid scan through the block on
934 // every AGPR spill.
935 RegScavenger RS;
936 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
937 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
938 return;
939 }
940
941 if (Size == 16) {
942 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
943 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
944 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
945
946 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
947 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
948 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
949 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
950 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
951 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
952 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
953 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
954
955 if (IsSGPRDst) {
956 if (!IsSGPRSrc) {
957 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
958 return;
959 }
960
961 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
962 .addReg(NewSrcReg, getKillRegState(KillSrc));
963 return;
964 }
965
966 if (IsAGPRDst || IsAGPRSrc) {
967 if (!DstLow || !SrcLow) {
968 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
969 "Cannot use hi16 subreg with an AGPR!");
970 }
971
972 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
973 return;
974 }
975
976 if (ST.useRealTrue16Insts()) {
977 if (IsSGPRSrc) {
978 assert(SrcLow);
979 SrcReg = NewSrcReg;
980 }
981 // Use the smaller instruction encoding if possible.
982 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
983 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
984 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
985 .addReg(SrcReg);
986 } else {
987 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
988 .addImm(0) // src0_modifiers
989 .addReg(SrcReg)
990 .addImm(0); // op_sel
991 }
992 return;
993 }
994
995 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
996 if (!DstLow || !SrcLow) {
997 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
998 "Cannot use hi16 subreg on VI!");
999 }
1000
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1002 .addReg(NewSrcReg, getKillRegState(KillSrc));
1003 return;
1004 }
1005
1006 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1007 .addImm(0) // src0_modifiers
1008 .addReg(NewSrcReg)
1009 .addImm(0) // clamp
1016 // First implicit operand is $exec.
1017 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1018 return;
1019 }
1020
1021 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1022 if (ST.hasVMovB64Inst()) {
1023 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1024 .addReg(SrcReg, getKillRegState(KillSrc));
1025 return;
1026 }
1027 if (ST.hasPkMovB32()) {
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1030 .addReg(SrcReg)
1032 .addReg(SrcReg)
1033 .addImm(0) // op_sel_lo
1034 .addImm(0) // op_sel_hi
1035 .addImm(0) // neg_lo
1036 .addImm(0) // neg_hi
1037 .addImm(0) // clamp
1038 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1039 return;
1040 }
1041 }
1042
1043 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1044 if (RI.isSGPRClass(RC)) {
1045 if (!RI.isSGPRClass(SrcRC)) {
1046 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1047 return;
1048 }
1049 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1050 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1051 Forward);
1052 return;
1053 }
1054
1055 unsigned EltSize = 4;
1056 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1057 if (RI.isAGPRClass(RC)) {
1058 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1059 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1060 else if (RI.hasVGPRs(SrcRC) ||
1061 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1062 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1063 else
1064 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1065 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1066 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1067 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1068 (RI.isProperlyAlignedRC(*RC) &&
1069 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1070 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1071 if (ST.hasVMovB64Inst()) {
1072 Opcode = AMDGPU::V_MOV_B64_e32;
1073 EltSize = 8;
1074 } else if (ST.hasPkMovB32()) {
1075 Opcode = AMDGPU::V_PK_MOV_B32;
1076 EltSize = 8;
1077 }
1078 }
1079
1080 // For the cases where we need an intermediate instruction/temporary register
1081 // (destination is an AGPR), we need a scavenger.
1082 //
1083 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1084 // whole block for every handled copy.
1085 std::unique_ptr<RegScavenger> RS;
1086 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1087 RS = std::make_unique<RegScavenger>();
1088
1089 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1090
1091 // If there is an overlap, we can't kill the super-register on the last
1092 // instruction, since it will also kill the components made live by this def.
1093 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1094 const bool CanKillSuperReg = KillSrc && !Overlap;
1095
1096 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1097 unsigned SubIdx;
1098 if (Forward)
1099 SubIdx = SubIndices[Idx];
1100 else
1101 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1102 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1103 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1104 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1105
1106 bool IsFirstSubreg = Idx == 0;
1107 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1108
1109 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1110 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1111 Register ImpUseSuper = SrcReg;
1112 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1113 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1114 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1116 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1118 .addReg(SrcSubReg)
1120 .addReg(SrcSubReg)
1121 .addImm(0) // op_sel_lo
1122 .addImm(0) // op_sel_hi
1123 .addImm(0) // neg_lo
1124 .addImm(0) // neg_hi
1125 .addImm(0) // clamp
1126 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1127 if (IsFirstSubreg)
1129 } else {
1130 MachineInstrBuilder Builder =
1131 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1132 if (IsFirstSubreg)
1133 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1134
1135 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1136 }
1137 }
1138}
1139
1140int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1141 int32_t NewOpc;
1142
1143 // Try to map original to commuted opcode
1144 NewOpc = AMDGPU::getCommuteRev(Opcode);
1145 if (NewOpc != -1)
1146 // Check if the commuted (REV) opcode exists on the target.
1147 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1148
1149 // Try to map commuted to original opcode
1150 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1151 if (NewOpc != -1)
1152 // Check if the original (non-REV) opcode exists on the target.
1153 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1154
1155 return Opcode;
1156}
1157
// NOTE(review): the opening line of this definition (return type, function
// name and the declaration of `MI`) is absent from this listing -- the
// embedded numbering jumps from 1157 to 1159. Restore it before building.
//
// Visible behavior: for the opcodes listed below, when the source operand is
// an immediate, the (possibly transformed) constant is written to ImmVal and
// the result is whether operand 0 is Reg. Every other case returns false.
// ImmVal is assigned before the register comparison, so it can be modified
// even when the result is false.
1159 const Register Reg,
1160 int64_t &ImmVal) const {
1161 switch (MI.getOpcode()) {
// Plain moves: the immediate in operand 1 is the value, unmodified.
1162 case AMDGPU::V_MOV_B32_e32:
1163 case AMDGPU::S_MOV_B32:
1164 case AMDGPU::S_MOVK_I32:
1165 case AMDGPU::S_MOV_B64:
1166 case AMDGPU::V_MOV_B64_e32:
1167 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1168 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1169 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1170 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1171 case AMDGPU::V_MOV_B64_PSEUDO:
1172 case AMDGPU::V_MOV_B16_t16_e32: {
1173 const MachineOperand &Src0 = MI.getOperand(1);
1174 if (Src0.isImm()) {
1175 ImmVal = Src0.getImm();
1176 return MI.getOperand(0).getReg() == Reg;
1177 }
1178
1179 return false;
1180 }
// The e64 form keeps its source in operand 2; operand 1 is an immediate that
// must be zero for the constant to be taken (it is the src0_modifiers slot in
// the operand order used where this opcode is built elsewhere in this file).
1181 case AMDGPU::V_MOV_B16_t16_e64: {
1182 const MachineOperand &Src0 = MI.getOperand(2);
1183 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1184 ImmVal = Src0.getImm();
1185 return MI.getOperand(0).getReg() == Reg;
1186 }
1187
1188 return false;
1189 }
// Bit-reverse: the immediate is reversed as a 32-bit value, then widened to
// int64_t from the signed 32-bit result.
1190 case AMDGPU::S_BREV_B32:
1191 case AMDGPU::V_BFREV_B32_e32:
1192 case AMDGPU::V_BFREV_B32_e64: {
1193 const MachineOperand &Src0 = MI.getOperand(1);
1194 if (Src0.isImm()) {
1195 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1196 return MI.getOperand(0).getReg() == Reg;
1197 }
1198
1199 return false;
1200 }
// Bitwise NOT: the immediate is narrowed to int32_t, inverted, then widened
// to int64_t.
1201 case AMDGPU::S_NOT_B32:
1202 case AMDGPU::V_NOT_B32_e32:
1203 case AMDGPU::V_NOT_B32_e64: {
1204 const MachineOperand &Src0 = MI.getOperand(1);
1205 if (Src0.isImm()) {
1206 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1207 return MI.getOperand(0).getReg() == Reg;
1208 }
1209
1210 return false;
1211 }
1212 default:
1213 return false;
1214 }
1215}
1216
// NOTE(review): the line carrying the function name and the declaration of
// `Op` is absent from this listing (embedded numbering jumps 1217 -> 1219).
//
// Visible behavior: yields Op's immediate when Op is an immediate. When Op is
// a virtual register whose defining instruction reports isMoveImmediate() and
// has an immediate in operand 1, yields that immediate narrowed to Op's
// subregister via extractSubregFromImm(). Otherwise yields std::nullopt.
1217std::optional<int64_t>
1219 if (Op.isImm())
1220 return Op.getImm();
1221
// Only virtual registers are traced back to a defining instruction.
1222 if (!Op.isReg() || !Op.getReg().isVirtual())
1223 return std::nullopt;
1224 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1225 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1226 if (Def && Def->isMoveImmediate()) {
1227 const MachineOperand &ImmSrc = Def->getOperand(1);
1228 if (ImmSrc.isImm())
1229 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1230 }
1231
1232 return std::nullopt;
1233}
1234
1236
// NOTE(review): the signature line of this definition (which declares
// `DstRC`) is absent from this listing (embedded line 1235 is missing).
//
// Visible behavior: picks a move opcode from the destination register class.
// AGPR classes and every size not handled below fall back to COPY.
1237 if (RI.isAGPRClass(DstRC))
1238 return AMDGPU::COPY;
1239 if (RI.getRegSizeInBits(*DstRC) == 16) {
1240 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1241 // before RA.
1242 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1243 }
1244 if (RI.getRegSizeInBits(*DstRC) == 32)
1245 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
// 64-bit destinations: scalar classes use S_MOV_B64, all others the
// V_MOV_B64_PSEUDO.
1246 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1247 return AMDGPU::S_MOV_B64;
1248 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1249 return AMDGPU::V_MOV_B64_PSEUDO;
1250 return AMDGPU::COPY;
1251}
1252
// NOTE(review): the line carrying the function name and the declaration of
// `VecSize` is absent from this listing (embedded numbering jumps
// 1253 -> 1255).
//
// Visible behavior: returns the descriptor of the GPR-index indirect register
// pseudo that fits VecSize (in bits; the byte figures are in the trailing
// comments). IsIndirectSrc selects the READ family, otherwise the WRITE
// family is used. The first threshold that VecSize does not exceed wins, so
// the thresholds must stay in increasing order. Sizes above 1024 hit
// llvm_unreachable.
1253const MCInstrDesc &
1255 bool IsIndirectSrc) const {
1256 if (IsIndirectSrc) {
1257 if (VecSize <= 32) // 4 bytes
1258 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1259 if (VecSize <= 64) // 8 bytes
1260 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1261 if (VecSize <= 96) // 12 bytes
1262 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1263 if (VecSize <= 128) // 16 bytes
1264 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1265 if (VecSize <= 160) // 20 bytes
1266 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1267 if (VecSize <= 192) // 24 bytes
1268 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1269 if (VecSize <= 224) // 28 bytes
1270 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1271 if (VecSize <= 256) // 32 bytes
1272 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1273 if (VecSize <= 288) // 36 bytes
1274 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1275 if (VecSize <= 320) // 40 bytes
1276 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1277 if (VecSize <= 352) // 44 bytes
1278 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1279 if (VecSize <= 384) // 48 bytes
1280 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1281 if (VecSize <= 512) // 64 bytes
1282 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1283 if (VecSize <= 1024) // 128 bytes
1284 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1285
1286 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1287 }
1288
// Write family: same thresholds as the read family above.
1289 if (VecSize <= 32) // 4 bytes
1290 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1291 if (VecSize <= 64) // 8 bytes
1292 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1293 if (VecSize <= 96) // 12 bytes
1294 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1295 if (VecSize <= 128) // 16 bytes
1296 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1297 if (VecSize <= 160) // 20 bytes
1298 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1299 if (VecSize <= 192) // 24 bytes
1300 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1301 if (VecSize <= 224) // 28 bytes
1302 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1303 if (VecSize <= 256) // 32 bytes
1304 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1305 if (VecSize <= 288) // 36 bytes
1306 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1307 if (VecSize <= 320) // 40 bytes
1308 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1309 if (VecSize <= 352) // 44 bytes
1310 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1311 if (VecSize <= 384) // 48 bytes
1312 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1313 if (VecSize <= 512) // 64 bytes
1314 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1315 if (VecSize <= 1024) // 128 bytes
1316 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1317
1318 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1319}
1320
1321static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1322 if (VecSize <= 32) // 4 bytes
1323 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1324 if (VecSize <= 64) // 8 bytes
1325 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1326 if (VecSize <= 96) // 12 bytes
1327 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1328 if (VecSize <= 128) // 16 bytes
1329 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1330 if (VecSize <= 160) // 20 bytes
1331 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1332 if (VecSize <= 192) // 24 bytes
1333 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1334 if (VecSize <= 224) // 28 bytes
1335 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1336 if (VecSize <= 256) // 32 bytes
1337 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1338 if (VecSize <= 288) // 36 bytes
1339 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1340 if (VecSize <= 320) // 40 bytes
1341 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1342 if (VecSize <= 352) // 44 bytes
1343 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1344 if (VecSize <= 384) // 48 bytes
1345 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1346 if (VecSize <= 512) // 64 bytes
1347 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1348 if (VecSize <= 1024) // 128 bytes
1349 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1350
1351 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1352}
1353
1354static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1355 if (VecSize <= 32) // 4 bytes
1356 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1357 if (VecSize <= 64) // 8 bytes
1358 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1359 if (VecSize <= 96) // 12 bytes
1360 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1361 if (VecSize <= 128) // 16 bytes
1362 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1363 if (VecSize <= 160) // 20 bytes
1364 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1365 if (VecSize <= 192) // 24 bytes
1366 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1367 if (VecSize <= 224) // 28 bytes
1368 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1369 if (VecSize <= 256) // 32 bytes
1370 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1371 if (VecSize <= 288) // 36 bytes
1372 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1373 if (VecSize <= 320) // 40 bytes
1374 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1375 if (VecSize <= 352) // 44 bytes
1376 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1377 if (VecSize <= 384) // 48 bytes
1378 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1379 if (VecSize <= 512) // 64 bytes
1380 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1381 if (VecSize <= 1024) // 128 bytes
1382 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1383
1384 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1385}
1386
1387static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1388 if (VecSize <= 64) // 8 bytes
1389 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1390 if (VecSize <= 128) // 16 bytes
1391 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1392 if (VecSize <= 256) // 32 bytes
1393 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1394 if (VecSize <= 512) // 64 bytes
1395 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1396 if (VecSize <= 1024) // 128 bytes
1397 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1398
1399 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1400}
1401
1402const MCInstrDesc &
1403SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1404 bool IsSGPR) const {
1405 if (IsSGPR) {
1406 switch (EltSize) {
1407 case 32:
1408 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1409 case 64:
1410 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1411 default:
1412 llvm_unreachable("invalid reg indexing elt size");
1413 }
1414 }
1415
1416 assert(EltSize == 32 && "invalid reg indexing elt size");
1418}
1419
1420static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1421 switch (Size) {
1422 case 4:
1423 return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
1424 case 8:
1425 return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
1426 case 12:
1427 return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
1428 case 16:
1429 return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
1430 : AMDGPU::SI_SPILL_S128_SAVE;
1431 case 20:
1432 return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
1433 : AMDGPU::SI_SPILL_S160_SAVE;
1434 case 24:
1435 return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
1436 : AMDGPU::SI_SPILL_S192_SAVE;
1437 case 28:
1438 return NeedsCFI ? AMDGPU::SI_SPILL_S224_CFI_SAVE
1439 : AMDGPU::SI_SPILL_S224_SAVE;
1440 case 32:
1441 return AMDGPU::SI_SPILL_S256_SAVE;
1442 case 36:
1443 return AMDGPU::SI_SPILL_S288_SAVE;
1444 case 40:
1445 return AMDGPU::SI_SPILL_S320_SAVE;
1446 case 44:
1447 return AMDGPU::SI_SPILL_S352_SAVE;
1448 case 48:
1449 return AMDGPU::SI_SPILL_S384_SAVE;
1450 case 64:
1451 return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
1452 : AMDGPU::SI_SPILL_S512_SAVE;
1453 case 128:
1454 return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
1455 : AMDGPU::SI_SPILL_S1024_SAVE;
1456 default:
1457 llvm_unreachable("unknown register size");
1458 }
1459}
1460
1461static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1462 switch (Size) {
1463 case 2:
1464 return AMDGPU::SI_SPILL_V16_SAVE;
1465 case 4:
1466 return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
1467 case 8:
1468 return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
1469 case 12:
1470 return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
1471 case 16:
1472 return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
1473 : AMDGPU::SI_SPILL_V128_SAVE;
1474 case 20:
1475 return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
1476 : AMDGPU::SI_SPILL_V160_SAVE;
1477 case 24:
1478 return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
1479 : AMDGPU::SI_SPILL_V192_SAVE;
1480 case 28:
1481 return NeedsCFI ? AMDGPU::SI_SPILL_V224_CFI_SAVE
1482 : AMDGPU::SI_SPILL_V224_SAVE;
1483 case 32:
1484 return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
1485 : AMDGPU::SI_SPILL_V256_SAVE;
1486 case 36:
1487 return NeedsCFI ? AMDGPU::SI_SPILL_V288_CFI_SAVE
1488 : AMDGPU::SI_SPILL_V288_SAVE;
1489 case 40:
1490 return NeedsCFI ? AMDGPU::SI_SPILL_V320_CFI_SAVE
1491 : AMDGPU::SI_SPILL_V320_SAVE;
1492 case 44:
1493 return NeedsCFI ? AMDGPU::SI_SPILL_V352_CFI_SAVE
1494 : AMDGPU::SI_SPILL_V352_SAVE;
1495 case 48:
1496 return NeedsCFI ? AMDGPU::SI_SPILL_V384_CFI_SAVE
1497 : AMDGPU::SI_SPILL_V384_SAVE;
1498 case 64:
1499 return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
1500 : AMDGPU::SI_SPILL_V512_SAVE;
1501 case 128:
1502 return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
1503 : AMDGPU::SI_SPILL_V1024_SAVE;
1504 default:
1505 llvm_unreachable("unknown register size");
1506 }
1507}
1508
1509static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1510 switch (Size) {
1511 case 4:
1512 return NeedsCFI ? AMDGPU::SI_SPILL_AV32_CFI_SAVE
1513 : AMDGPU::SI_SPILL_AV32_SAVE;
1514 case 8:
1515 return NeedsCFI ? AMDGPU::SI_SPILL_AV64_CFI_SAVE
1516 : AMDGPU::SI_SPILL_AV64_SAVE;
1517 case 12:
1518 return NeedsCFI ? AMDGPU::SI_SPILL_AV96_CFI_SAVE
1519 : AMDGPU::SI_SPILL_AV96_SAVE;
1520 case 16:
1521 return NeedsCFI ? AMDGPU::SI_SPILL_AV128_CFI_SAVE
1522 : AMDGPU::SI_SPILL_AV128_SAVE;
1523 case 20:
1524 return NeedsCFI ? AMDGPU::SI_SPILL_AV160_CFI_SAVE
1525 : AMDGPU::SI_SPILL_AV160_SAVE;
1526 case 24:
1527 return NeedsCFI ? AMDGPU::SI_SPILL_AV192_CFI_SAVE
1528 : AMDGPU::SI_SPILL_AV192_SAVE;
1529 case 28:
1530 return NeedsCFI ? AMDGPU::SI_SPILL_AV224_CFI_SAVE
1531 : AMDGPU::SI_SPILL_AV224_SAVE;
1532 case 32:
1533 return NeedsCFI ? AMDGPU::SI_SPILL_AV256_CFI_SAVE
1534 : AMDGPU::SI_SPILL_AV256_SAVE;
1535 case 36:
1536 return AMDGPU::SI_SPILL_AV288_SAVE;
1537 case 40:
1538 return AMDGPU::SI_SPILL_AV320_SAVE;
1539 case 44:
1540 return AMDGPU::SI_SPILL_AV352_SAVE;
1541 case 48:
1542 return AMDGPU::SI_SPILL_AV384_SAVE;
1543 case 64:
1544 return NeedsCFI ? AMDGPU::SI_SPILL_AV512_CFI_SAVE
1545 : AMDGPU::SI_SPILL_AV512_SAVE;
1546 case 128:
1547 return NeedsCFI ? AMDGPU::SI_SPILL_AV1024_CFI_SAVE
1548 : AMDGPU::SI_SPILL_AV1024_SAVE;
1549 default:
1550 llvm_unreachable("unknown register size");
1551 }
1552}
1553
1554static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1555 bool IsVectorSuperClass) {
1556 // Currently, there is only 32-bit WWM register spills needed.
1557 if (Size != 4)
1558 llvm_unreachable("unknown wwm register spill size");
1559
1560 if (IsVectorSuperClass)
1561 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1562
1563 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1564}
1565
// NOTE(review): the line carrying the return type and function name is absent
// from this listing (embedded line 1566 is missing).
//
// Visible behavior: chooses the spill-save pseudo for a vector register of
// Size bytes from one of three helpers (WWM, AV, plain VGPR).
1567 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1568 const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1569 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1570
1571 // Choose the right opcode if spilling a WWM register.
// NOTE(review): embedded line 1572 is missing; presumably it held the
// condition guarding this return (Reg and MFI are otherwise unused). As
// listed, the return is unconditional and the code below is unreachable.
1573 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1574
1575 // TODO: Check if AGPRs are available
1576 if (ST.hasMAIInsts())
1577 return getAVSpillSaveOpcode(Size, NeedsCFI);
1578
1579 return getVGPRSpillSaveOpcode(Size, NeedsCFI);
1580}
1581
// Emits one spill pseudo before MI that stores SrcReg to stack slot
// FrameIndex. SGPR classes use the SGPR spill-save pseudo for the spill size;
// all other classes use the opcode chosen by getVectorRegSpillSaveOpcode().
//
// NOTE(review): several lines are absent from this listing, judging by gaps
// in the embedded numbering: 1583 (the parameters declaring MBB, MI and
// SrcReg), 1587 (presumably the declaration of MFI), 1593 (the start of the
// statement that creates MMO) and 1621 (the end of the SGPR BuildMI chain).
// Flags is not referenced in the visible lines.
1582void SIInstrInfo::storeRegToStackSlotImpl(
1584 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1585 MachineInstr::MIFlag Flags, bool NeedsCFI) const {
1586 MachineFunction *MF = MBB.getParent();
1588 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1589 const DebugLoc &DL = MBB.findDebugLoc(MI);
1590
1591 MachinePointerInfo PtrInfo
1592 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
// Arguments of the memory-operand construction: a store of the frame object's
// full size and alignment.
1594 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1595 FrameInfo.getObjectAlign(FrameIndex));
1596 unsigned SpillSize = RI.getSpillSize(*RC);
1597
1598 MachineRegisterInfo &MRI = MF->getRegInfo();
1599 if (RI.isSGPRClass(RC)) {
1600 if (FrameInfo.getStackID(FrameIndex) == TargetStackID::SGPRSpill)
1601 MFI->setHasSpilledSGPRs();
1602 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1603 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1604 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1605
1606 // We are only allowed to create one new instruction when spilling
1607 // registers, so we need to use pseudo instruction for spilling SGPRs.
1608 const MCInstrDesc &OpDesc =
1609 get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
1610
1611 // The SGPR spill/restore instructions only work on number sgprs, so we need
1612 // to make sure we are using the correct register class.
1613 if (SrcReg.isVirtual() && SpillSize == 4) {
1614 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1615 }
1616
1617 BuildMI(MBB, MI, DL, OpDesc)
1618 .addReg(SrcReg, getKillRegState(isKill)) // data
1619 .addFrameIndex(FrameIndex) // addr
1620 .addMemOperand(MMO)
1622
1623 return;
1624 }
1625
// Vector path: when VReg is set it, rather than SrcReg, is the register
// passed to the opcode selection.
1626 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1627 SpillSize, *MFI, NeedsCFI);
1628 MFI->setHasSpilledVGPRs();
1629
1630 BuildMI(MBB, MI, DL, get(Opcode))
1631 .addReg(SrcReg, getKillRegState(isKill)) // data
1632 .addFrameIndex(FrameIndex) // addr
1633 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1634 .addImm(0) // offset
1635 .addMemOperand(MMO);
1636}
1637
// NOTE(review): the opening lines of this definition (function name and the
// parameters declaring MBB, MI and SrcReg) are absent from this listing.
//
// Visible behavior: forwards every argument to storeRegToStackSlotImpl() with
// its final argument (NeedsCFI) set to false.
1640 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1641 MachineInstr::MIFlag Flags) const {
1642 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg, Flags,
1643 false);
1644}
1645
// NOTE(review): the opening lines of this definition (function name and the
// parameters declaring MBB and MI) are absent from this listing.
//
// Visible behavior: calls storeRegToStackSlotImpl() with no virtual register,
// MachineInstr::NoFlags, and its final argument (NeedsCFI) set to true.
1648 Register SrcReg, bool isKill,
1649 int FrameIndex,
1650 const TargetRegisterClass *RC) const {
1651 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, Register(),
1652 MachineInstr::NoFlags, true);
1653}
1654
1655static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_S32_RESTORE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_S64_RESTORE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_S96_RESTORE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_S128_RESTORE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_S160_RESTORE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_S192_RESTORE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_S224_RESTORE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_S256_RESTORE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_S288_RESTORE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_S320_RESTORE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_S352_RESTORE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_S384_RESTORE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_S512_RESTORE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_S1024_RESTORE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1691 switch (Size) {
1692 case 2:
1693 return AMDGPU::SI_SPILL_V16_RESTORE;
1694 case 4:
1695 return AMDGPU::SI_SPILL_V32_RESTORE;
1696 case 8:
1697 return AMDGPU::SI_SPILL_V64_RESTORE;
1698 case 12:
1699 return AMDGPU::SI_SPILL_V96_RESTORE;
1700 case 16:
1701 return AMDGPU::SI_SPILL_V128_RESTORE;
1702 case 20:
1703 return AMDGPU::SI_SPILL_V160_RESTORE;
1704 case 24:
1705 return AMDGPU::SI_SPILL_V192_RESTORE;
1706 case 28:
1707 return AMDGPU::SI_SPILL_V224_RESTORE;
1708 case 32:
1709 return AMDGPU::SI_SPILL_V256_RESTORE;
1710 case 36:
1711 return AMDGPU::SI_SPILL_V288_RESTORE;
1712 case 40:
1713 return AMDGPU::SI_SPILL_V320_RESTORE;
1714 case 44:
1715 return AMDGPU::SI_SPILL_V352_RESTORE;
1716 case 48:
1717 return AMDGPU::SI_SPILL_V384_RESTORE;
1718 case 64:
1719 return AMDGPU::SI_SPILL_V512_RESTORE;
1720 case 128:
1721 return AMDGPU::SI_SPILL_V1024_RESTORE;
1722 default:
1723 llvm_unreachable("unknown register size");
1724 }
1725}
1726
1727static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1728 switch (Size) {
1729 case 4:
1730 return AMDGPU::SI_SPILL_AV32_RESTORE;
1731 case 8:
1732 return AMDGPU::SI_SPILL_AV64_RESTORE;
1733 case 12:
1734 return AMDGPU::SI_SPILL_AV96_RESTORE;
1735 case 16:
1736 return AMDGPU::SI_SPILL_AV128_RESTORE;
1737 case 20:
1738 return AMDGPU::SI_SPILL_AV160_RESTORE;
1739 case 24:
1740 return AMDGPU::SI_SPILL_AV192_RESTORE;
1741 case 28:
1742 return AMDGPU::SI_SPILL_AV224_RESTORE;
1743 case 32:
1744 return AMDGPU::SI_SPILL_AV256_RESTORE;
1745 case 36:
1746 return AMDGPU::SI_SPILL_AV288_RESTORE;
1747 case 40:
1748 return AMDGPU::SI_SPILL_AV320_RESTORE;
1749 case 44:
1750 return AMDGPU::SI_SPILL_AV352_RESTORE;
1751 case 48:
1752 return AMDGPU::SI_SPILL_AV384_RESTORE;
1753 case 64:
1754 return AMDGPU::SI_SPILL_AV512_RESTORE;
1755 case 128:
1756 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1757 default:
1758 llvm_unreachable("unknown register size");
1759 }
1760}
1761
1762static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1763 bool IsVectorSuperClass) {
1764 // Currently, there is only 32-bit WWM register spills needed.
1765 if (Size != 4)
1766 llvm_unreachable("unknown wwm register spill size");
1767
1768 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1769 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1770
1771 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1772}
1773
// NOTE(review): this definition is badly truncated in the listing. Judging by
// gaps in the embedded numbering, the following are absent: 1774 (return type
// and function name), 1780 (presumably the condition guarding the WWM
// return), and 1785 and 1788 (presumably the return statements of the
// remaining two paths). As listed, the WWM return is unconditional and the
// `if` below has the assert as its body.
1775 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1776 const SIMachineFunctionInfo &MFI) const {
1777 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1778
1779 // Choose the right opcode if restoring a WWM register.
1781 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1782
1783 // TODO: Check if AGPRs are available
1784 if (ST.hasMAIInsts())
1786
1787 assert(!RI.isAGPRClass(RC));
1789}
1790
// Emits one restore pseudo before MI that reloads DestReg from stack slot
// FrameIndex. SGPR classes use the SGPR spill-restore pseudo for the spill
// size; all other classes use the opcode chosen by
// getVectorRegSpillRestoreOpcode().
//
// NOTE(review): several lines are absent from this listing, judging by gaps
// in the embedded numbering: 1791-1792 (function name and the parameters
// declaring MBB and MI), 1798 (presumably the declaration of MFI), 1806 (the
// start of the statement that creates MMO) and 1828 (the end of the SGPR
// BuildMI chain). SubReg and Flags are not referenced in the visible lines.
1793 Register DestReg, int FrameIndex,
1794 const TargetRegisterClass *RC,
1795 Register VReg, unsigned SubReg,
1796 MachineInstr::MIFlag Flags) const {
1797 MachineFunction *MF = MBB.getParent();
1799 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1800 const DebugLoc &DL = MBB.findDebugLoc(MI);
1801 unsigned SpillSize = RI.getSpillSize(*RC);
1802
1803 MachinePointerInfo PtrInfo
1804 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1805
// Arguments of the memory-operand construction: a load of the frame object's
// full size and alignment.
1807 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1808 FrameInfo.getObjectAlign(FrameIndex));
1809
1810 if (RI.isSGPRClass(RC)) {
1811 if (FrameInfo.getStackID(FrameIndex) == TargetStackID::SGPRSpill)
1812 MFI->setHasSpilledSGPRs();
1813 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1814 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1815 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1816
1817 // FIXME: Maybe this should not include a memoperand because it will be
1818 // lowered to non-memory instructions.
1819 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
// Virtual 32-bit destinations are constrained to SReg_32_XM0_XEXEC before the
// restore is built.
1820 if (DestReg.isVirtual() && SpillSize == 4) {
1821 MachineRegisterInfo &MRI = MF->getRegInfo();
1822 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1823 }
1824
1825 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1826 .addFrameIndex(FrameIndex) // addr
1827 .addMemOperand(MMO)
1829
1830 return;
1831 }
1832
// Vector path: when VReg is set it, rather than DestReg, is the register
// passed to the opcode selection.
1833 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1834 SpillSize, *MFI);
1835 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1836 .addFrameIndex(FrameIndex) // vaddr
1837 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1838 .addImm(0) // offset
1839 .addMemOperand(MMO);
1840}
1841
1846
// NOTE(review): the opening lines of this definition (function name and the
// parameters declaring MBB and MI) are absent from this listing.
//
// Visible behavior: inserts S_NOP instructions before MI until Quantity is
// used up. Each S_NOP accounts for at most 1 << ST.getSNopBits() of the
// quantity, and its immediate is the chunk size minus one.
1849 unsigned Quantity) const {
1850 DebugLoc DL = MBB.findDebugLoc(MI);
1851 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1852 while (Quantity > 0) {
1853 unsigned Arg = std::min(Quantity, MaxSNopCount);
1854 Quantity -= Arg;
1855 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1856 }
1857}
1858
// NOTE(review): several lines are absent from this listing, judging by gaps
// in the embedded numbering: 1859-1861 (return type, function name and the
// parameters declaring MRI, MBB and MI), 1884 (the operand and terminator of
// the S_SENDMSG_RTN_B32 instruction) and 1900 (the operand and terminator of
// the S_SENDMSG instruction).
//
// Visible behavior: builds a trap sequence in TrapBB followed by a branch to
// a new self-looping block that executes S_SETHALT. TrapBB is MBB itself
// unless MBB has successors or MI is not its last instruction; in that case
// MBB is split at MI and a separate TrapBB is reached via S_CBRANCH_EXECNZ.
// Returns MBB.getNextNode().
1862 const DebugLoc &DL) const {
1863 MachineFunction *MF = MBB.getParent();
1864 constexpr unsigned DoorbellIDMask = 0x3ff;
1865 constexpr unsigned ECQueueWaveAbort = 0x400;
1866
1867 MachineBasicBlock *TrapBB = &MBB;
1868 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1869
1870 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1871 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1872 TrapBB = MF->CreateMachineBasicBlock();
1873 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1874 MF->push_back(TrapBB);
1875 MBB.addSuccessor(TrapBB);
1876 }
1877 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
1878 // will be a nop.
1879 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1880 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1881 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1882 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1883 DoorbellReg)
// M0 is copied to TTMP2 here and copied back after the S_SENDMSG below.
1885 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1886 .addUse(AMDGPU::M0);
// The value placed in M0 is (DoorbellReg & DoorbellIDMask) | ECQueueWaveAbort.
1887 Register DoorbellRegMasked =
1888 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1889 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1890 .addUse(DoorbellReg)
1891 .addImm(DoorbellIDMask);
1892 Register SetWaveAbortBit =
1893 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1894 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1895 .addUse(DoorbellRegMasked)
1896 .addImm(ECQueueWaveAbort);
1897 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1898 .addUse(SetWaveAbortBit);
1899 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1901 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1902 .addUse(AMDGPU::TTMP2);
1903 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1904 TrapBB->addSuccessor(HaltLoopBB);
1905
// The halt block branches to itself and is its own successor.
1906 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1907 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
1908 .addMBB(HaltLoopBB);
1909 MF->push_back(HaltLoopBB);
1910 HaltLoopBB->addSuccessor(HaltLoopBB);
1911
1912 return MBB.getNextNode();
1913}
1914
1916 switch (MI.getOpcode()) {
1917 default:
1918 if (MI.isMetaInstruction())
1919 return 0;
1920 return 1; // FIXME: Do wait states equal cycles?
1921
1922 case AMDGPU::S_NOP:
1923 return MI.getOperand(0).getImm() + 1;
1924 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
1925 // hazard, even if one exists, won't really be visible. Should we handle it?
1926 }
1927}
1928
1930 MachineBasicBlock &MBB = *MI.getParent();
1931 DebugLoc DL = MBB.findDebugLoc(MI);
1933 switch (MI.getOpcode()) {
1934 default: return TargetInstrInfo::expandPostRAPseudo(MI);
1935 case AMDGPU::S_MOV_B64_term:
1936 // This is only a terminator to get the correct spill code placement during
1937 // register allocation.
1938 MI.setDesc(get(AMDGPU::S_MOV_B64));
1939 break;
1940
1941 case AMDGPU::S_MOV_B32_term:
1942 // This is only a terminator to get the correct spill code placement during
1943 // register allocation.
1944 MI.setDesc(get(AMDGPU::S_MOV_B32));
1945 break;
1946
1947 case AMDGPU::S_XOR_B64_term:
1948 // This is only a terminator to get the correct spill code placement during
1949 // register allocation.
1950 MI.setDesc(get(AMDGPU::S_XOR_B64));
1951 break;
1952
1953 case AMDGPU::S_XOR_B32_term:
1954 // This is only a terminator to get the correct spill code placement during
1955 // register allocation.
1956 MI.setDesc(get(AMDGPU::S_XOR_B32));
1957 break;
1958 case AMDGPU::S_OR_B64_term:
1959 // This is only a terminator to get the correct spill code placement during
1960 // register allocation.
1961 MI.setDesc(get(AMDGPU::S_OR_B64));
1962 break;
1963 case AMDGPU::S_OR_B32_term:
1964 // This is only a terminator to get the correct spill code placement during
1965 // register allocation.
1966 MI.setDesc(get(AMDGPU::S_OR_B32));
1967 break;
1968
1969 case AMDGPU::S_ANDN2_B64_term:
1970 // This is only a terminator to get the correct spill code placement during
1971 // register allocation.
1972 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1973 break;
1974
1975 case AMDGPU::S_ANDN2_B32_term:
1976 // This is only a terminator to get the correct spill code placement during
1977 // register allocation.
1978 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
1979 break;
1980
1981 case AMDGPU::S_AND_B64_term:
1982 // This is only a terminator to get the correct spill code placement during
1983 // register allocation.
1984 MI.setDesc(get(AMDGPU::S_AND_B64));
1985 break;
1986
1987 case AMDGPU::S_AND_B32_term:
1988 // This is only a terminator to get the correct spill code placement during
1989 // register allocation.
1990 MI.setDesc(get(AMDGPU::S_AND_B32));
1991 break;
1992
1993 case AMDGPU::S_AND_SAVEEXEC_B64_term:
1994 // This is only a terminator to get the correct spill code placement during
1995 // register allocation.
1996 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
1997 break;
1998
1999 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2000 // This is only a terminator to get the correct spill code placement during
2001 // register allocation.
2002 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2003 break;
2004
2005 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2006 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2007 break;
2008
2009 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2010 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2011 break;
2012 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2013 Register Dst = MI.getOperand(0).getReg();
2014 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2015 MI.setDesc(
2016 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2017 break;
2018 }
2019 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2020 Register Dst = MI.getOperand(0).getReg();
2021 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2022 int64_t Imm = MI.getOperand(1).getImm();
2023
2024 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2025 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2026 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2027 .addImm(SignExtend64<32>(Imm));
2028 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2029 .addImm(SignExtend64<32>(Imm >> 32));
2030 MI.eraseFromParent();
2031 break;
2032 }
2033
2034 [[fallthrough]];
2035 }
2036 case AMDGPU::V_MOV_B64_PSEUDO: {
2037 Register Dst = MI.getOperand(0).getReg();
2038 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2039 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2040
2041 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2042 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2043
2044 const MachineOperand &SrcOp = MI.getOperand(1);
2045 // FIXME: Will this work for 64-bit floating point immediates?
2046 assert(!SrcOp.isFPImm());
2047 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2048 MI.setDesc(Mov64Desc);
2049 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2050 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2051 break;
2052 }
2053 if (SrcOp.isImm()) {
2054 APInt Imm(64, SrcOp.getImm());
2055 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2056 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2057 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2058 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2059
2060 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2061 PkMovRC->contains(Dst)) {
2062 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2064 .addImm(Lo.getSExtValue())
2066 .addImm(Lo.getSExtValue())
2067 .addImm(0) // op_sel_lo
2068 .addImm(0) // op_sel_hi
2069 .addImm(0) // neg_lo
2070 .addImm(0) // neg_hi
2071 .addImm(0); // clamp
2072 } else {
2073 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2074 .addImm(Lo.getSExtValue());
2075 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2076 .addImm(Hi.getSExtValue());
2077 }
2078 } else {
2079 assert(SrcOp.isReg());
2080 if (ST.hasPkMovB32() &&
2081 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2082 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2083 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2084 .addReg(SrcOp.getReg())
2086 .addReg(SrcOp.getReg())
2087 .addImm(0) // op_sel_lo
2088 .addImm(0) // op_sel_hi
2089 .addImm(0) // neg_lo
2090 .addImm(0) // neg_hi
2091 .addImm(0); // clamp
2092 } else {
2093 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2094 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2095 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2096 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2097 }
2098 }
2099 MI.eraseFromParent();
2100 break;
2101 }
2102 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2104 break;
2105 }
2106 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2107 const MachineOperand &SrcOp = MI.getOperand(1);
2108 assert(!SrcOp.isFPImm());
2109
2110 if (ST.has64BitLiterals()) {
2111 MI.setDesc(get(AMDGPU::S_MOV_B64));
2112 break;
2113 }
2114
2115 APInt Imm(64, SrcOp.getImm());
2116 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2117 MI.setDesc(get(AMDGPU::S_MOV_B64));
2118 break;
2119 }
2120
2121 Register Dst = MI.getOperand(0).getReg();
2122 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2123 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2124
2125 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2126 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2127 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2128 .addImm(Lo.getSExtValue());
2129 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2130 .addImm(Hi.getSExtValue());
2131 MI.eraseFromParent();
2132 break;
2133 }
2134 case AMDGPU::V_SET_INACTIVE_B32: {
2135 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2136 Register DstReg = MI.getOperand(0).getReg();
2137 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2138 .add(MI.getOperand(3))
2139 .add(MI.getOperand(4))
2140 .add(MI.getOperand(1))
2141 .add(MI.getOperand(2))
2142 .add(MI.getOperand(5));
2143 MI.eraseFromParent();
2144 break;
2145 }
2146 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2147 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2148 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2149 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2150 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2151 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2152 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2153 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2154 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2155 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2156 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2157 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2158 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2159 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2160 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2161 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2162 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2163 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2164 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2165 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2166 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2167 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2168 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2169 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2170 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2171 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2172 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2173 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2174 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2175 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2176 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2177 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2178 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2179 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2180
2181 unsigned Opc;
2182 if (RI.hasVGPRs(EltRC)) {
2183 Opc = AMDGPU::V_MOVRELD_B32_e32;
2184 } else {
2185 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2186 : AMDGPU::S_MOVRELD_B32;
2187 }
2188
2189 const MCInstrDesc &OpDesc = get(Opc);
2190 Register VecReg = MI.getOperand(0).getReg();
2191 bool IsUndef = MI.getOperand(1).isUndef();
2192 unsigned SubReg = MI.getOperand(3).getImm();
2193 assert(VecReg == MI.getOperand(1).getReg());
2194
2196 BuildMI(MBB, MI, DL, OpDesc)
2197 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2198 .add(MI.getOperand(2))
2200 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2201
2202 const int ImpDefIdx =
2203 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2204 const int ImpUseIdx = ImpDefIdx + 1;
2205 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2206 MI.eraseFromParent();
2207 break;
2208 }
2209 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2210 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2211 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2212 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2213 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2214 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2215 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2216 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2217 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2218 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2219 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2220 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2221 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2222 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2223 assert(ST.useVGPRIndexMode());
2224 Register VecReg = MI.getOperand(0).getReg();
2225 bool IsUndef = MI.getOperand(1).isUndef();
2226 MachineOperand &Idx = MI.getOperand(3);
2227 Register SubReg = MI.getOperand(4).getImm();
2228
2229 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2230 .add(Idx)
2232 SetOn->getOperand(3).setIsUndef();
2233
2234 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2236 BuildMI(MBB, MI, DL, OpDesc)
2237 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2238 .add(MI.getOperand(2))
2240 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2241
2242 const int ImpDefIdx =
2243 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2244 const int ImpUseIdx = ImpDefIdx + 1;
2245 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2246
2247 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2248
2249 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2250
2251 MI.eraseFromParent();
2252 break;
2253 }
2254 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2255 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2256 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2257 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2258 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2259 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2260 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2261 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2262 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2263 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2264 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2265 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2266 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2267 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2268 assert(ST.useVGPRIndexMode());
2269 Register Dst = MI.getOperand(0).getReg();
2270 Register VecReg = MI.getOperand(1).getReg();
2271 bool IsUndef = MI.getOperand(1).isUndef();
2272 Register SubReg = MI.getOperand(3).getImm();
2273
2274 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2275 .add(MI.getOperand(2))
2277 SetOn->getOperand(3).setIsUndef();
2278
2279 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2280 .addDef(Dst)
2281 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2282 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2283
2284 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2285
2286 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2287
2288 MI.eraseFromParent();
2289 break;
2290 }
2291 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2292 MachineFunction &MF = *MBB.getParent();
2293 Register Reg = MI.getOperand(0).getReg();
2294 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2295 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2296 MachineOperand OpLo = MI.getOperand(1);
2297 MachineOperand OpHi = MI.getOperand(2);
2298
2299 // Create a bundle so these instructions won't be re-ordered by the
2300 // post-RA scheduler.
2301 MIBundleBuilder Bundler(MBB, MI);
2302 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2303
2304 // What we want here is an offset from the value returned by s_getpc (which
2305 // is the address of the s_add_u32 instruction) to the global variable, but
2306 // since the encoding of $symbol starts 4 bytes after the start of the
2307 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2308 // small. This requires us to add 4 to the global variable offset in order
2309 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2310 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2311 // instruction.
2312
2313 int64_t Adjust = 0;
2314 if (ST.hasGetPCZeroExtension()) {
2315 // Fix up hardware that does not sign-extend the 48-bit PC value by
2316 // inserting: s_sext_i32_i16 reghi, reghi
2317 Bundler.append(
2318 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2319 Adjust += 4;
2320 }
2321
2322 if (OpLo.isGlobal())
2323 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2324 Bundler.append(
2325 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2326
2327 if (OpHi.isGlobal())
2328 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2329 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2330 .addReg(RegHi)
2331 .add(OpHi));
2332
2333 finalizeBundle(MBB, Bundler.begin());
2334
2335 MI.eraseFromParent();
2336 break;
2337 }
2338 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2339 MachineFunction &MF = *MBB.getParent();
2340 Register Reg = MI.getOperand(0).getReg();
2341 MachineOperand Op = MI.getOperand(1);
2342
2343 // Create a bundle so these instructions won't be re-ordered by the
2344 // post-RA scheduler.
2345 MIBundleBuilder Bundler(MBB, MI);
2346 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2347 if (Op.isGlobal())
2348 Op.setOffset(Op.getOffset() + 4);
2349 Bundler.append(
2350 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2351
2352 finalizeBundle(MBB, Bundler.begin());
2353
2354 MI.eraseFromParent();
2355 break;
2356 }
2357 case AMDGPU::ENTER_STRICT_WWM: {
2358 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2359 // Whole Wave Mode is entered.
2360 MI.setDesc(get(LMC.OrSaveExecOpc));
2361 break;
2362 }
2363 case AMDGPU::ENTER_STRICT_WQM: {
2364 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2365 // STRICT_WQM is entered.
2366 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2367 .addReg(LMC.ExecReg);
2368 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2369
2370 MI.eraseFromParent();
2371 break;
2372 }
2373 case AMDGPU::EXIT_STRICT_WWM:
2374 case AMDGPU::EXIT_STRICT_WQM: {
2375 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2376 // WWM/STRICT_WQM is exited.
2377 MI.setDesc(get(LMC.MovOpc));
2378 break;
2379 }
2380 case AMDGPU::SI_RETURN: {
2381 const MachineFunction *MF = MBB.getParent();
2382 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2383 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2384 // Hiding the return address use with SI_RETURN may lead to extra kills in
2385 // the function and missing live-ins. We are fine in practice because callee
2386 // saved register handling ensures the register value is restored before
2387 // RET, but we need the undef flag here to appease the MachineVerifier
2388 // liveness checks.
2390 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2391 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2392
2393 MIB.copyImplicitOps(MI);
2394 MI.eraseFromParent();
2395 break;
2396 }
2397
2398 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2399 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2400 MI.setDesc(get(AMDGPU::S_MUL_U64));
2401 break;
2402
2403 case AMDGPU::S_GETPC_B64_pseudo:
2404 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2405 if (ST.hasGetPCZeroExtension()) {
2406 Register Dst = MI.getOperand(0).getReg();
2407 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2408 // Fix up hardware that does not sign-extend the 48-bit PC value by
2409 // inserting: s_sext_i32_i16 dsthi, dsthi
2410 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2411 DstHi)
2412 .addReg(DstHi);
2413 }
2414 break;
2415
2416 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2417 assert(ST.hasBF16PackedInsts());
2418 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2419 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2420 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2421 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2422 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2423 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2424 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2425 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2426 break;
2427 }
2428
2429 case AMDGPU::GET_STACK_BASE:
2430 // The stack starts at offset 0 unless we need to reserve some space at the
2431 // bottom.
2432 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2433 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2434 // some of the VGPRs. The size of the required scratch space has already
2435 // been computed by prolog epilog insertion.
2436 const SIMachineFunctionInfo *MFI =
2437 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2438 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2439 Register DestReg = MI.getOperand(0).getReg();
2440 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2443 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2444 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2445 // SCC, so we need to check for 0 manually.
2446 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2447 // Change the implicit-def of SCC to an explicit use (but first remove
2448 // the dead flag if present).
2449 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2450 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2451 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2452 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2453 } else {
2454 MI.setDesc(get(AMDGPU::S_MOV_B32));
2455 MI.addOperand(MachineOperand::CreateImm(0));
2456 MI.removeOperand(
2457 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2458 }
2459 break;
2460 }
2461
2462 return true;
2463}
2464
2467 unsigned SubIdx, const MachineInstr &Orig,
2468 LaneBitmask UsedLanes) const {
2469
2470 // Try shrinking the instruction to remat only the part needed for current
2471 // context.
2472 // TODO: Handle more cases.
2473 unsigned Opcode = Orig.getOpcode();
2474 switch (Opcode) {
2475 case AMDGPU::S_MOV_B64:
2476 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2477 if (SubIdx != 0)
2478 break;
2479
2480 if (!Orig.getOperand(1).isImm())
2481 break;
2482
2483 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2484 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2485 if (UsedLanes.all())
2486 break;
2487
2488 // Determine which half of the 64-bit immediate corresponds to the use.
2489 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2490 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2491 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2492
2493 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2494 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2495
2496 if (NeedLo && NeedHi)
2497 break;
2498
2499 int64_t Imm64 = Orig.getOperand(1).getImm();
2500 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2501
2502 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2503
2504 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2505 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2506 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2507 .addImm(Imm32);
2508 return;
2509 }
2510
2511 case AMDGPU::S_LOAD_DWORDX16_IMM:
2512 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2513 if (SubIdx != 0)
2514 break;
2515
2516 if (I == MBB.end())
2517 break;
2518
2519 if (I->isBundled())
2520 break;
2521
2522 // Look for a single use of the register that is also a subreg.
2523 Register RegToFind = Orig.getOperand(0).getReg();
2524 MachineOperand *UseMO = nullptr;
2525 for (auto &CandMO : I->operands()) {
2526 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2527 continue;
2528 if (UseMO) {
2529 UseMO = nullptr;
2530 break;
2531 }
2532 UseMO = &CandMO;
2533 }
2534 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2535 break;
2536
2537 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2538 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2539
2540 MachineFunction *MF = MBB.getParent();
2541 MachineRegisterInfo &MRI = MF->getRegInfo();
2542 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2543
2544 unsigned NewOpcode = -1;
2545 if (SubregSize == 256)
2546 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2547 else if (SubregSize == 128)
2548 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2549 else
2550 break;
2551
2552 const MCInstrDesc &TID = get(NewOpcode);
2553 const TargetRegisterClass *NewRC =
2554 RI.getAllocatableClass(getRegClass(TID, 0));
2555 MRI.setRegClass(DestReg, NewRC);
2556
2557 UseMO->setReg(DestReg);
2558 UseMO->setSubReg(AMDGPU::NoSubRegister);
2559
2560 // Use a smaller load with the desired size, possibly with updated offset.
2561 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2562 MI->setDesc(TID);
2563 MI->getOperand(0).setReg(DestReg);
2564 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2565 if (Offset) {
2566 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2567 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2568 OffsetMO->setImm(FinalOffset);
2569 }
2571 for (const MachineMemOperand *MemOp : Orig.memoperands())
2572 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2573 SubregSize / 8));
2574 MI->setMemRefs(*MF, NewMMOs);
2575
2576 MBB.insert(I, MI);
2577 return;
2578 }
2579
2580 default:
2581 break;
2582 }
2583
2584 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2585}
2586
2587std::pair<MachineInstr*, MachineInstr*>
2589 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2590
2591 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2593 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2594 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2595 return std::pair(&MI, nullptr);
2596 }
2597
2598 MachineBasicBlock &MBB = *MI.getParent();
2599 DebugLoc DL = MBB.findDebugLoc(MI);
2600 MachineFunction *MF = MBB.getParent();
2601 MachineRegisterInfo &MRI = MF->getRegInfo();
2602 Register Dst = MI.getOperand(0).getReg();
2603 unsigned Part = 0;
2604 MachineInstr *Split[2];
2605
2606 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2607 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2608 if (Dst.isPhysical()) {
2609 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2610 } else {
2611 assert(MRI.isSSA());
2612 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2613 MovDPP.addDef(Tmp);
2614 }
2615
2616 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2617 const MachineOperand &SrcOp = MI.getOperand(I);
2618 assert(!SrcOp.isFPImm());
2619 if (SrcOp.isImm()) {
2620 APInt Imm(64, SrcOp.getImm());
2621 Imm.ashrInPlace(Part * 32);
2622 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2623 } else {
2624 assert(SrcOp.isReg());
2625 Register Src = SrcOp.getReg();
2626 if (Src.isPhysical())
2627 MovDPP.addReg(RI.getSubReg(Src, Sub));
2628 else
2629 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2630 }
2631 }
2632
2633 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2634 MovDPP.addImm(MO.getImm());
2635
2636 Split[Part] = MovDPP;
2637 ++Part;
2638 }
2639
2640 if (Dst.isVirtual())
2641 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2642 .addReg(Split[0]->getOperand(0).getReg())
2643 .addImm(AMDGPU::sub0)
2644 .addReg(Split[1]->getOperand(0).getReg())
2645 .addImm(AMDGPU::sub1);
2646
2647 MI.eraseFromParent();
2648 return std::pair(Split[0], Split[1]);
2649}
2650
2651std::optional<DestSourcePair>
2653 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2654 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2655
2656 return std::nullopt;
2657}
2658
2660 AMDGPU::OpName Src0OpName,
2661 MachineOperand &Src1,
2662 AMDGPU::OpName Src1OpName) const {
2663 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2664 if (!Src0Mods)
2665 return false;
2666
2667 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2668 assert(Src1Mods &&
2669 "All commutable instructions have both src0 and src1 modifiers");
2670
2671 int Src0ModsVal = Src0Mods->getImm();
2672 int Src1ModsVal = Src1Mods->getImm();
2673
2674 Src1Mods->setImm(Src0ModsVal);
2675 Src0Mods->setImm(Src1ModsVal);
2676 return true;
2677}
2678
2680 MachineOperand &RegOp,
2681 MachineOperand &NonRegOp) {
2682 Register Reg = RegOp.getReg();
2683 unsigned SubReg = RegOp.getSubReg();
2684 bool IsKill = RegOp.isKill();
2685 bool IsDead = RegOp.isDead();
2686 bool IsUndef = RegOp.isUndef();
2687 bool IsDebug = RegOp.isDebug();
2688
2689 if (NonRegOp.isImm())
2690 RegOp.ChangeToImmediate(NonRegOp.getImm());
2691 else if (NonRegOp.isFI())
2692 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2693 else if (NonRegOp.isGlobal()) {
2694 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2695 NonRegOp.getTargetFlags());
2696 } else
2697 return nullptr;
2698
2699 // Make sure we don't reinterpret a subreg index in the target flags.
2700 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2701
2702 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2703 NonRegOp.setSubReg(SubReg);
2704
2705 return &MI;
2706}
2707
2709 MachineOperand &NonRegOp1,
2710 MachineOperand &NonRegOp2) {
2711 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2712 int64_t NonRegVal = NonRegOp1.getImm();
2713
2714 NonRegOp1.setImm(NonRegOp2.getImm());
2715 NonRegOp2.setImm(NonRegVal);
2716 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2717 NonRegOp2.setTargetFlags(TargetFlags);
2718 return &MI;
2719}
2720
2721bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2722 unsigned OpIdx1) const {
2723 const MCInstrDesc &InstDesc = MI.getDesc();
2724 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2725 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2726
2727 unsigned Opc = MI.getOpcode();
2728 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2729
2730 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2731 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2732
2733 // Swap doesn't breach constant bus or literal limits
2734 // It may move literal to position other than src0, this is not allowed
2735 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2736 // FIXME: After gfx9, literal can be in place other than Src0
2737 if (isVALU(MI, /*AllowLDSDMA=*/true)) {
2738 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2739 !isInlineConstant(MO0, OpInfo1))
2740 return false;
2741 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2742 !isInlineConstant(MO1, OpInfo0))
2743 return false;
2744 }
2745
2746 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2747 if (OpInfo1.RegClass == -1)
2748 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2749 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2750 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2751 }
2752 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2753 if (OpInfo0.RegClass == -1)
2754 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2755 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2756 isLegalRegOperand(MI, OpIdx0, MO1);
2757 }
2758
2759 // No need to check 64-bit literals since swapping does not bring new
2760 // 64-bit literals into current instruction to fold to 32-bit
2761
2762 return isImmOperandLegal(MI, OpIdx1, MO0);
2763}
2764
2766 unsigned Src0Idx,
2767 unsigned Src1Idx) const {
// Commutes the operands at Src0Idx/Src1Idx of MI. Only the modify-in-place
// mode is supported (NewMI must be false). Returns nullptr when there is no
// commuted opcode, when isLegalToSwap rejects the swap, or when the two
// operands are a non-register pair other than imm/imm.
2768 assert(!NewMI && "this should never be used");
2769
2770 unsigned Opc = MI.getOpcode();
2771 int CommutedOpcode = commuteOpcode(Opc);
2772 if (CommutedOpcode == -1)
2773 return nullptr;
2774
// Normalize the pair so Src0Idx is the lower index; the assert below then
// requires it to be exactly the named src0/src1 operands of this opcode.
2775 if (Src0Idx > Src1Idx)
2776 std::swap(Src0Idx, Src1Idx);
2777
2778 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2779 static_cast<int>(Src0Idx) &&
2780 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2781 static_cast<int>(Src1Idx) &&
2782 "inconsistency with findCommutedOpIndices");
2783
2784 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2785 return nullptr;
2786
// Choose the swap helper from the operand kinds: reg/reg goes through the
// generic TargetInstrInfo implementation, reg/non-reg and imm/imm use the
// local helpers.
2787 MachineInstr *CommutedMI = nullptr;
2788 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2789 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2790 if (Src0.isReg() && Src1.isReg()) {
2791 // Be sure to copy the source modifiers to the right place.
2792 CommutedMI =
2793 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2794 } else if (Src0.isReg() && !Src1.isReg()) {
2795 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2796 } else if (!Src0.isReg() && Src1.isReg()) {
2797 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2798 } else if (Src0.isImm() && Src1.isImm()) {
2799 CommutedMI = swapImmOperands(MI, Src0, Src1);
2800 } else {
2801 // FIXME: Found two non registers to commute. This does happen.
2802 return nullptr;
2803 }
2804
// A helper may still have failed (null result). Only on success are the
// per-source modifier and sel operands exchanged and the descriptor switched
// to the commuted opcode.
2805 if (CommutedMI) {
2806 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2807 Src1, AMDGPU::OpName::src1_modifiers);
2808
2809 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2810 AMDGPU::OpName::src1_sel);
2811
2812 CommutedMI->setDesc(get(CommutedOpcode));
2813 }
2814
2815 return CommutedMI;
2816}
2817
2818// This needs to be implemented because the source modifiers may be inserted
2819// between the true commutable operands, and the base
2820// TargetInstrInfo::commuteInstruction uses it.
2822 unsigned &SrcOpIdx0,
2823 unsigned &SrcOpIdx1) const {
2824 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2825}
2826
2828 unsigned &SrcOpIdx0,
2829 unsigned &SrcOpIdx1) const {
// Descriptor-based variant: the commutable pair is always the operands named
// src0 and src1. Returns false when the descriptor is not commutable or when
// either named operand does not exist for this opcode.
2830 if (!Desc.isCommutable())
2831 return false;
2832
2833 unsigned Opc = Desc.getOpcode();
// getNamedOperandIdx reports a missing named operand as -1.
2834 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2835 if (Src0Idx == -1)
2836 return false;
2837
2838 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2839 if (Src1Idx == -1)
2840 return false;
2841
// Reconcile the caller-supplied indices with the src0/src1 pair; the result
// of fixCommutedOpIndices is returned unchanged.
2842 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2843}
2844
2846 int64_t BrOffset) const {
// Returns whether BrOffset, once converted to dwords and rebased to the
// instruction after the branch, fits in a signed field of BranchOffsetBits
// bits (the amdgpu-s-branch-bits option, 16 by default).
2847 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2848 // because its dest block is unanalyzable.
2849 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2850
2851 // Convert to dwords.
2852 BrOffset /= 4;
2853
2854 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2855 // from the next instruction.
2856 BrOffset -= 1;
2857
2858 return isIntN(BranchOffsetBits, BrOffset);
2859}
2860
2863 return MI.getOperand(0).getMBB();
2864}
2865
// Only the terminators of MBB are inspected; the answer is true as soon as one
// of them is an SI_IF, SI_ELSE or SI_LOOP pseudo.
2867 for (const MachineInstr &MI : MBB->terminators()) {
2868 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2869 MI.getOpcode() == AMDGPU::SI_LOOP)
2870 return true;
2871 }
// No such pseudo among the terminators (this includes a block with none).
2872 return false;
2873}
2874
2876 MachineBasicBlock &DestBB,
2877 MachineBasicBlock &RestoreBB,
2878 const DebugLoc &DL, int64_t BrOffset,
2879 RegScavenger *RS) const {
2880 assert(MBB.empty() &&
2881 "new block should be inserted for expanding unconditional branch");
2882 assert(MBB.pred_size() == 1);
2883 assert(RestoreBB.empty() &&
2884 "restore block should be inserted for restoring clobbered registers");
2885
2886 MachineFunction *MF = MBB.getParent();
2887 MachineRegisterInfo &MRI = MF->getRegInfo();
2889 auto I = MBB.end();
2890 auto &MCCtx = MF->getContext();
2891
2892 if (ST.useAddPC64Inst()) {
2893 MCSymbol *Offset =
2894 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2895 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2897 MCSymbol *PostAddPCLabel =
2898 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2899 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2900 auto *OffsetExpr = MCBinaryExpr::createSub(
2901 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2902 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2903 Offset->setVariableValue(OffsetExpr);
2904 return;
2905 }
2906
2907 assert(RS && "RegScavenger required for long branching");
2908
2909 // FIXME: Virtual register workaround for RegScavenger not working with empty
2910 // blocks.
2911 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2912
2913 // Note: as this is used after hazard recognizer we need to apply some hazard
2914 // workarounds directly.
2915 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2916 ST.hasVALUReadSGPRHazard();
2917 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2918 if (FlushSGPRWrites)
2919 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2921 };
2922
2923 // We need to compute the offset relative to the instruction immediately after
2924 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2925 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2926 ApplyHazardWorkarounds();
2927
2928 MCSymbol *PostGetPCLabel =
2929 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2930 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2931
2932 MCSymbol *OffsetLo =
2933 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2934 MCSymbol *OffsetHi =
2935 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2936 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2937 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2938 .addReg(PCReg, {}, AMDGPU::sub0)
2939 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2940 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2941 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2942 .addReg(PCReg, {}, AMDGPU::sub1)
2943 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2944 ApplyHazardWorkarounds();
2945
2946 // Insert the indirect branch after the other terminator.
2947 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2948 .addReg(PCReg);
2949
2950 // If a spill is needed for the pc register pair, we need to insert a spill
2951 // restore block right before the destination block, and insert a short branch
2952 // into the old destination block's fallthrough predecessor.
2953 // e.g.:
2954 //
2955 // s_cbranch_scc0 skip_long_branch:
2956 //
2957 // long_branch_bb:
2958 // spill s[8:9]
2959 // s_getpc_b64 s[8:9]
2960 // s_add_u32 s8, s8, restore_bb
2961 // s_addc_u32 s9, s9, 0
2962 // s_setpc_b64 s[8:9]
2963 //
2964 // skip_long_branch:
2965 // foo;
2966 //
2967 // .....
2968 //
2969 // dest_bb_fallthrough_predecessor:
2970 // bar;
2971 // s_branch dest_bb
2972 //
2973 // restore_bb:
2974 // restore s[8:9]
2975 // fallthrough dest_bb
2976 ///
2977 // dest_bb:
2978 // buzz;
2979
2980 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2981 Register Scav;
2982
2983 // If we've previously reserved a register for long branches
2984 // avoid running the scavenger and just use those registers
2985 if (LongBranchReservedReg) {
2986 RS->enterBasicBlock(MBB);
2987 Scav = LongBranchReservedReg;
2988 } else {
2989 RS->enterBasicBlockEnd(MBB);
2990 Scav = RS->scavengeRegisterBackwards(
2991 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2992 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2993 }
2994 if (Scav) {
2995 RS->setRegUsed(Scav);
2996 MRI.replaceRegWith(PCReg, Scav);
2997 MRI.clearVirtRegs();
2998 } else {
2999 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3000 // SGPR spill.
3001 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3002 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3003 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3004 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3005 MRI.clearVirtRegs();
3006 }
3007
3008 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3009 // Now, the distance could be defined.
3011 MCSymbolRefExpr::create(DestLabel, MCCtx),
3012 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3013 // Add offset assignments.
3014 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3015 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3016 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3017 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3018}
3019
3020unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3021 switch (Cond) {
3022 case SIInstrInfo::SCC_TRUE:
3023 return AMDGPU::S_CBRANCH_SCC1;
3024 case SIInstrInfo::SCC_FALSE:
3025 return AMDGPU::S_CBRANCH_SCC0;
3026 case SIInstrInfo::VCCNZ:
3027 return AMDGPU::S_CBRANCH_VCCNZ;
3028 case SIInstrInfo::VCCZ:
3029 return AMDGPU::S_CBRANCH_VCCZ;
3030 case SIInstrInfo::EXECNZ:
3031 return AMDGPU::S_CBRANCH_EXECNZ;
3032 case SIInstrInfo::EXECZ:
3033 return AMDGPU::S_CBRANCH_EXECZ;
3034 default:
3035 llvm_unreachable("invalid branch predicate");
3036 }
3037}
3038
3039SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3040 switch (Opcode) {
3041 case AMDGPU::S_CBRANCH_SCC0:
3042 return SCC_FALSE;
3043 case AMDGPU::S_CBRANCH_SCC1:
3044 return SCC_TRUE;
3045 case AMDGPU::S_CBRANCH_VCCNZ:
3046 return VCCNZ;
3047 case AMDGPU::S_CBRANCH_VCCZ:
3048 return VCCZ;
3049 case AMDGPU::S_CBRANCH_EXECNZ:
3050 return EXECNZ;
3051 case AMDGPU::S_CBRANCH_EXECZ:
3052 return EXECZ;
3053 default:
3054 return INVALID_BR;
3055 }
3056}
3057
3061 MachineBasicBlock *&FBB,
3063 bool AllowModify) const {
3064 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3065 // Unconditional Branch
3066 TBB = I->getOperand(0).getMBB();
3067 return false;
3068 }
3069
3070 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3071 if (Pred == INVALID_BR)
3072 return true;
3073
3074 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3075 Cond.push_back(MachineOperand::CreateImm(Pred));
3076 Cond.push_back(I->getOperand(1)); // Save the branch register.
3077
3078 ++I;
3079
3080 if (I == MBB.end()) {
3081 // Conditional branch followed by fall-through.
3082 TBB = CondBB;
3083 return false;
3084 }
3085
3086 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3087 TBB = CondBB;
3088 FBB = I->getOperand(0).getMBB();
3089 return false;
3090 }
3091
3092 return true;
3093}
3094
3096 MachineBasicBlock *&FBB,
3098 bool AllowModify) const {
// Front end for analyzeBranchImpl: steps past the leading exec-management
// terminators of MBB and hands the first branch/return terminator to
// analyzeBranchImpl. Returns false without touching TBB/FBB/Cond when there is
// nothing to analyze, and true for terminators this code will not analyze.
3099 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3100 auto E = MBB.end();
// Block has no terminators at all.
3101 if (I == E)
3102 return false;
3103
3104 // Skip over the instructions that are artificially terminators for special
3105 // exec management.
3106 while (I != E && !I->isBranch() && !I->isReturn()) {
3107 switch (I->getOpcode()) {
3108 case AMDGPU::S_MOV_B64_term:
3109 case AMDGPU::S_XOR_B64_term:
3110 case AMDGPU::S_OR_B64_term:
3111 case AMDGPU::S_ANDN2_B64_term:
3112 case AMDGPU::S_AND_B64_term:
3113 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3114 case AMDGPU::S_MOV_B32_term:
3115 case AMDGPU::S_XOR_B32_term:
3116 case AMDGPU::S_OR_B32_term:
3117 case AMDGPU::S_ANDN2_B32_term:
3118 case AMDGPU::S_AND_B32_term:
3119 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3120 break;
3121 case AMDGPU::SI_IF:
3122 case AMDGPU::SI_ELSE:
3123 case AMDGPU::SI_KILL_I1_TERMINATOR:
3124 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3125 // FIXME: It's messy that these need to be considered here at all.
3126 return true;
3127 default:
// Any other non-branch, non-return terminator is treated as a bug.
3128 llvm_unreachable("unexpected non-branch terminator inst");
3129 }
3130
3131 ++I;
3132 }
3133
// Every terminator was one of the skippable *_term instructions.
3134 if (I == E)
3135 return false;
3136
3137 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3138}
3139
3141 int *BytesRemoved) const {
// Erases every branch or return among the terminators of MBB and returns how
// many instructions were erased. When BytesRemoved is non-null it receives the
// summed getInstSizeInBytes of the erased instructions.
3142 unsigned Count = 0;
3143 unsigned RemovedSize = 0;
// make_early_inc_range lets the loop erase the current instruction while
// iterating.
3144 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3145 // Skip over artificial terminators when removing instructions.
3146 if (MI.isBranch() || MI.isReturn()) {
// The size is read before the instruction is erased.
3147 RemovedSize += getInstSizeInBytes(MI);
3148 MI.eraseFromParent();
3149 ++Count;
3150 }
3151 }
3152
3153 if (BytesRemoved)
3154 *BytesRemoved = RemovedSize;
3155
3156 return Count;
3157}
3158
3159// Copy the flags onto the implicit condition register operand.
3161 const MachineOperand &OrigCond) {
3162 CondReg.setIsUndef(OrigCond.isUndef());
3163 CondReg.setIsKill(OrigCond.isKill());
3164}
3165
3168 MachineBasicBlock *FBB,
3170 const DebugLoc &DL,
3171 int *BytesAdded) const {
// Appends branch instructions to the end of MBB and returns how many were
// added (1 or 2). Three shapes are handled: an unconditional S_BRANCH, a
// single conditional branch to TBB, and a conditional branch to TBB followed
// by an S_BRANCH to FBB. When BytesAdded is non-null it receives 4 bytes per
// branch, or 8 per branch when ST.hasOffset3fBug() is true.
3172 if (!FBB && Cond.empty()) {
3173 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3174 .addMBB(TBB);
3175 if (BytesAdded)
3176 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3177 return 1;
3178 }
3179
// From here on Cond[0] carries the BranchPredicate as an immediate and Cond[1]
// is the condition register operand whose undef/kill flags are copied.
3180 assert(TBB && Cond[0].isImm());
3181
3182 unsigned Opcode
3183 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3184
3185 if (!FBB) {
3186 MachineInstr *CondBr =
3187 BuildMI(&MBB, DL, get(Opcode))
3188 .addMBB(TBB);
3189
3190 // Copy the flags onto the implicit condition register operand.
3191 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3192 fixImplicitOperands(*CondBr);
3193
3194 if (BytesAdded)
3195 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3196 return 1;
3197 }
3198
3199 assert(TBB && FBB);
3200
3201 MachineInstr *CondBr =
3202 BuildMI(&MBB, DL, get(Opcode))
3203 .addMBB(TBB);
3204 fixImplicitOperands(*CondBr);
3205 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3206 .addMBB(FBB);
3207
// NOTE(review): these two setters duplicate preserveCondRegFlags, and here the
// flags are copied after fixImplicitOperands whereas the single-branch path
// above copies them before it -- confirm the ordering difference is harmless.
3208 MachineOperand &CondReg = CondBr->getOperand(1);
3209 CondReg.setIsUndef(Cond[1].isUndef());
3210 CondReg.setIsKill(Cond[1].isKill());
3211
3212 if (BytesAdded)
3213 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3214
3215 return 2;
3216}
3217
// Inverts the condition in place by negating the immediate in Cond[0] and
// returns false. Returns true, leaving Cond untouched, when Cond does not have
// exactly two entries or Cond[0] is not an immediate.
3220 if (Cond.size() != 2) {
3221 return true;
3222 }
3223
3224 if (Cond[0].isImm()) {
// Presumably opposite BranchPredicate values are arithmetic negatives of each
// other, so negation flips the predicate -- verify against the enum.
3225 Cond[0].setImm(-Cond[0].getImm());
3226 return false;
3227 }
3228
3229 return true;
3230}
3231
3234 Register DstReg, Register TrueReg,
3235 Register FalseReg, int &CondCycles,
3236 int &TrueCycles, int &FalseCycles) const {
// Decides from the predicate in Cond[0] and the register class of
// TrueReg/FalseReg whether a select can replace the branch. Both registers
// must have the same register class. The three cycle outputs all receive the
// same instruction-count estimate; they are written before the final class
// check, so they may be set even when the result is false.
3237 switch (Cond[0].getImm()) {
3238 case VCCNZ:
3239 case VCCZ: {
3240 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3241 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3242 if (MRI.getRegClass(FalseReg) != RC)
3243 return false;
3244
// One instruction per 32 bits of register width.
3245 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3246 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3247
3248 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3249 return RI.hasVGPRs(RC) && NumInsts <= 6;
3250 }
3251 case SCC_TRUE:
3252 case SCC_FALSE: {
3253 // FIXME: We could insert for VGPRs if we could replace the original compare
3254 // with a vector one.
3255 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3256 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3257 if (MRI.getRegClass(FalseReg) != RC)
3258 return false;
3259
3260 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3261
// An even number of 32-bit pieces is halved, i.e. counted as 64-bit selects.
3262 // Multiples of 8 can do s_cselect_b64
3263 if (NumInsts % 2 == 0)
3264 NumInsts /= 2;
3265
3266 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3267 return RI.isSGPRClass(RC);
3268 }
3269 default:
// Every other predicate value is rejected.
3270 return false;
3271 }
3272}
3273
3277 Register TrueReg, Register FalseReg) const {
3278 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3279 if (Pred == VCCZ || Pred == SCC_FALSE) {
3280 Pred = static_cast<BranchPredicate>(-Pred);
3281 std::swap(TrueReg, FalseReg);
3282 }
3283
3284 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3285 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3286 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3287
3288 if (DstSize == 32) {
3290 if (Pred == SCC_TRUE) {
3291 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3292 .addReg(TrueReg)
3293 .addReg(FalseReg);
3294 } else {
3295 // Instruction's operands are backwards from what is expected.
3296 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3297 .addReg(FalseReg)
3298 .addReg(TrueReg);
3299 }
3300
3301 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3302 return;
3303 }
3304
3305 if (DstSize == 64 && Pred == SCC_TRUE) {
3307 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3308 .addReg(TrueReg)
3309 .addReg(FalseReg);
3310
3311 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3312 return;
3313 }
3314
3315 static const int16_t Sub0_15[] = {
3316 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3317 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3318 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3319 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3320 };
3321
3322 static const int16_t Sub0_15_64[] = {
3323 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3324 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3325 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3326 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3327 };
3328
3329 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3330 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3331 const int16_t *SubIndices = Sub0_15;
3332 int NElts = DstSize / 32;
3333
3334 // 64-bit select is only available for SALU.
3335 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3336 if (Pred == SCC_TRUE) {
3337 if (NElts % 2) {
3338 SelOp = AMDGPU::S_CSELECT_B32;
3339 EltRC = &AMDGPU::SGPR_32RegClass;
3340 } else {
3341 SelOp = AMDGPU::S_CSELECT_B64;
3342 EltRC = &AMDGPU::SGPR_64RegClass;
3343 SubIndices = Sub0_15_64;
3344 NElts /= 2;
3345 }
3346 }
3347
3349 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3350
3351 I = MIB->getIterator();
3352
3354 for (int Idx = 0; Idx != NElts; ++Idx) {
3355 Register DstElt = MRI.createVirtualRegister(EltRC);
3356 Regs.push_back(DstElt);
3357
3358 unsigned SubIdx = SubIndices[Idx];
3359
3361 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3362 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3363 .addReg(FalseReg, {}, SubIdx)
3364 .addReg(TrueReg, {}, SubIdx);
3365 } else {
3366 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3367 .addReg(TrueReg, {}, SubIdx)
3368 .addReg(FalseReg, {}, SubIdx);
3369 }
3370
3371 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3373
3374 MIB.addReg(DstElt)
3375 .addImm(SubIdx);
3376 }
3377}
3378
3380
3381 if (MI.isBranch() || MI.isCall() || MI.isReturn() || MI.isIndirectBranch())
3382 return true;
3383
3384 switch (MI.getOpcode()) {
3385 case AMDGPU::S_ENDPGM:
3386 case AMDGPU::S_ENDPGM_SAVED:
3387 case AMDGPU::S_TRAP:
3388 case AMDGPU::S_GETREG_B32:
3389 case AMDGPU::S_SETREG_B32:
3390 case AMDGPU::S_SETREG_B32_mode:
3391 case AMDGPU::S_SETREG_IMM32_B32:
3392 case AMDGPU::S_SETREG_IMM32_B32_mode:
3393 case AMDGPU::S_SENDMSG:
3394 case AMDGPU::S_SENDMSGHALT:
3395 case AMDGPU::S_SENDMSG_RTN_B32:
3396 case AMDGPU::S_SENDMSG_RTN_B64:
3397 case AMDGPU::S_BARRIER_WAIT:
3398 case AMDGPU::S_BARRIER_SIGNAL_M0:
3399 case AMDGPU::S_BARRIER_SIGNAL_IMM:
3400 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
3401 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
3402 return true;
3403 default:
3404 return false;
3405 }
3406}
3407
3409 switch (MI.getOpcode()) {
3410 case AMDGPU::V_MOV_B16_t16_e32:
3411 case AMDGPU::V_MOV_B16_t16_e64:
3412 case AMDGPU::V_MOV_B32_e32:
3413 case AMDGPU::V_MOV_B32_e64:
3414 case AMDGPU::V_MOV_B64_PSEUDO:
3415 case AMDGPU::V_MOV_B64_e32:
3416 case AMDGPU::V_MOV_B64_e64:
3417 case AMDGPU::S_MOV_B32:
3418 case AMDGPU::S_MOV_B64:
3419 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3420 case AMDGPU::COPY:
3421 case AMDGPU::WWM_COPY:
3422 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3423 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3424 case AMDGPU::V_ACCVGPR_MOV_B32:
3425 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3426 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3427 return true;
3428 default:
3429 return false;
3430 }
3431}
3432
3434 switch (MI.getOpcode()) {
3435 case AMDGPU::V_MOV_B16_t16_e32:
3436 case AMDGPU::V_MOV_B16_t16_e64:
3437 return 2;
3438 case AMDGPU::V_MOV_B32_e32:
3439 case AMDGPU::V_MOV_B32_e64:
3440 case AMDGPU::V_MOV_B64_PSEUDO:
3441 case AMDGPU::V_MOV_B64_e32:
3442 case AMDGPU::V_MOV_B64_e64:
3443 case AMDGPU::S_MOV_B32:
3444 case AMDGPU::S_MOV_B64:
3445 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3446 case AMDGPU::COPY:
3447 case AMDGPU::WWM_COPY:
3448 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3449 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3450 case AMDGPU::V_ACCVGPR_MOV_B32:
3451 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3452 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3453 return 1;
3454 default:
3455 llvm_unreachable("MI is not a foldable copy");
3456 }
3457}
3458
// Named modifier operands (source modifiers, clamp, omod, op_sel). The
// operand-removal loop that follows walks this table in reverse order.
3459static constexpr AMDGPU::OpName ModifierOpNames[] = {
3460 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3461 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3462 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3463
// Removes from MI every operand named in ModifierOpNames that exists for MI's
// opcode.
3465 unsigned Opc = MI.getOpcode();
// The table is walked back to front. Presumably this removes higher operand
// indices first so the indices looked up for the remaining names stay valid --
// TODO confirm operand order.
3466 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3467 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
// A negative index means the opcode has no such named operand.
3468 if (Idx >= 0)
3469 MI.removeOperand(Idx);
3470 }
3471}
3472
3474 const MCInstrDesc &NewDesc) const {
// Switches MI to NewDesc, then drops trailing operands beyond what the new
// descriptor accounts for (explicit operands + implicit uses + implicit defs).
3475 MI.setDesc(NewDesc);
3476
3477 // Remove any leftover implicit operands from mutating the instruction. e.g.
3478 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3479 // anymore.
3480 const MCInstrDesc &Desc = MI.getDesc();
3481 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3482 Desc.implicit_defs().size();
3483
// Operands are removed from the back so earlier indices are unaffected.
// NOTE(review): I is unsigned. If NumOps were 0, or MI had no operands, the
// condition I >= NumOps would not terminate the loop as intended (I wraps
// around) -- confirm callers never reach that case.
3484 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3485 MI.removeOperand(I);
3486}
3487
3488std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3489 unsigned SubRegIndex) {
3490 switch (SubRegIndex) {
3491 case AMDGPU::NoSubRegister:
3492 return Imm;
3493 case AMDGPU::sub0:
3494 return SignExtend64<32>(Imm);
3495 case AMDGPU::sub1:
3496 return SignExtend64<32>(Imm >> 32);
3497 case AMDGPU::lo16:
3498 return SignExtend64<16>(Imm);
3499 case AMDGPU::hi16:
3500 return SignExtend64<16>(Imm >> 16);
3501 case AMDGPU::sub1_lo16:
3502 return SignExtend64<16>(Imm >> 32);
3503 case AMDGPU::sub1_hi16:
3504 return SignExtend64<16>(Imm >> 48);
3505 default:
3506 return std::nullopt;
3507 }
3508
3509 llvm_unreachable("covered subregister switch");
3510}
3511
3512static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3513 switch (Opc) {
3514 case AMDGPU::V_MAC_F16_e32:
3515 case AMDGPU::V_MAC_F16_e64:
3516 case AMDGPU::V_MAD_F16_e64:
3517 return AMDGPU::V_MADAK_F16;
3518 case AMDGPU::V_MAC_F32_e32:
3519 case AMDGPU::V_MAC_F32_e64:
3520 case AMDGPU::V_MAD_F32_e64:
3521 return AMDGPU::V_MADAK_F32;
3522 case AMDGPU::V_FMAC_F32_e32:
3523 case AMDGPU::V_FMAC_F32_e64:
3524 case AMDGPU::V_FMA_F32_e64:
3525 return AMDGPU::V_FMAAK_F32;
3526 case AMDGPU::V_FMAC_F16_e32:
3527 case AMDGPU::V_FMAC_F16_e64:
3528 case AMDGPU::V_FMAC_F16_t16_e64:
3529 case AMDGPU::V_FMAC_F16_fake16_e64:
3530 case AMDGPU::V_FMAC_F16_t16_e32:
3531 case AMDGPU::V_FMAC_F16_fake16_e32:
3532 case AMDGPU::V_FMA_F16_e64:
3533 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3534 ? AMDGPU::V_FMAAK_F16_t16
3535 : AMDGPU::V_FMAAK_F16_fake16
3536 : AMDGPU::V_FMAAK_F16;
3537 case AMDGPU::V_FMAC_F64_e32:
3538 case AMDGPU::V_FMAC_F64_e64:
3539 case AMDGPU::V_FMA_F64_e64:
3540 return AMDGPU::V_FMAAK_F64;
3541 default:
3542 llvm_unreachable("invalid instruction");
3543 }
3544}
3545
3546static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3547 switch (Opc) {
3548 case AMDGPU::V_MAC_F16_e32:
3549 case AMDGPU::V_MAC_F16_e64:
3550 case AMDGPU::V_MAD_F16_e64:
3551 return AMDGPU::V_MADMK_F16;
3552 case AMDGPU::V_MAC_F32_e32:
3553 case AMDGPU::V_MAC_F32_e64:
3554 case AMDGPU::V_MAD_F32_e64:
3555 return AMDGPU::V_MADMK_F32;
3556 case AMDGPU::V_FMAC_F32_e32:
3557 case AMDGPU::V_FMAC_F32_e64:
3558 case AMDGPU::V_FMA_F32_e64:
3559 return AMDGPU::V_FMAMK_F32;
3560 case AMDGPU::V_FMAC_F16_e32:
3561 case AMDGPU::V_FMAC_F16_e64:
3562 case AMDGPU::V_FMAC_F16_t16_e64:
3563 case AMDGPU::V_FMAC_F16_fake16_e64:
3564 case AMDGPU::V_FMAC_F16_t16_e32:
3565 case AMDGPU::V_FMAC_F16_fake16_e32:
3566 case AMDGPU::V_FMA_F16_e64:
3567 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3568 ? AMDGPU::V_FMAMK_F16_t16
3569 : AMDGPU::V_FMAMK_F16_fake16
3570 : AMDGPU::V_FMAMK_F16;
3571 case AMDGPU::V_FMAC_F64_e32:
3572 case AMDGPU::V_FMAC_F64_e64:
3573 case AMDGPU::V_FMA_F64_e64:
3574 return AMDGPU::V_FMAMK_F64;
3575 default:
3576 llvm_unreachable("invalid instruction");
3577 }
3578}
3579
3581 Register Reg, MachineRegisterInfo *MRI) const {
3582 int64_t Imm;
3583 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3584 return false;
3585
3586 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3587
3588 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3589
3590 unsigned Opc = UseMI.getOpcode();
3591 if (Opc == AMDGPU::COPY) {
3592 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3593
3594 Register DstReg = UseMI.getOperand(0).getReg();
3595 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3596
3597 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3598
3599 if (HasMultipleUses) {
3600 // TODO: This should fold in more cases with multiple use, but we need to
3601 // more carefully consider what those uses are.
3602 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3603
3604 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3605 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3606 return false;
3607
3608 // Most of the time folding a 32-bit inline constant is free (though this
3609 // might not be true if we can't later fold it into a real user).
3610 //
3611 // FIXME: This isInlineConstant check is imprecise if
3612 // getConstValDefinedInReg handled the tricky non-mov cases.
3613 if (ImmDefSize == 32 &&
3615 return false;
3616 }
3617
3618 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3619 RI.getSubRegIdxSize(UseSubReg) == 16;
3620
3621 if (Is16Bit) {
3622 if (RI.hasVGPRs(DstRC))
3623 return false; // Do not clobber vgpr_hi16
3624
3625 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3626 return false;
3627 }
3628
3629 MachineFunction *MF = UseMI.getMF();
3630
3631 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3632 MCRegister MovDstPhysReg =
3633 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3634
3635 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3636
3637 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3638 for (unsigned MovOp :
3639 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3640 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3641 const MCInstrDesc &MovDesc = get(MovOp);
3642
3643 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3644 if (Is16Bit) {
3645 // We just need to find a correctly sized register class, so the
3646 // subregister index compatibility doesn't matter since we're statically
3647 // extracting the immediate value.
3648 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3649 if (!MovDstRC)
3650 continue;
3651
3652 if (MovDstPhysReg) {
3653 // FIXME: We probably should not do this. If there is a live value in
3654 // the high half of the register, it will be corrupted.
3655 MovDstPhysReg =
3656 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3657 if (!MovDstPhysReg)
3658 continue;
3659 }
3660 }
3661
3662 // Result class isn't the right size, try the next instruction.
3663 if (MovDstPhysReg) {
3664 if (!MovDstRC->contains(MovDstPhysReg))
3665 return false;
3666 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3667 // TODO: This will be overly conservative in the case of 16-bit virtual
3668 // SGPRs. We could hack up the virtual register uses to use a compatible
3669 // 32-bit class.
3670 continue;
3671 }
3672
3673 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3674
3675 // Ensure the interpreted immediate value is a valid operand in the new
3676 // mov.
3677 //
3678 // FIXME: isImmOperandLegal should have form that doesn't require existing
3679 // MachineInstr or MachineOperand
3680 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3681 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3682 break;
3683
3684 NewOpc = MovOp;
3685 break;
3686 }
3687
3688 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3689 return false;
3690
3691 if (Is16Bit) {
3692 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3693 if (MovDstPhysReg)
3694 UseMI.getOperand(0).setReg(MovDstPhysReg);
3695 assert(UseMI.getOperand(1).getReg().isVirtual());
3696 }
3697
3698 const MCInstrDesc &NewMCID = get(NewOpc);
3699 UseMI.setDesc(NewMCID);
3700 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3701 UseMI.addImplicitDefUseOperands(*MF);
3702 return true;
3703 }
3704
3705 if (HasMultipleUses)
3706 return false;
3707
3708 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3709 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3710 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3711 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3712 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3713 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3714 Opc == AMDGPU::V_FMAC_F64_e64) {
3715 // Don't fold if we are using source or output modifiers. The new VOP2
3716 // instructions don't have them.
3718 return false;
3719
3720 // If this is a free constant, there's no reason to do this.
3721 // TODO: We could fold this here instead of letting SIFoldOperands do it
3722 // later.
3723 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3724
3725 // Any src operand can be used for the legality check.
3726 if (isInlineConstant(UseMI, Src0Idx, Imm))
3727 return false;
3728
3729 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3730
3731 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3732 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3733
3734 auto CopyRegOperandToNarrowerRC =
3735 [MRI, this](MachineInstr &MI, unsigned OpNo,
3736 const TargetRegisterClass *NewRC) -> void {
3737 if (!MI.getOperand(OpNo).isReg())
3738 return;
3739 Register Reg = MI.getOperand(OpNo).getReg();
3740 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3741 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3742 return;
3743 Register Tmp = MRI->createVirtualRegister(NewRC);
3744 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3745 get(AMDGPU::COPY), Tmp)
3746 .addReg(Reg);
3747 MI.getOperand(OpNo).setReg(Tmp);
3748 MI.getOperand(OpNo).setIsKill();
3749 };
3750
3751 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3752 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3753 (Src1->isReg() && Src1->getReg() == Reg)) {
3754 MachineOperand *RegSrc =
3755 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3756 if (!RegSrc->isReg())
3757 return false;
3758 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3759 ST.getConstantBusLimit(Opc) < 2)
3760 return false;
3761
3762 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3763 return false;
3764
3765 // If src2 is also a literal constant then we have to choose which one to
3766 // fold. In general it is better to choose madak so that the other literal
3767 // can be materialized in an sgpr instead of a vgpr:
3768 // s_mov_b32 s0, literal
3769 // v_madak_f32 v0, s0, v0, literal
3770 // Instead of:
3771 // v_mov_b32 v1, literal
3772 // v_madmk_f32 v0, v0, literal, v1
3773 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3774 if (Def && Def->isMoveImmediate() &&
3775 !isInlineConstant(Def->getOperand(1)))
3776 return false;
3777
3778 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3779 if (pseudoToMCOpcode(NewOpc) == -1)
3780 return false;
3781
3782 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3783 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3784
3785 // FIXME: This would be a lot easier if we could return a new instruction
3786 // instead of having to modify in place.
3787
3788 Register SrcReg = RegSrc->getReg();
3789 unsigned SrcSubReg = RegSrc->getSubReg();
3790 Src0->setReg(SrcReg);
3791 Src0->setSubReg(SrcSubReg);
3792 Src0->setIsKill(RegSrc->isKill());
3793
3794 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3795 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3796 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3797 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3798 UseMI.untieRegOperand(
3799 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3800
3801 Src1->ChangeToImmediate(*SubRegImm);
3802
3804 UseMI.setDesc(get(NewOpc));
3805
3806 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3807 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3808 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3809 Register Tmp = MRI->createVirtualRegister(NewRC);
3810 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3811 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3812 UseMI.getOperand(0).getReg())
3813 .addReg(Tmp, RegState::Kill);
3814 UseMI.getOperand(0).setReg(Tmp);
3815 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3816 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3817 }
3818
3819 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3820 if (DeleteDef)
3821 DefMI.eraseFromParent();
3822
3823 return true;
3824 }
3825
3826 // Added part is the constant: Use v_madak_{f16, f32}.
3827 if (Src2->isReg() && Src2->getReg() == Reg) {
3828 if (ST.getConstantBusLimit(Opc) < 2) {
3829 // Not allowed to use constant bus for another operand.
3830 // We can however allow an inline immediate as src0.
3831 bool Src0Inlined = false;
3832 if (Src0->isReg()) {
3833 // Try to inline constant if possible.
3834 // If the Def moves immediate and the use is single
3835 // We are saving VGPR here.
3836 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3837 if (Def && Def->isMoveImmediate() &&
3838 isInlineConstant(Def->getOperand(1)) &&
3839 MRI->hasOneNonDBGUse(Src0->getReg())) {
3840 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3841 Src0Inlined = true;
3842 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3843 RI.isSGPRReg(*MRI, Src0->getReg())) {
3844 return false;
3845 }
3846 // VGPR is okay as Src0 - fallthrough
3847 }
3848
3849 if (Src1->isReg() && !Src0Inlined) {
3850 // We have one slot for inlinable constant so far - try to fill it
3851 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3852 if (Def && Def->isMoveImmediate() &&
3853 isInlineConstant(Def->getOperand(1)) &&
3854 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3855 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3856 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3857 return false;
3858 // VGPR is okay as Src1 - fallthrough
3859 }
3860 }
3861
3862 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3863 if (pseudoToMCOpcode(NewOpc) == -1)
3864 return false;
3865
3866 // FIXME: This would be a lot easier if we could return a new instruction
3867 // instead of having to modify in place.
3868
3869 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3870 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3871 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3872 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3873 UseMI.untieRegOperand(
3874 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3875
3876 const std::optional<int64_t> SubRegImm =
3877 extractSubregFromImm(Imm, Src2->getSubReg());
3878
3879 // ChangingToImmediate adds Src2 back to the instruction.
3880 Src2->ChangeToImmediate(*SubRegImm);
3881
3882 // These come before src2.
3884 UseMI.setDesc(get(NewOpc));
3885
3886 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3887 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3888 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3889 Register Tmp = MRI->createVirtualRegister(NewRC);
3890 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3891 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3892 UseMI.getOperand(0).getReg())
3893 .addReg(Tmp, RegState::Kill);
3894 UseMI.getOperand(0).setReg(Tmp);
3895 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3896 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3897 }
3898
3899 // It might happen that UseMI was commuted
3900 // and we now have SGPR as SRC1. If so 2 inlined
3901 // constant and SGPR are illegal.
3903
3904 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3905 if (DeleteDef)
3906 DefMI.eraseFromParent();
3907
3908 return true;
3909 }
3910 }
3911
3912 return false;
3913}
3914
3915static bool
3918 if (BaseOps1.size() != BaseOps2.size())
3919 return false;
3920 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3921 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3922 return false;
3923 }
3924 return true;
3925}
3926
3927static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3928 LocationSize WidthB, int OffsetB) {
3929 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3930 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3931 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3932 return LowWidth.hasValue() &&
3933 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3934}
3935
3936bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3937 const MachineInstr &MIb) const {
3938 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3939 int64_t Offset0, Offset1;
3940 LocationSize Dummy0 = LocationSize::precise(0);
3941 LocationSize Dummy1 = LocationSize::precise(0);
3942 bool Offset0IsScalable, Offset1IsScalable;
3943 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3944 Dummy0, &RI) ||
3945 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3946 Dummy1, &RI))
3947 return false;
3948
3949 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3950 return false;
3951
3952 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3953 // FIXME: Handle ds_read2 / ds_write2.
3954 return false;
3955 }
3956 LocationSize Width0 = MIa.memoperands().front()->getSize();
3957 LocationSize Width1 = MIb.memoperands().front()->getSize();
3958 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3959}
3960
3962 const MachineInstr &MIb) const {
3963 assert(MIa.mayLoadOrStore() &&
3964 "MIa must load from or modify a memory location");
3965 assert(MIb.mayLoadOrStore() &&
3966 "MIb must load from or modify a memory location");
3967
3969 return false;
3970
3971 // XXX - Can we relax this between address spaces?
3972 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3973 return false;
3974
3975 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3976 return false;
3977
3978 if (MIa.isBundle() || MIb.isBundle())
3979 return false;
3980
3981 // TODO: Should we check the address space from the MachineMemOperand? That
3982 // would allow us to distinguish objects we know don't alias based on the
3983 // underlying address space, even if it was lowered to a different one,
3984 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3985 // buffer.
3986 if (isDS(MIa)) {
3987 if (isDS(MIb))
3988 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3989
3990 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3991 }
3992
3993 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3994 if (isMUBUF(MIb) || isMTBUF(MIb))
3995 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3996
3997 if (isFLAT(MIb))
3998 return isFLATScratch(MIb);
3999
4000 return !isSMRD(MIb);
4001 }
4002
4003 if (isSMRD(MIa)) {
4004 if (isSMRD(MIb))
4005 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4006
4007 if (isFLAT(MIb))
4008 return isFLATScratch(MIb);
4009
4010 return !isMUBUF(MIb) && !isMTBUF(MIb);
4011 }
4012
4013 if (isFLAT(MIa)) {
4014 if (isFLAT(MIb)) {
4015 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4016 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4017 return true;
4018
4019 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4020 }
4021
4022 return false;
4023 }
4024
4025 return false;
4026}
4027
4029 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4030 if (Reg.isPhysical())
4031 return false;
4032 auto *Def = MRI.getUniqueVRegDef(Reg);
4033 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4034 Imm = Def->getOperand(1).getImm();
4035 if (DefMI)
4036 *DefMI = Def;
4037 return true;
4038 }
4039 return false;
4040}
4041
4042static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4043 MachineInstr **DefMI = nullptr) {
4044 if (!MO->isReg())
4045 return false;
4046 const MachineFunction *MF = MO->getParent()->getMF();
4047 const MachineRegisterInfo &MRI = MF->getRegInfo();
4048 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4049}
4050
4052 MachineInstr &NewMI) {
4053 if (LV) {
4054 unsigned NumOps = MI.getNumOperands();
4055 for (unsigned I = 1; I < NumOps; ++I) {
4056 MachineOperand &Op = MI.getOperand(I);
4057 if (Op.isReg() && Op.isKill())
4058 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4059 }
4060 }
4061}
4062
4063static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4064 switch (Opc) {
4065 case AMDGPU::V_MAC_F16_e32:
4066 case AMDGPU::V_MAC_F16_e64:
4067 return AMDGPU::V_MAD_F16_e64;
4068 case AMDGPU::V_MAC_F32_e32:
4069 case AMDGPU::V_MAC_F32_e64:
4070 return AMDGPU::V_MAD_F32_e64;
4071 case AMDGPU::V_MAC_LEGACY_F32_e32:
4072 case AMDGPU::V_MAC_LEGACY_F32_e64:
4073 return AMDGPU::V_MAD_LEGACY_F32_e64;
4074 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4075 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4076 return AMDGPU::V_FMA_LEGACY_F32_e64;
4077 case AMDGPU::V_FMAC_F16_e32:
4078 case AMDGPU::V_FMAC_F16_e64:
4079 case AMDGPU::V_FMAC_F16_t16_e64:
4080 case AMDGPU::V_FMAC_F16_fake16_e64:
4081 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4082 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4083 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4084 : AMDGPU::V_FMA_F16_gfx9_e64;
4085 case AMDGPU::V_FMAC_F32_e32:
4086 case AMDGPU::V_FMAC_F32_e64:
4087 return AMDGPU::V_FMA_F32_e64;
4088 case AMDGPU::V_FMAC_F64_e32:
4089 case AMDGPU::V_FMAC_F64_e64:
4090 return AMDGPU::V_FMA_F64_e64;
4091 default:
4092 llvm_unreachable("invalid instruction");
4093 }
4094}
4095
4096/// Helper struct for the implementation of 3-address conversion to communicate
4097/// updates made to instruction operands.
4099 /// Other instruction whose def is no longer used by the converted
4100 /// instruction.
4102};
4103
4105 LiveVariables *LV,
4106 LiveIntervals *LIS) const {
4107 MachineBasicBlock &MBB = *MI.getParent();
4108 MachineInstr *CandidateMI = &MI;
4109
4110 if (MI.isBundle()) {
4111 // This is a temporary placeholder for bundle handling that enables us to
4112 // exercise the relevant code paths in the two-address instruction pass.
4113 if (MI.getBundleSize() != 1)
4114 return nullptr;
4115 CandidateMI = MI.getNextNode();
4116 }
4117
4119 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4120 if (!NewMI)
4121 return nullptr;
4122
4123 if (MI.isBundle()) {
4124 CandidateMI->eraseFromBundle();
4125
4126 for (MachineOperand &MO : MI.all_defs()) {
4127 if (MO.isTied())
4128 MI.untieRegOperand(MO.getOperandNo());
4129 }
4130 } else {
4131 updateLiveVariables(LV, MI, *NewMI);
4132 if (LIS) {
4133 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4134 // SlotIndex of defs needs to be updated when converting to early-clobber
4135 MachineOperand &Def = NewMI->getOperand(0);
4136 if (Def.isEarlyClobber() && Def.isReg() &&
4137 LIS->hasInterval(Def.getReg())) {
4138 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4139 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4140 auto &LI = LIS->getInterval(Def.getReg());
4141 auto UpdateDefIndex = [&](LiveRange &LR) {
4142 auto *S = LR.find(OldIndex);
4143 if (S != LR.end() && S->start == OldIndex) {
4144 assert(S->valno && S->valno->def == OldIndex);
4145 S->start = NewIndex;
4146 S->valno->def = NewIndex;
4147 }
4148 };
4149 UpdateDefIndex(LI);
4150 for (auto &SR : LI.subranges())
4151 UpdateDefIndex(SR);
4152 }
4153 }
4154 }
4155
4156 if (U.RemoveMIUse) {
4157 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4158 // The only user is the instruction which will be killed.
4159 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4160
4161 if (MRI.hasOneNonDBGUse(DefReg)) {
4162 // We cannot just remove the DefMI here, calling pass will crash.
4163 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4164 U.RemoveMIUse->getOperand(0).setIsDead(true);
4165 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4166 U.RemoveMIUse->removeOperand(I);
4167 if (LV)
4168 LV->getVarInfo(DefReg).AliveBlocks.clear();
4169 }
4170
4171 if (MI.isBundle()) {
4172 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4173 if (!VRI.Reads && !VRI.Writes) {
4174 for (MachineOperand &MO : MI.all_uses()) {
4175 if (MO.isReg() && MO.getReg() == DefReg) {
4176 assert(MO.getSubReg() == 0 &&
4177 "tied sub-registers in bundles currently not supported");
4178 MI.removeOperand(MO.getOperandNo());
4179 break;
4180 }
4181 }
4182
4183 if (LIS)
4184 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4185 }
4186 } else if (LIS) {
4187 LiveInterval &DefLI = LIS->getInterval(DefReg);
4188
4189 // We cannot delete the original instruction here, so hack out the use
4190 // in the original instruction with a dummy register so we can use
4191 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4192 // not have the complexity of deleting a use to consider here.
4193 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4194 for (MachineOperand &MIOp : MI.uses()) {
4195 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4196 MIOp.setIsUndef(true);
4197 MIOp.setReg(DummyReg);
4198 }
4199 }
4200
4201 if (MI.isBundle()) {
4202 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4203 if (!VRI.Reads && !VRI.Writes) {
4204 for (MachineOperand &MIOp : MI.uses()) {
4205 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4206 MIOp.setIsUndef(true);
4207 MIOp.setReg(DummyReg);
4208 }
4209 }
4210 }
4211
4212 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4213 false, /*isUndef=*/true));
4214 }
4215
4216 LIS->shrinkToUses(&DefLI);
4217 }
4218 }
4219
4220 return MI.isBundle() ? &MI : NewMI;
4221}
4222
4224SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4225 ThreeAddressUpdates &U) const {
4226 MachineBasicBlock &MBB = *MI.getParent();
4227 unsigned Opc = MI.getOpcode();
4228
4229 // Handle MFMA.
4230 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4231 if (NewMFMAOpc != -1) {
4233 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4234 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4235 MIB.add(MI.getOperand(I));
4236 return MIB;
4237 }
4238
4239 if (SIInstrInfo::isWMMA(MI)) {
4240 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4241 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4242 .setMIFlags(MI.getFlags());
4243 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4244 MIB->addOperand(MI.getOperand(I));
4245 return MIB;
4246 }
4247
4248 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4249 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4250 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4251 "present pre-RA");
4252
4253 // Handle MAC/FMAC.
4254 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4255 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4256 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4257 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4258 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4259 bool Src0Literal = false;
4260
4261 switch (Opc) {
4262 default:
4263 return nullptr;
4264 case AMDGPU::V_MAC_F16_e64:
4265 case AMDGPU::V_FMAC_F16_e64:
4266 case AMDGPU::V_FMAC_F16_t16_e64:
4267 case AMDGPU::V_FMAC_F16_fake16_e64:
4268 case AMDGPU::V_MAC_F32_e64:
4269 case AMDGPU::V_MAC_LEGACY_F32_e64:
4270 case AMDGPU::V_FMAC_F32_e64:
4271 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4272 case AMDGPU::V_FMAC_F64_e64:
4273 break;
4274 case AMDGPU::V_MAC_F16_e32:
4275 case AMDGPU::V_FMAC_F16_e32:
4276 case AMDGPU::V_MAC_F32_e32:
4277 case AMDGPU::V_MAC_LEGACY_F32_e32:
4278 case AMDGPU::V_FMAC_F32_e32:
4279 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4280 case AMDGPU::V_FMAC_F64_e32: {
4281 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4282 AMDGPU::OpName::src0);
4283 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4284 if (!Src0->isReg() && !Src0->isImm())
4285 return nullptr;
4286
4287 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4288 Src0Literal = true;
4289
4290 break;
4291 }
4292 }
4293
4294 MachineInstrBuilder MIB;
4295 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4296 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4297 const MachineOperand *Src0Mods =
4298 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4299 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4300 const MachineOperand *Src1Mods =
4301 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4302 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4303 const MachineOperand *Src2Mods =
4304 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4305 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4306 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4307 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4308
4309 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4310 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4311 // If we have an SGPR input, we will violate the constant bus restriction.
4312 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4313 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4314 MachineInstr *DefMI;
4315
4316 int64_t Imm;
4317 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4318 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4319 if (pseudoToMCOpcode(NewOpc) != -1) {
4320 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4321 .add(*Dst)
4322 .add(*Src0)
4323 .add(*Src1)
4324 .addImm(Imm)
4325 .setMIFlags(MI.getFlags());
4326 U.RemoveMIUse = DefMI;
4327 return MIB;
4328 }
4329 }
4330 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4331 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4332 if (pseudoToMCOpcode(NewOpc) != -1) {
4333 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4334 .add(*Dst)
4335 .add(*Src0)
4336 .addImm(Imm)
4337 .add(*Src2)
4338 .setMIFlags(MI.getFlags());
4339 U.RemoveMIUse = DefMI;
4340 return MIB;
4341 }
4342 }
4343 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4344 if (Src0Literal) {
4345 Imm = Src0->getImm();
4346 DefMI = nullptr;
4347 }
4348 if (pseudoToMCOpcode(NewOpc) != -1 &&
4350 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4351 Src1)) {
4352 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4353 .add(*Dst)
4354 .add(*Src1)
4355 .addImm(Imm)
4356 .add(*Src2)
4357 .setMIFlags(MI.getFlags());
4358 U.RemoveMIUse = DefMI;
4359 return MIB;
4360 }
4361 }
4362 }
4363
4364 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4365 // if VOP3 does not allow a literal operand.
4366 if (Src0Literal && !ST.hasVOP3Literal())
4367 return nullptr;
4368
4369 unsigned NewOpc = getNewFMAInst(ST, Opc);
4370
4371 if (pseudoToMCOpcode(NewOpc) == -1)
4372 return nullptr;
4373
4374 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4375 .add(*Dst)
4376 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4377 .add(*Src0)
4378 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4379 .add(*Src1)
4380 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4381 .add(*Src2)
4382 .addImm(Clamp ? Clamp->getImm() : 0)
4383 .addImm(Omod ? Omod->getImm() : 0)
4384 .setMIFlags(MI.getFlags());
4385 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4386 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4387 return MIB;
4388}
4389
4390// It's not generally safe to move VALU instructions across these since it will
4391// start using the register as a base index rather than directly.
4392// XXX - Why isn't hasSideEffects sufficient for these?
4394 switch (MI.getOpcode()) {
4395 case AMDGPU::S_SET_GPR_IDX_ON:
4396 case AMDGPU::S_SET_GPR_IDX_MODE:
4397 case AMDGPU::S_SET_GPR_IDX_OFF:
4398 return true;
4399 default:
4400 return false;
4401 }
4402}
4403
4405 const MachineBasicBlock *MBB,
4406 const MachineFunction &MF) const {
4407 // Skipping the check for SP writes in the base implementation. The reason it
4408 // was added was apparently due to compile time concerns.
4409 //
4410 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4411 // but is probably avoidable.
4412
4413 // Copied from base implementation.
4414 // Terminators and labels can't be scheduled around.
4415 if (MI.isTerminator() || MI.isPosition())
4416 return true;
4417
4418 // INLINEASM_BR can jump to another block
4419 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4420 return true;
4421
4422 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4423 return true;
4424
4425 // Target-independent instructions do not have an implicit-use of EXEC, even
4426 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4427 // boundaries prevents incorrect movements of such instructions.
4428 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4429 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4430 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4431 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4432 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4434}
4435
4437 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4438 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4439 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4440}
4441
4443 // Instructions that access scratch use FLAT encoding or BUF encodings.
4444 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4445 return false;
4446
4447 // SCRATCH instructions always access scratch.
4448 if (isFLATScratch(MI))
4449 return true;
4450
4451 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4452 // via the aperture.
4453 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4454 return false;
4455
4456 // If there are no memory operands then conservatively assume the flat
4457 // operation may access scratch.
4458 if (MI.memoperands_empty())
4459 return true;
4460
4461 // See if any memory operand specifies an address space that involves scratch.
4462 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4463 unsigned AS = Memop->getAddrSpace();
4464 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4465 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4466 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4467 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4468 }
4469 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4470 });
4471}
4472
4474 assert(isFLAT(MI));
4475
4476 // All flat instructions use the VMEM counter except prefetch.
4477 if (!usesVM_CNT(MI))
4478 return false;
4479
4480 // If there are no memory operands then conservatively assume the flat
4481 // operation may access VMEM.
4482 if (MI.memoperands_empty())
4483 return true;
4484
4485 // See if any memory operand specifies an address space that involves VMEM.
4486 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4487 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4488 // (GDS) address space is not supported by flat operations. Therefore, simply
4489 // return true unless only the LDS address space is found.
4490 for (const MachineMemOperand *Memop : MI.memoperands()) {
4491 unsigned AS = Memop->getAddrSpace();
4493 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4494 return true;
4495 }
4496
4497 return false;
4498}
4499
4501 assert(isFLAT(MI));
4502
4503 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4504 if (!usesLGKM_CNT(MI))
4505 return false;
4506
4507 // If in tgsplit mode then there can be no use of LDS.
4508 if (ST.isTgSplitEnabled())
4509 return false;
4510
4511 // If there are no memory operands then conservatively assume the flat
4512 // operation may access LDS.
4513 if (MI.memoperands_empty())
4514 return true;
4515
4516 // See if any memory operand specifies an address space that involves LDS.
4517 for (const MachineMemOperand *Memop : MI.memoperands()) {
4518 unsigned AS = Memop->getAddrSpace();
4520 return true;
4521 }
4522
4523 return false;
4524}
4525
4527 // Skip the full operand and register alias search modifiesRegister
4528 // does. There's only a handful of instructions that touch this, it's only an
4529 // implicit def, and doesn't alias any other registers.
4530 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4531}
4532
4534 unsigned Opcode = MI.getOpcode();
4535
4536 if (MI.mayStore() && isSMRD(MI))
4537 return true; // scalar store or atomic
4538
4539 // This will terminate the function when other lanes may need to continue.
4540 if (MI.isReturn())
4541 return true;
4542
4543 // These instructions cause shader I/O that may cause hardware lockups
4544 // when executed with an empty EXEC mask.
4545 //
4546 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4547 // EXEC = 0, but checking for that case here seems not worth it
4548 // given the typical code patterns.
4549 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4550 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4551 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4552 Opcode == AMDGPU::S_SETHALT)
4553 return true;
4554
4555 if (MI.isCall() || MI.isInlineAsm())
4556 return true; // conservative assumption
4557
4558 // Assume that barrier interactions are only intended with active lanes.
4559 if (isBarrier(Opcode))
4560 return true;
4561
4562 // A mode change is a scalar operation that influences vector instructions.
4564 return true;
4565
4566 // These are like SALU instructions in terms of effects, so it's questionable
4567 // whether we should return true for those.
4568 //
4569 // However, executing them with EXEC = 0 causes them to operate on undefined
4570 // data, which we avoid by returning true here.
4571 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4572 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4573 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4574 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4575 return true;
4576
4577 return false;
4578}
4579
4581 const MachineInstr &MI) const {
4582 if (MI.isMetaInstruction())
4583 return false;
4584
4585 // This won't read exec if this is an SGPR->SGPR copy.
4586 if (MI.isCopyLike()) {
4587 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4588 return true;
4589
4590 // Make sure this isn't copying exec as a normal operand
4591 return MI.readsRegister(AMDGPU::EXEC, &RI);
4592 }
4593
4594 // Make a conservative assumption about the callee.
4595 if (MI.isCall())
4596 return true;
4597
4598 // Be conservative with any unhandled generic opcodes.
4599 if (!isTargetSpecificOpcode(MI.getOpcode()))
4600 return true;
4601
4602 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4603}
4604
4605bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4606 switch (Imm.getBitWidth()) {
4607 case 1: // This likely will be a condition code mask.
4608 return true;
4609
4610 case 32:
4611 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4612 ST.hasInv2PiInlineImm());
4613 case 64:
4614 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4615 ST.hasInv2PiInlineImm());
4616 case 16:
4617 return ST.has16BitInsts() &&
4618 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4619 ST.hasInv2PiInlineImm());
4620 default:
4621 llvm_unreachable("invalid bitwidth");
4622 }
4623}
4624
4626 APInt IntImm = Imm.bitcastToAPInt();
4627 int64_t IntImmVal = IntImm.getSExtValue();
4628 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4629 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4630 default:
4631 llvm_unreachable("invalid fltSemantics");
4634 return isInlineConstant(IntImm);
4636 return ST.has16BitInsts() &&
4637 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4639 return ST.has16BitInsts() &&
4640 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4641 }
4642}
4643
4644bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4645 // MachineOperand provides no way to tell the true operand size, since it only
4646 // records a 64-bit value. We need to know the size to determine if a 32-bit
4647 // floating point immediate bit pattern is legal for an integer immediate. It
4648 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4649 switch (OperandType) {
4659 int32_t Trunc = static_cast<int32_t>(Imm);
4660 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4661 }
4669 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4672 // We would expect inline immediates to not be concerned with an integer/fp
4673 // distinction. However, in the case of 16-bit integer operations, the
4674 // "floating point" values appear to not work. It seems to read the low 16 bits
4675 // of 32-bit immediates, which happens to always work for the integer
4676 // values.
4677 //
4678 // See llvm bugzilla 46302.
4679 //
4680 // TODO: Theoretically we could use op-sel to use the high bits of the
4681 // 32-bit FP values.
4690 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4695 return false;
4698 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4699 // A few special case instructions have 16-bit operands on subtargets
4700 // where 16-bit instructions are not legal.
4701 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4702 // constants in these cases
4703 int16_t Trunc = static_cast<int16_t>(Imm);
4704 return ST.has16BitInsts() &&
4705 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4706 }
4707
4708 return false;
4709 }
4712 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4713 int16_t Trunc = static_cast<int16_t>(Imm);
4714 return ST.has16BitInsts() &&
4715 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4716 }
4717 return false;
4718 }
4722 return false;
4724 return isLegalAV64PseudoImm(Imm);
4727 // Always embedded in the instruction for free.
4728 return true;
4738 // Just ignore anything else.
4739 return true;
4740 default:
4741 llvm_unreachable("invalid operand type");
4742 }
4743}
4744
4745static bool compareMachineOp(const MachineOperand &Op0,
4746 const MachineOperand &Op1) {
4747 if (Op0.getType() != Op1.getType())
4748 return false;
4749
4750 switch (Op0.getType()) {
4752 return Op0.getReg() == Op1.getReg();
4754 return Op0.getImm() == Op1.getImm();
4755 default:
4756 llvm_unreachable("Didn't expect to be comparing these operand types");
4757 }
4758}
4759
4761 const MCOperandInfo &OpInfo) const {
4762 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4763 return true;
4764
4765 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4766 return false;
4767
4768 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4769 return true;
4770
4771 return ST.hasVOP3Literal();
4772}
4773
4774bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4775 int64_t ImmVal) const {
4776 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4777 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4778 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4779 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4780 AMDGPU::OpName::src2))
4781 return false;
4782 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4783 }
4784
4785 return isLiteralOperandLegal(InstDesc, OpInfo);
4786}
4787
4788bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4789 const MachineOperand &MO) const {
4790 if (MO.isImm())
4791 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4792
4793 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4794 "unexpected imm-like operand kind");
4795 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4796 return isLiteralOperandLegal(InstDesc, OpInfo);
4797}
4798
4800 // 2 32-bit inline constants packed into one.
4801 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4802 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4803}
4804
4805bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4806 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4807 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4808 return false;
4809
4810 int Op32 = AMDGPU::getVOPe32(Opcode);
4811 if (Op32 == -1)
4812 return false;
4813
4814 return pseudoToMCOpcode(Op32) != -1;
4815}
4816
4817bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4818 // The src0_modifier operand is present on all instructions
4819 // that have modifiers.
4820
4821 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4822}
4823
4825 AMDGPU::OpName OpName) const {
4826 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4827 return Mods && Mods->getImm();
4828}
4829
4831 return any_of(ModifierOpNames,
4832 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4833}
4834
4836 const MachineRegisterInfo &MRI) const {
4837 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4838 // Can't shrink instruction with three operands.
4839 if (Src2) {
4840 switch (MI.getOpcode()) {
4841 default: return false;
4842
4843 case AMDGPU::V_ADDC_U32_e64:
4844 case AMDGPU::V_SUBB_U32_e64:
4845 case AMDGPU::V_SUBBREV_U32_e64: {
4846 const MachineOperand *Src1
4847 = getNamedOperand(MI, AMDGPU::OpName::src1);
4848 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4849 return false;
4850 // Additional verification is needed for sdst/src2.
4851 return true;
4852 }
4853 case AMDGPU::V_MAC_F16_e64:
4854 case AMDGPU::V_MAC_F32_e64:
4855 case AMDGPU::V_MAC_LEGACY_F32_e64:
4856 case AMDGPU::V_FMAC_F16_e64:
4857 case AMDGPU::V_FMAC_F16_t16_e64:
4858 case AMDGPU::V_FMAC_F16_fake16_e64:
4859 case AMDGPU::V_FMAC_F32_e64:
4860 case AMDGPU::V_FMAC_F64_e64:
4861 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4862 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4863 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4864 return false;
4865 break;
4866
4867 case AMDGPU::V_CNDMASK_B32_e64:
4868 break;
4869 }
4870 }
4871
4872 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4873 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4874 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4875 return false;
4876
4877 // We don't need to check src0, all input types are legal, so just make sure
4878 // src0 isn't using any modifiers.
4879 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4880 return false;
4881
4882 // Can it be shrunk to a valid 32 bit opcode?
4883 if (!hasVALU32BitEncoding(MI.getOpcode()))
4884 return false;
4885
4886 // Check output modifiers
4887 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4888 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4889 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4890 // TODO: Can we avoid checking bound_ctrl/fi here?
4891 // They are only used by permlane*_swap special case.
4892 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4893 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4894}
4895
4896// Set VCC operand with all flags from \p Orig, except for setting it as
4897// implicit.
4899 const MachineOperand &Orig) {
4900
4901 for (MachineOperand &Use : MI.implicit_operands()) {
4902 if (Use.isUse() &&
4903 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4904 Use.setIsUndef(Orig.isUndef());
4905 Use.setIsKill(Orig.isKill());
4906 return;
4907 }
4908 }
4909}
4910
4912 unsigned Op32) const {
4913 MachineBasicBlock *MBB = MI.getParent();
4914
4915 const MCInstrDesc &Op32Desc = get(Op32);
4916 MachineInstrBuilder Inst32 =
4917 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4918 .setMIFlags(MI.getFlags());
4919
4920 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4921 // For VOPC instructions, this is replaced by an implicit def of vcc.
4922
4923 // We assume the defs of the shrunk opcode are in the same order, and the
4924 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4925 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4926 Inst32.add(MI.getOperand(I));
4927
4928 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4929
4930 int Idx = MI.getNumExplicitDefs();
4931 for (const MachineOperand &Use : MI.explicit_uses()) {
4932 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4934 continue;
4935
4936 if (&Use == Src2) {
4937 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4938 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4939 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4940 // of vcc was already added during the initial BuildMI, but we
4941 // 1) may need to change vcc to vcc_lo to preserve the original register
4942 // 2) have to preserve the original flags.
4943 copyFlagsToImplicitVCC(*Inst32, *Src2);
4944 continue;
4945 }
4946 }
4947
4948 Inst32.add(Use);
4949 }
4950
4951 // FIXME: Losing implicit operands
4952 fixImplicitOperands(*Inst32);
4953 return Inst32;
4954}
4955
4957 // Null is free
4958 Register Reg = RegOp.getReg();
4959 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4960 return false;
4961
4962 // SGPRs use the constant bus
4963
4964 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4965 // physical register operands should also count, except for exec.
4966 if (RegOp.isImplicit())
4967 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4968
4969 // SGPRs use the constant bus
4970 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4971 AMDGPU::SReg_64RegClass.contains(Reg);
4972}
4973
4975 const MachineRegisterInfo &MRI) const {
4976 Register Reg = RegOp.getReg();
4977 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4978 : physRegUsesConstantBus(RegOp);
4979}
4980
4982 const MachineOperand &MO,
4983 const MCOperandInfo &OpInfo) const {
4984 // Literal constants use the constant bus.
4985 if (!MO.isReg())
4986 return !isInlineConstant(MO, OpInfo);
4987
4988 Register Reg = MO.getReg();
4989 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4991}
4992
4994 for (const MachineOperand &MO : MI.implicit_operands()) {
4995 // We only care about reads.
4996 if (MO.isDef())
4997 continue;
4998
4999 switch (MO.getReg()) {
5000 case AMDGPU::VCC:
5001 case AMDGPU::VCC_LO:
5002 case AMDGPU::VCC_HI:
5003 case AMDGPU::M0:
5004 case AMDGPU::FLAT_SCR:
5005 return MO.getReg();
5006
5007 default:
5008 break;
5009 }
5010 }
5011
5012 return Register();
5013}
5014
// Decide whether \p MI is expected to read exec.
//
// VALU instructions (LDS DMA included, per the AllowLDSDMA argument) yield
// true, except for the four lane read/write and SGPR<->VGPR spill/restore
// opcodes listed in the switch. Any other instruction yields false when the
// second condition matches and true otherwise.
5015static bool shouldReadExec(const MachineInstr &MI) {
5016 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true)) {
5017 switch (MI.getOpcode()) {
// These opcodes are the only VALU instructions exempted here.
5018 case AMDGPU::V_READLANE_B32:
5019 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5020 case AMDGPU::V_WRITELANE_B32:
5021 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5022 return false;
5023 }
5024
5025 return true;
5026 }
5027
// Pre-ISel and generic opcodes are exempt.
// NOTE(review): the remaining operands of this condition are not visible in
// this listing - confirm against the full source before relying on this.
5028 if (MI.isPreISelOpcode() ||
5029 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5032 return false;
5033
5034 return true;
5035}
5036
5037static bool isRegOrFI(const MachineOperand &MO) {
5038 return MO.isReg() || MO.isFI();
5039}
5040
5041static bool isSubRegOf(const SIRegisterInfo &TRI,
5042 const MachineOperand &SuperVec,
5043 const MachineOperand &SubReg) {
5044 if (SubReg.getReg().isPhysical())
5045 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5046
5047 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5048 SubReg.getReg() == SuperVec.getReg();
5049}
5050
5051// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5052bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5053 const MachineRegisterInfo &MRI,
5054 StringRef &ErrInfo) const {
5055 Register DstReg = MI.getOperand(0).getReg();
5056 Register SrcReg = MI.getOperand(1).getReg();
5057 // This is a check for copy from vector register to SGPR
5058 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5059 ErrInfo = "illegal copy from vector register to SGPR";
5060 return false;
5061 }
5062 return true;
5063}
5064
5066 StringRef &ErrInfo) const {
5067 uint32_t Opcode = MI.getOpcode();
5068 const MachineFunction *MF = MI.getMF();
5069 const MachineRegisterInfo &MRI = MF->getRegInfo();
5070
5071 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5072 // Find a better property to recognize the point where instruction selection
5073 // is just done.
5074 // We can only enforce this check after SIFixSGPRCopies pass so that the
5075 // illegal copies are legalized and thereafter we don't expect a pass
5076 // inserting similar copies.
5077 if (!MRI.isSSA() && MI.isCopy())
5078 return verifyCopy(MI, MRI, ErrInfo);
5079
5080 if (SIInstrInfo::isGenericOpcode(Opcode))
5081 return true;
5082
5083 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5084 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5085 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5086 int Src3Idx = -1;
5087 if (Src0Idx == -1) {
5088 // VOPD V_DUAL_* instructions use different operand names.
5089 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5090 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5091 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5092 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5093 }
5094
5095 // Make sure the number of operands is correct.
5096 const MCInstrDesc &Desc = get(Opcode);
5097 if (!Desc.isVariadic() &&
5098 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5099 ErrInfo = "Instruction has wrong number of operands.";
5100 return false;
5101 }
5102
5103 if (MI.isInlineAsm()) {
5104 // Verify register classes for inlineasm constraints.
5105 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5106 I != E; ++I) {
5107 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5108 if (!RC)
5109 continue;
5110
5111 const MachineOperand &Op = MI.getOperand(I);
5112 if (!Op.isReg())
5113 continue;
5114
5115 Register Reg = Op.getReg();
5116 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5117 ErrInfo = "inlineasm operand has incorrect register class.";
5118 return false;
5119 }
5120 }
5121
5122 return true;
5123 }
5124
5125 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5126 ErrInfo = "missing memory operand from image instruction.";
5127 return false;
5128 }
5129
5130 // Make sure the register classes are correct.
5131 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5132 const MachineOperand &MO = MI.getOperand(i);
5133 if (MO.isFPImm()) {
5134 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5135 "all fp values to integers.";
5136 return false;
5137 }
5138
5139 const MCOperandInfo &OpInfo = Desc.operands()[i];
5140 int16_t RegClass = getOpRegClassID(OpInfo);
5141
5142 switch (OpInfo.OperandType) {
5144 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5145 ErrInfo = "Illegal immediate value for operand.";
5146 return false;
5147 }
5148 break;
5160 break;
5162 break;
5163 break;
5177 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5178 ErrInfo = "Illegal immediate value for operand.";
5179 return false;
5180 }
5181 break;
5182 }
5187 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5188 !isInlineConstant(MI, i) &&
5190 OpInfo.OperandType ==
5192 ErrInfo = "illegal 64-bit immediate value for operand.";
5193 return false;
5194 }
5195 break;
5198 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5199 ErrInfo = "Expected inline constant for operand.";
5200 return false;
5201 }
5202 break;
5205 break;
5210 // Check if this operand is an immediate.
5211 // FrameIndex operands will be replaced by immediates, so they are
5212 // allowed.
5213 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5214 ErrInfo = "Expected immediate, but got non-immediate";
5215 return false;
5216 }
5217 break;
5221 break;
5222 default:
5223 if (OpInfo.isGenericType())
5224 continue;
5225 break;
5226 }
5227
5228 if (!MO.isReg())
5229 continue;
5230 Register Reg = MO.getReg();
5231 if (!Reg)
5232 continue;
5233
5234 // FIXME: Ideally we would have separate instruction definitions with the
5235 // aligned register constraint.
5236 // FIXME: We do not verify inline asm operands, but custom inline asm
5237 // verification is broken anyway
5238 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5239 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5240 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5241 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5242 if (const TargetRegisterClass *SubRC =
5243 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5244 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5245 if (RC)
5246 RC = SubRC;
5247 }
5248 }
5249
5250 // Check that this is the aligned version of the class.
5251 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5252 ErrInfo = "Subtarget requires even aligned vector registers";
5253 return false;
5254 }
5255 }
5256
5257 if (RegClass != -1) {
5258 if (Reg.isVirtual())
5259 continue;
5260
5261 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5262 if (!RC->contains(Reg)) {
5263 ErrInfo = "Operand has incorrect register class.";
5264 return false;
5265 }
5266 }
5267 }
5268
5269 // Verify SDWA
5270 if (isSDWA(MI)) {
5271 if (!ST.hasSDWA()) {
5272 ErrInfo = "SDWA is not supported on this target";
5273 return false;
5274 }
5275
5276 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5277 AMDGPU::OpName::dst_sel}) {
5278 const MachineOperand *MO = getNamedOperand(MI, Op);
5279 if (!MO)
5280 continue;
5281 int64_t Imm = MO->getImm();
5282 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5283 ErrInfo = "Invalid SDWA selection";
5284 return false;
5285 }
5286 }
5287
5288 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5289
5290 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5291 if (OpIdx == -1)
5292 continue;
5293 const MachineOperand &MO = MI.getOperand(OpIdx);
5294
5295 if (!ST.hasSDWAScalar()) {
5296 // Only VGPRS on VI
5297 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5298 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5299 return false;
5300 }
5301 } else {
5302 // No immediates on GFX9
5303 if (!MO.isReg()) {
5304 ErrInfo =
5305 "Only reg allowed as operands in SDWA instructions on GFX9+";
5306 return false;
5307 }
5308 }
5309 }
5310
5311 if (!ST.hasSDWAOmod()) {
5312 // No omod allowed on VI
5313 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5314 if (OMod != nullptr &&
5315 (!OMod->isImm() || OMod->getImm() != 0)) {
5316 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5317 return false;
5318 }
5319 }
5320
5321 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5322 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5323 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5324 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5325 const MachineOperand *Src0ModsMO =
5326 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5327 unsigned Mods = Src0ModsMO->getImm();
5328 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5329 Mods & SISrcMods::SEXT) {
5330 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5331 return false;
5332 }
5333 }
5334
5335 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5336 if (isVOPC(BasicOpcode)) {
5337 if (!ST.hasSDWASdst() && DstIdx != -1) {
5338 // Only vcc allowed as dst on VI for VOPC
5339 const MachineOperand &Dst = MI.getOperand(DstIdx);
5340 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5341 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5342 return false;
5343 }
5344 } else if (!ST.hasSDWAOutModsVOPC()) {
5345 // No clamp allowed on GFX9 for VOPC
5346 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5347 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5348 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5349 return false;
5350 }
5351
5352 // No omod allowed on GFX9 for VOPC
5353 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5354 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5355 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5356 return false;
5357 }
5358 }
5359 }
5360
5361 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5362 if (DstUnused && DstUnused->isImm() &&
5363 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5364 const MachineOperand &Dst = MI.getOperand(DstIdx);
5365 if (!Dst.isReg() || !Dst.isTied()) {
5366 ErrInfo = "Dst register should have tied register";
5367 return false;
5368 }
5369
5370 const MachineOperand &TiedMO =
5371 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5372 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5373 ErrInfo =
5374 "Dst register should be tied to implicit use of preserved register";
5375 return false;
5376 }
5377 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5378 ErrInfo = "Dst register should use same physical register as preserved";
5379 return false;
5380 }
5381 }
5382 }
5383
5384 if (isDPP(MI) && !ST.hasDPPSrc1SGPR() && Src1Idx != -1) {
5385 const MachineOperand &Src1MO = MI.getOperand(Src1Idx);
5386 if (Src1MO.isReg() && RI.isSGPRReg(MRI, Src1MO.getReg())) {
5387 ErrInfo = "DPP src1 cannot be SGPR on this subtarget";
5388 return false;
5389 }
5390 }
5391
5392 // Verify MIMG / VIMAGE / VSAMPLE
5393 if (isImage(Opcode) && !MI.mayStore()) {
5394 // Ensure that the return type used is large enough for all the options
5395 // being used TFE/LWE require an extra result register.
5396 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5397 if (DMask) {
5398 uint64_t DMaskImm = DMask->getImm();
5399 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5400 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5401 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5402 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5403
5404 // Adjust for packed 16 bit values
5405 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5406 RegCount = divideCeil(RegCount, 2);
5407
5408 // Adjust if using LWE or TFE
5409 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5410 RegCount += 1;
5411
5412 const uint32_t DstIdx =
5413 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5414 const MachineOperand &Dst = MI.getOperand(DstIdx);
5415 if (Dst.isReg()) {
5416 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5417 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5418 if (RegCount > DstSize) {
5419 ErrInfo = "Image instruction returns too many registers for dst "
5420 "register class";
5421 return false;
5422 }
5423 }
5424 }
5425 }
5426
5427 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5428 if (isVALU(MI, /*AllowLDSDMA=*/true) &&
5429 Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5430 unsigned ConstantBusCount = 0;
5431 bool UsesLiteral = false;
5432 const MachineOperand *LiteralVal = nullptr;
5433
5434 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5435 if (ImmIdx != -1) {
5436 ++ConstantBusCount;
5437 UsesLiteral = true;
5438 LiteralVal = &MI.getOperand(ImmIdx);
5439 }
5440
5441 SmallVector<Register, 2> SGPRsUsed;
5442 Register SGPRUsed;
5443
5444 // Only look at the true operands. Only a real operand can use the constant
5445 // bus, and we don't want to check pseudo-operands like the source modifier
5446 // flags.
5447 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5448 if (OpIdx == -1)
5449 continue;
5450 const MachineOperand &MO = MI.getOperand(OpIdx);
5451 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5452 if (MO.isReg()) {
5453 SGPRUsed = MO.getReg();
5454 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5455 ++ConstantBusCount;
5456 SGPRsUsed.push_back(SGPRUsed);
5457 }
5458 } else if (!MO.isFI()) { // Treat FI like a register.
5459 if (!UsesLiteral) {
5460 ++ConstantBusCount;
5461 UsesLiteral = true;
5462 LiteralVal = &MO;
5463 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5464 assert(isVOP2(MI) || isVOP3(MI));
5465 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5466 return false;
5467 }
5468 }
5469 }
5470 }
5471
5472 SGPRUsed = findImplicitSGPRRead(MI);
5473 if (SGPRUsed) {
5474 // Implicit uses may safely overlap true operands
5475 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5476 return !RI.regsOverlap(SGPRUsed, SGPR);
5477 })) {
5478 ++ConstantBusCount;
5479 SGPRsUsed.push_back(SGPRUsed);
5480 }
5481 }
5482
5483 // v_writelane_b32 is an exception from constant bus restriction:
5484 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5485 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5486 Opcode != AMDGPU::V_WRITELANE_B32) {
5487 ErrInfo = "VOP* instruction violates constant bus restriction";
5488 return false;
5489 }
5490
5491 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5492 ErrInfo = "VOP3 instruction uses literal";
5493 return false;
5494 }
5495 }
5496
5497 // Special case for writelane - this can break the multiple constant bus rule,
5498 // but still can't use more than one SGPR register
5499 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5500 unsigned SGPRCount = 0;
5501 Register SGPRUsed;
5502
5503 for (int OpIdx : {Src0Idx, Src1Idx}) {
5504 if (OpIdx == -1)
5505 break;
5506
5507 const MachineOperand &MO = MI.getOperand(OpIdx);
5508
5509 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5510 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5511 if (MO.getReg() != SGPRUsed)
5512 ++SGPRCount;
5513 SGPRUsed = MO.getReg();
5514 }
5515 }
5516 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5517 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5518 return false;
5519 }
5520 }
5521 }
5522
5523 // Verify misc. restrictions on specific instructions.
5524 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5525 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5526 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5527 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5528 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5529 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5530 if (!compareMachineOp(Src0, Src1) &&
5531 !compareMachineOp(Src0, Src2)) {
5532 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5533 return false;
5534 }
5535 }
5536 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5537 SISrcMods::ABS) ||
5538 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5539 SISrcMods::ABS) ||
5540 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5541 SISrcMods::ABS)) {
5542 ErrInfo = "ABS not allowed in VOP3B instructions";
5543 return false;
5544 }
5545 }
5546
5547 if (isSOP2(MI) || isSOPC(MI)) {
5548 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5549 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5550
5551 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5552 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5553 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5554 !Src0.isIdenticalTo(Src1)) {
5555 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5556 return false;
5557 }
5558 }
5559
5560 if (isSOPK(MI)) {
5561 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5562 if (Desc.isBranch()) {
5563 if (!Op->isMBB()) {
5564 ErrInfo = "invalid branch target for SOPK instruction";
5565 return false;
5566 }
5567 } else {
5568 uint64_t Imm = Op->getImm();
5569 if (sopkIsZext(Opcode)) {
5570 if (!isUInt<16>(Imm)) {
5571 ErrInfo = "invalid immediate for SOPK instruction";
5572 return false;
5573 }
5574 } else {
5575 if (!isInt<16>(Imm)) {
5576 ErrInfo = "invalid immediate for SOPK instruction";
5577 return false;
5578 }
5579 }
5580 }
5581 }
5582
5583 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5584 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5585 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5586 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5587 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5588 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5589
5590 const unsigned StaticNumOps =
5591 Desc.getNumOperands() + Desc.implicit_uses().size();
5592 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5593
5594 // Require additional implicit operands. This allows a fixup done by the
5595 // post RA scheduler where the main implicit operand is killed and
5596 // implicit-defs are added for sub-registers that remain live after this
5597 // instruction.
5598 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5599 ErrInfo = "missing implicit register operands";
5600 return false;
5601 }
5602
5603 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5604 if (IsDst) {
5605 if (!Dst->isUse()) {
5606 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5607 return false;
5608 }
5609
5610 unsigned UseOpIdx;
5611 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5612 UseOpIdx != StaticNumOps + 1) {
5613 ErrInfo = "movrel implicit operands should be tied";
5614 return false;
5615 }
5616 }
5617
5618 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5619 const MachineOperand &ImpUse
5620 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5621 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5622 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5623 ErrInfo = "src0 should be subreg of implicit vector use";
5624 return false;
5625 }
5626 }
5627
5628 // Make sure we aren't losing exec uses in the td files. This mostly requires
5629 // being careful when using let Uses to try to add other use registers.
5630 if (shouldReadExec(MI)) {
5631 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5632 ErrInfo = "VALU instruction does not implicitly read exec mask";
5633 return false;
5634 }
5635 }
5636
5637 if (isSMRD(MI)) {
5638 if (MI.mayStore() &&
5639 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5640 // The register offset form of scalar stores may only use m0 as the
5641 // soffset register.
5642 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5643 if (Soff && Soff->getReg() != AMDGPU::M0) {
5644 ErrInfo = "scalar stores must use m0 as offset register";
5645 return false;
5646 }
5647 }
5648 }
5649
5650 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5651 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5652 if (Offset->getImm() != 0) {
5653 ErrInfo = "subtarget does not support offsets in flat instructions";
5654 return false;
5655 }
5656 }
5657
5658 if (isDS(MI) && !ST.hasGDS()) {
5659 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5660 if (GDSOp && GDSOp->getImm() != 0) {
5661 ErrInfo = "GDS is not supported on this subtarget";
5662 return false;
5663 }
5664 }
5665
5666 if (isImage(MI)) {
5667 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5668 if (DimOp) {
5669 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5670 AMDGPU::OpName::vaddr0);
5671 AMDGPU::OpName RSrcOpName =
5672 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5673 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5674 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5675 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5676 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5677 const AMDGPU::MIMGDimInfo *Dim =
5679
5680 if (!Dim) {
5681 ErrInfo = "dim is out of range";
5682 return false;
5683 }
5684
5685 bool IsA16 = false;
5686 if (ST.hasR128A16()) {
5687 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5688 IsA16 = R128A16->getImm() != 0;
5689 } else if (ST.hasA16()) {
5690 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5691 IsA16 = A16->getImm() != 0;
5692 }
5693
5694 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5695
5696 unsigned AddrWords =
5697 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5698
5699 unsigned VAddrWords;
5700 if (IsNSA) {
5701 VAddrWords = RsrcIdx - VAddr0Idx;
5702 if (ST.hasPartialNSAEncoding() &&
5703 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5704 unsigned LastVAddrIdx = RsrcIdx - 1;
5705 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5706 }
5707 } else {
5708 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5709 if (AddrWords > 12)
5710 AddrWords = 16;
5711 }
5712
5713 if (VAddrWords != AddrWords) {
5714 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5715 << " but got " << VAddrWords << "\n");
5716 ErrInfo = "bad vaddr size";
5717 return false;
5718 }
5719 }
5720 }
5721
5722 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5723 if (DppCt) {
5724 using namespace AMDGPU::DPP;
5725
5726 unsigned DC = DppCt->getImm();
5727 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5728 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5729 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5730 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5731 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5732 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5733 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5734 ErrInfo = "Invalid dpp_ctrl value";
5735 return false;
5736 }
5737 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5738 !ST.hasDPPWavefrontShifts()) {
5739 ErrInfo = "Invalid dpp_ctrl value: "
5740 "wavefront shifts are not supported on GFX10+";
5741 return false;
5742 }
5743 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5744 !ST.hasDPPBroadcasts()) {
5745 ErrInfo = "Invalid dpp_ctrl value: "
5746 "broadcasts are not supported on GFX10+";
5747 return false;
5748 }
5749 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5750 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5751 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5752 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5753 !ST.hasGFX90AInsts()) {
5754 ErrInfo = "Invalid dpp_ctrl value: "
5755 "row_newbroadcast/row_share is not supported before "
5756 "GFX90A/GFX10";
5757 return false;
5758 }
5759 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5760 ErrInfo = "Invalid dpp_ctrl value: "
5761 "row_share and row_xmask are not supported before GFX10";
5762 return false;
5763 }
5764 }
5765
5766 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5768 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5769 ErrInfo = "Invalid dpp_ctrl value: "
5770 "DP ALU dpp only support row_newbcast";
5771 return false;
5772 }
5773 }
5774
5775 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5776 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5777 AMDGPU::OpName DataName =
5778 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5779 const MachineOperand *Data = getNamedOperand(MI, DataName);
5780 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5781 if (Data && !Data->isReg())
5782 Data = nullptr;
5783
5784 if (ST.hasGFX90AInsts()) {
5785 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5786 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5787 ErrInfo = "Invalid register class: "
5788 "vdata and vdst should be both VGPR or AGPR";
5789 return false;
5790 }
5791 if (Data && Data2 &&
5792 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5793 ErrInfo = "Invalid register class: "
5794 "both data operands should be VGPR or AGPR";
5795 return false;
5796 }
5797 } else {
5798 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5799 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5800 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5801 ErrInfo = "Invalid register class: "
5802 "agpr loads and stores not supported on this GPU";
5803 return false;
5804 }
5805 }
5806 }
5807
5808 if (ST.needsAlignedVGPRs()) {
5809 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5811 if (!Op)
5812 return true;
5813 Register Reg = Op->getReg();
5814 if (Reg.isPhysical())
5815 return !(RI.getHWRegIndex(Reg) & 1);
5816 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5817 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5818 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5819 };
5820
5821 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5822 Opcode == AMDGPU::DS_GWS_BARRIER) {
5823
5824 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5825 ErrInfo = "Subtarget requires even aligned vector registers "
5826 "for DS_GWS instructions";
5827 return false;
5828 }
5829 }
5830
5831 if (isMIMG(MI)) {
5832 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5833 ErrInfo = "Subtarget requires even aligned vector registers "
5834 "for vaddr operand of image instructions";
5835 return false;
5836 }
5837 }
5838 }
5839
5840 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5841 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5842 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5843 ErrInfo = "Invalid register class: "
5844 "v_accvgpr_write with an SGPR is not supported on this GPU";
5845 return false;
5846 }
5847 }
5848
5849 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5850 const MachineOperand &SrcOp = MI.getOperand(1);
5851 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5852 ErrInfo = "pseudo expects only physical SGPRs";
5853 return false;
5854 }
5855 }
5856
5857 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5858 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5859 if (!ST.hasScaleOffset()) {
5860 ErrInfo = "Subtarget does not support offset scaling";
5861 return false;
5862 }
5863 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5864 ErrInfo = "Instruction does not support offset scaling";
5865 return false;
5866 }
5867 }
5868 }
5869
5870 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
5871 // information.
5873 for (unsigned I = 0; I < 3; ++I) {
5875 return false;
5876 }
5877 }
5878
5879 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5880 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5881 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5882 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5883 &AMDGPU::SReg_64RegClass) ||
5884 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5885 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5886 return false;
5887 }
5888 }
5889
5890 return true;
5891}
5892
// Instruction-level opcode mapping: S_MOV_B32 is special-cased below, every
// other opcode is forwarded to getVALUOp(unsigned).
// NOTE(review): the line declaring this function is absent from this listing.
5894 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5895 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
// `||` binds tighter than `?:`: COPY is chosen when operand 1 is a register
// OR operand 0's register is an AGPR; otherwise V_MOV_B32_e32 is used.
5896 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5897 ? AMDGPU::COPY
5898 : AMDGPU::V_MOV_B32_e32;
5899 }
5900 return getVALUOp(MI.getOpcode());
5901}
5902
5903// It is more readable to list mapped opcodes on the same line.
5904// clang-format off
5905
5906unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5907 switch (Opc) {
5908 default: return AMDGPU::INSTRUCTION_LIST_END;
5909 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5910 case AMDGPU::COPY: return AMDGPU::COPY;
5911 case AMDGPU::PHI: return AMDGPU::PHI;
5912 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5913 case AMDGPU::WQM: return AMDGPU::WQM;
5914 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5915 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5916 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5917 case AMDGPU::S_ADD_I32:
5918 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5919 case AMDGPU::S_ADDC_U32:
5920 return AMDGPU::V_ADDC_U32_e32;
5921 case AMDGPU::S_SUB_I32:
5922 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5923 // FIXME: These are not consistently handled, and selected when the carry is
5924 // used.
5925 case AMDGPU::S_ADD_U32:
5926 return AMDGPU::V_ADD_CO_U32_e32;
5927 case AMDGPU::S_SUB_U32:
5928 return AMDGPU::V_SUB_CO_U32_e32;
5929 case AMDGPU::S_ADD_U64_PSEUDO:
5930 return AMDGPU::V_ADD_U64_PSEUDO;
5931 case AMDGPU::S_SUB_U64_PSEUDO:
5932 return AMDGPU::V_SUB_U64_PSEUDO;
5933 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5934 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5935 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5936 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5937 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5938 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5939 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5940 case AMDGPU::S_XNOR_B32:
5941 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5942 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5943 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5944 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5945 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5946 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5947 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5948 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5949 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5950 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5951 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5952 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5953 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5954 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5955 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5956 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5957 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5958 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5959 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5960 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5961 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5962 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5963 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5964 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5965 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5966 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5967 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5968 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5969 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5970 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5971 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5972 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5973 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5974 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5975 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5976 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5977 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5978 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5979 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5980 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5981 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5982 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5983 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5984 case AMDGPU::S_CVT_F32_F16:
5985 case AMDGPU::S_CVT_HI_F32_F16:
5986 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5987 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5988 case AMDGPU::S_CVT_F16_F32:
5989 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5990 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5991 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5992 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5993 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5994 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5995 case AMDGPU::S_CEIL_F16:
5996 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5997 : AMDGPU::V_CEIL_F16_fake16_e64;
5998 case AMDGPU::S_FLOOR_F16:
5999 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6000 : AMDGPU::V_FLOOR_F16_fake16_e64;
6001 case AMDGPU::S_TRUNC_F16:
6002 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6003 : AMDGPU::V_TRUNC_F16_fake16_e64;
6004 case AMDGPU::S_RNDNE_F16:
6005 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6006 : AMDGPU::V_RNDNE_F16_fake16_e64;
6007 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6008 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6009 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6010 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6011 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6012 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6013 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6014 case AMDGPU::S_ADD_F16:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6016 : AMDGPU::V_ADD_F16_fake16_e64;
6017 case AMDGPU::S_SUB_F16:
6018 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6019 : AMDGPU::V_SUB_F16_fake16_e64;
6020 case AMDGPU::S_MIN_F16:
6021 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6022 : AMDGPU::V_MIN_F16_fake16_e64;
6023 case AMDGPU::S_MAX_F16:
6024 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6025 : AMDGPU::V_MAX_F16_fake16_e64;
6026 case AMDGPU::S_MINIMUM_F16:
6027 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6028 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6029 case AMDGPU::S_MAXIMUM_F16:
6030 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6031 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6032 case AMDGPU::S_MUL_F16:
6033 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6034 : AMDGPU::V_MUL_F16_fake16_e64;
6035 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6036 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6037 case AMDGPU::S_FMAC_F16:
6038 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6039 : AMDGPU::V_FMAC_F16_fake16_e64;
6040 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6041 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6042 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6043 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6044 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6045 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6046 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6047 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6048 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6049 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6050 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6051 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6052 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6053 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6054 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6055 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6056 case AMDGPU::S_CMP_LT_F16:
6057 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6058 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6059 case AMDGPU::S_CMP_EQ_F16:
6060 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6061 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6062 case AMDGPU::S_CMP_LE_F16:
6063 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6064 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6065 case AMDGPU::S_CMP_GT_F16:
6066 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6067 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6068 case AMDGPU::S_CMP_LG_F16:
6069 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6070 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6071 case AMDGPU::S_CMP_GE_F16:
6072 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6073 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6074 case AMDGPU::S_CMP_O_F16:
6075 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6076 : AMDGPU::V_CMP_O_F16_fake16_e64;
6077 case AMDGPU::S_CMP_U_F16:
6078 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6079 : AMDGPU::V_CMP_U_F16_fake16_e64;
6080 case AMDGPU::S_CMP_NGE_F16:
6081 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6082 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6083 case AMDGPU::S_CMP_NLG_F16:
6084 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6085 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6086 case AMDGPU::S_CMP_NGT_F16:
6087 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6088 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6089 case AMDGPU::S_CMP_NLE_F16:
6090 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6091 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6092 case AMDGPU::S_CMP_NEQ_F16:
6093 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6094 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6095 case AMDGPU::S_CMP_NLT_F16:
6096 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6097 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6098 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6099 case AMDGPU::V_S_EXP_F16_e64:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6101 : AMDGPU::V_EXP_F16_fake16_e64;
6102 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6103 case AMDGPU::V_S_LOG_F16_e64:
6104 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6105 : AMDGPU::V_LOG_F16_fake16_e64;
6106 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6107 case AMDGPU::V_S_RCP_F16_e64:
6108 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6109 : AMDGPU::V_RCP_F16_fake16_e64;
6110 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6111 case AMDGPU::V_S_RSQ_F16_e64:
6112 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6113 : AMDGPU::V_RSQ_F16_fake16_e64;
6114 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6115 case AMDGPU::V_S_SQRT_F16_e64:
6116 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6117 : AMDGPU::V_SQRT_F16_fake16_e64;
6118 }
6120 "Unexpected scalar opcode without corresponding vector one!");
6121}
6122
6123// clang-format on
6124
6128 const DebugLoc &DL, Register Reg,
6129 bool IsSCCLive,
6130 SlotIndexes *Indexes) const {
// Emits, before MBBI in MBB, the instruction(s) that write a saved copy into
// Reg and set the exec register to -1, using one of two sequences depending on
// IsSCCLive. Every instruction created is registered with Indexes when that
// pointer is non-null.
// NOTE(review): the start of the signature and the declaration of `LMC` are
// on lines missing from this listing; LMC supplies MovOpc, OrSaveExecOpc and
// ExecReg below.
6131 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6132 const SIInstrInfo *TII = ST.getInstrInfo();
6134 if (IsSCCLive) {
6135 // Insert two move instructions, one to save the original value of EXEC and
6136 // the other to turn on all bits in EXEC. This is required as we can't use
6137 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
// NOTE(review): the source operand of this first move sits on a line that is
// missing from this listing.
6138 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6140 auto FlipExecMI =
6141 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6142 if (Indexes) {
6143 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6144 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6145 }
6146 } else {
// SCC is not live here, so a single LMC.OrSaveExecOpc instruction with an
// all-ones immediate is emitted instead of the two moves.
6147 auto SaveExec =
6148 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6149 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6150 if (Indexes)
6151 Indexes->insertMachineInstrInMaps(*SaveExec);
6152 }
6153}
6154
6157 const DebugLoc &DL, Register Reg,
6158 SlotIndexes *Indexes) const {
// Emits, before MBBI in MBB, a single LMC.MovOpc instruction that writes Reg
// into LMC.ExecReg. Reg is marked killed by that move. The new instruction is
// registered with Indexes when that pointer is non-null.
// NOTE(review): the start of the signature and the declaration of `LMC` are
// on lines missing from this listing.
6160 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6161 .addReg(Reg, RegState::Kill);
6162 if (Indexes)
6163 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6164}
6165
6169 "Not a whole wave func");
// Scans only the first basic block of MF and returns a pointer to the first
// instruction whose opcode is SI_WHOLE_WAVE_FUNC_SETUP or
// G_AMDGPU_WHOLE_WAVE_FUNC_SETUP. Not finding one is treated as unreachable.
// NOTE(review): the signature and the opening of the assertion above are on
// lines missing from this listing.
6170 MachineBasicBlock &MBB = *MF.begin();
6171 for (MachineInstr &MI : MBB)
6172 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6173 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6174 return &MI;
6175
// NOTE(review): the message names SI_SETUP_WHOLE_WAVE_FUNC, while the opcode
// tested above is spelled SI_WHOLE_WAVE_FUNC_SETUP.
6176 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6177}
6178
6180 unsigned OpNo) const {
// Returns the register class for operand OpNo of MI.
//
// When the instruction descriptor cannot answer (MI is variadic, OpNo lies
// beyond the descriptor's operand list, or the descriptor records RegClass
// == -1), the class is taken from the register currently in the operand:
// MRI's class for a virtual register, the physical base class otherwise.
// Otherwise the class comes from the descriptor via getOpRegClassID, and
// nullptr is returned if that ID is negative.
// NOTE(review): the first line of the signature is missing from this listing.
6181 const MCInstrDesc &Desc = get(MI.getOpcode());
6182 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6183 Desc.operands()[OpNo].RegClass == -1) {
// getReg() is called unconditionally, so this path assumes operand OpNo is a
// register operand.
6184 Register Reg = MI.getOperand(OpNo).getReg();
6185
6186 if (Reg.isVirtual()) {
6187 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6188 return MRI.getRegClass(Reg);
6189 }
6190 return RI.getPhysRegBaseClass(Reg);
6191 }
6192
6193 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6194 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6195}
6196
// Replaces operand OpIdx of MI with a fresh virtual register that is defined
// by a move/copy of the original operand, inserted at iterator `I`.
// NOTE(review): the signature and the declaration of `I` are on lines missing
// from this listing.
6199 MachineBasicBlock *MBB = MI.getParent();
6200 MachineOperand &MO = MI.getOperand(OpIdx);
6201 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6202 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6203 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6204 unsigned Size = RI.getRegSizeInBits(*RC);
// Opcode selection, in priority order:
//   register operand      -> COPY
//   operand class is SGPR -> S_MOV_B64 (64-bit) or S_MOV_B32
//   otherwise             -> V_MOV_B64_PSEUDO (64-bit), V_MOV_B16_t16_e64
//                            (16-bit) or V_MOV_B32_e32
6205 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6206 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6207 : AMDGPU::V_MOV_B32_e32;
6208 if (MO.isReg())
6209 Opcode = AMDGPU::COPY;
6210 else if (RI.isSGPRClass(RC))
6211 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6212
// The destination register is always created in the VGPR class equivalent to
// the operand's class.
6213 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6214 Register Reg = MRI.createVirtualRegister(VRC);
6215 DebugLoc DL = MBB->findDebugLoc(I);
6216 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
// Rewrite the operand in place to use the new register (as a use, not a def).
6217 MO.ChangeToRegister(Reg, false);
6218}
6219
6222 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6223 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
// Returns a register holding sub-register SubIdx of SuperReg.
// A non-virtual SuperReg is resolved directly via RI.getSubReg and no
// instruction is emitted. Otherwise a new virtual register of class SubRC is
// created and defined by a COPY inserted before MI.
// SuperRC is not referenced in the visible body.
// NOTE(review): the start of the signature is missing from this listing.
6224 if (!SuperReg.getReg().isVirtual())
6225 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6226
6227 MachineBasicBlock *MBB = MI->getParent();
6228 const DebugLoc &DL = MI->getDebugLoc();
6229 Register SubReg = MRI.createVirtualRegister(SubRC);
6230
// SuperReg may itself carry a sub-register index; compose it with SubIdx so
// the COPY reads the right lanes of the underlying register.
6231 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6232 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6233 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6234 return SubReg;
6235}
6236
6239 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6240 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
// Returns an operand for the SubIdx part of Op.
// Immediate: sub0 yields the low 32 bits and sub1 the high 32 bits, each as a
// new immediate operand truncated to int32_t; any other index is unreachable.
// Otherwise: delegates to buildExtractSubReg and wraps the resulting register
// in a register operand.
// NOTE(review): the start of the signature (including `MII` and `MRI`) is
// missing from this listing.
6241 if (Op.isImm()) {
6242 if (SubIdx == AMDGPU::sub0)
6243 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6244 if (SubIdx == AMDGPU::sub1)
6245 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6246
6247 llvm_unreachable("Unhandled register index for immediate");
6248 }
6249
6250 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6251 SubIdx, SubRC);
6252 return MachineOperand::CreateReg(SubReg, false);
6253}
6254
6255// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6256void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6257 assert(Inst.getNumExplicitOperands() == 3);
6258 MachineOperand Op1 = Inst.getOperand(1);
6259 Inst.removeOperand(1);
6260 Inst.addOperand(Op1);
6261}
6262
6264 const MCOperandInfo &OpInfo,
6265 const MachineOperand &MO) const {
// Checks whether register operand MO fits the register class that OpInfo
// demands (DRC below). Non-register operands are rejected.
//   physical register        -> must be a member of DRC
//   virtual register + subreg -> a matching super-register class must exist
//   virtual register          -> DRC and the register's class must share a
//                                common sub-class
// NOTE(review): the first line of the signature is missing from this listing.
6266 if (!MO.isReg())
6267 return false;
6268
6269 Register Reg = MO.getReg();
6270
6271 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6272 if (Reg.isPhysical())
6273 return DRC->contains(Reg);
6274
6275 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6276
6277 if (MO.getSubReg()) {
6278 const MachineFunction *MF = MO.getParent()->getMF();
6279 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6280 if (!SuperRC)
6281 return false;
6282 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6283 }
6284
6285 return RI.getCommonSubClass(DRC, RC) != nullptr;
6286}
6287
6289 const MachineOperand &MO) const {
// Instruction-aware legality check for placing register operand MO at index
// OpIdx of MI. On top of the register-class check it applies, in order:
// the GFX12+ packed-math SGPR restriction, the AGPR restrictions for memory
// instructions, the V_ACCVGPR_WRITE_B32 SGPR restriction, the
// flat_scratch_base_hi hazard and the DPP src1 SGPR restriction.
// MO.getReg() is called unconditionally below, so MO is assumed to be a
// register operand.
// NOTE(review): the start of the signature is missing from this listing.
6290 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6291 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6292 unsigned Opc = MI.getOpcode();
6293
6294 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6295 // information.
6296 if (AMDGPU::isPackedFP32or64BitInst(MI.getOpcode()) &&
6297 AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6298 constexpr AMDGPU::OpName OpNames[] = {
6299 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6300
6301 for (auto [I, OpName] : enumerate(OpNames)) {
6302 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
// NOTE(review): the second half of this condition is on a line missing from
// this listing.
6303 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6305 return false;
6306 }
6307 }
6308
6309 if (!isLegalRegOperand(MRI, OpInfo, MO))
6310 return false;
6311
6312 // Check accumulator (AGPR) operands.
6313 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6314 if (IsAGPR && !ST.hasMAIInsts())
6315 return false;
// AGPRs in loads, stores, DS and MIMG instructions are rejected unless the
// subtarget has GFX90A instructions and the reserved registers are frozen.
6316 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6317 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6318 return false;
6319 // Atomics should have both vdst and vdata either vgpr or agpr.
6320 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6321 const int DataIdx = AMDGPU::getNamedOperandIdx(
6322 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6323 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6324 MI.getOperand(DataIdx).isReg() &&
6325 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6326 return false;
6327 if ((int)OpIdx == DataIdx) {
6328 if (VDstIdx != -1 &&
6329 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6330 return false;
6331 // DS instructions with two data operands need data0 and data1 to agree on
6332 // AGPR vs. non-AGPR as well.
6332 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6333 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6334 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6335 return false;
6336 }
6337
6338 // Check V_ACCVGPR_WRITE_B32_e64
6339 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6340 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6341 RI.isSGPRReg(MRI, MO.getReg()))
6342 return false;
6343
// On subtargets with the flat-scratch-hi hazard, a SALU instruction may not
// read SRC_FLAT_SCRATCH_BASE_HI if its sdst is 64 bits wide or if it is
// S_BITCMP0_B64 / S_BITCMP1_B64.
6344 if (ST.hasFlatScratchHiInB64InstHazard() &&
6345 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6346 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6347 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6348 64)
6349 return false;
6350 }
6351 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6352 return false;
6353 }
// Without DPP src1 SGPR support, an SGPR may not be used as src1 of a DPP
// instruction.
6354 if (!ST.hasDPPSrc1SGPR() && isDPP(MI) && RI.isSGPRReg(MRI, MO.getReg()) &&
6355 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1))
6356 return false;
6357
6358 return true;
6359}
6360
6362 const MCOperandInfo &OpInfo,
6363 const MachineOperand &MO) const {
// Register operands are validated by isLegalRegOperand; every other operand
// kind is accepted, and is asserted to be an immediate, target index, frame
// index or global.
// NOTE(review): the first line of the signature is missing from this listing.
6364 if (MO.isReg())
6365 return isLegalRegOperand(MRI, OpInfo, MO);
6366
6367 // Handle non-register types that are treated like immediates.
6368 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6369 return true;
6370}
6371
6373 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6374 const MachineOperand *MO) const {
// Checks source operand number SrcN (0..2) of MI. When MO is null the operand
// currently in MI is inspected; a missing srcN operand counts as legal.
// Anything that is not an SGPR register is legal. An SGPR is legal only if
// the instruction has a matching srcN_modifiers operand in which neither
// OP_SEL_0 nor OP_SEL_1 is set.
// NOTE(review): the first line of the signature is missing from this listing.
6375 constexpr unsigned NumOps = 3;
// Layout: entries [0, NumOps) are the source operands, entries
// [NumOps, 2 * NumOps) the corresponding modifier operands.
6376 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6377 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6378 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6379 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6380
6381 assert(SrcN < NumOps);
6382
6383 if (!MO) {
6384 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6385 if (SrcIdx == -1)
6386 return true;
6387 MO = &MI.getOperand(SrcIdx);
6388 }
6389
6390 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6391 return true;
6392
6393 int ModsIdx =
6394 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
// An SGPR source without a modifiers operand is rejected.
6395 if (ModsIdx == -1)
6396 return false;
6397
6398 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6399 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6400 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6401
6402 return !OpSel && !OpSelHi;
6403}
6404
6406 const MachineOperand *MO) const {
// Decides whether operand MO may be placed at index OpIdx of MI; when MO is
// null the operand currently at OpIdx is checked. The stages are:
//   1. VALU + constant-bus operand: constant-bus and literal limits.
//   2. SALU + non-inline non-register operand: a single (repeatable) literal,
//      and no coexistence with a frame index.
//   3. Inline constants on F16 pseudo-scalar-trans opcodes, if the subtarget
//      forbids them.
//   4. Register operands: register legality for this instruction.
//   5. Immediates on 64-bit operand types: literal encodability.
// NOTE(review): the first line of the signature and the declaration of
// `SGPRsUsed` are on lines missing from this listing.
6407 const MachineFunction &MF = *MI.getMF();
6408 const MachineRegisterInfo &MRI = MF.getRegInfo();
6409 const MCInstrDesc &InstDesc = MI.getDesc();
6410 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6411 int64_t RegClass = getOpRegClassID(OpInfo);
6412 const TargetRegisterClass *DefinedRC =
6413 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6414 if (!MO)
6415 MO = &MI.getOperand(OpIdx);
6416
6417 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6418
6419 if (isVALU(MI, /*AllowLDSDMA=*/true) && !IsInlineConst &&
6420 usesConstantBus(MRI, *MO, OpInfo)) {
6421 const MachineOperand *UsedLiteral = nullptr;
6422
6423 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
// Parsed as (!isVOP3(MI) || ST.hasVOP3Literal()) ? 1 : 0: one literal is
// allowed unless this is a VOP3 instruction on a subtarget without VOP3
// literals.
6424 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6425
6426 // TODO: Be more permissive with frame indexes.
6427 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6428 if (!LiteralLimit--)
6429 return false;
6430
6431 UsedLiteral = MO;
6432 }
6433
6435 if (MO->isReg())
6436 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6437
// Walk every other operand. Each additional distinct constant-bus register
// or distinct literal decrements the remaining budget; the pre-decrement is
// compared against zero, presumably because MO itself already takes one slot.
6438 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6439 if (i == OpIdx)
6440 continue;
6441 const MachineOperand &Op = MI.getOperand(i);
6442 if (Op.isReg()) {
6443 if (Op.isUse()) {
6444 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6445 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6446 if (--ConstantBusLimit <= 0)
6447 return false;
6448 }
6449 }
6450 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6451 !isInlineConstant(Op, InstDesc.operands()[i])) {
6452 // The same literal may be used multiple times.
6453 if (!UsedLiteral)
6454 UsedLiteral = &Op;
6455 else if (UsedLiteral->isIdenticalTo(Op))
6456 continue;
6457
6458 if (!LiteralLimit--)
6459 return false;
6460 if (--ConstantBusLimit <= 0)
6461 return false;
6462 }
6463 }
6464 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6465 // There can be at most one literal operand, but it can be repeated.
6466 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6467 if (i == OpIdx)
6468 continue;
6469 const MachineOperand &Op = MI.getOperand(i);
6470 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6471 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6472 !Op.isIdenticalTo(*MO))
6473 return false;
6474
6475 // Do not fold a non-inlineable and non-register operand into an
6476 // instruction that already has a frame index. The frame index handling
6477 // code could not handle well when a frame index co-exists with another
6478 // non-register operand, unless that operand is an inlineable immediate.
6479 if (Op.isFI())
6480 return false;
6481 }
6482 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6483 isF16PseudoScalarTrans(MI.getOpcode())) {
6484 return false;
6485 }
6486
6487 if (MO->isReg()) {
// With no register class in the descriptor, a register is accepted only for
// operands of type OPERAND_UNKNOWN.
6488 if (!DefinedRC)
6489 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6490 return isLegalRegOperand(MI, OpIdx, *MO);
6491 }
6492
6493 if (MO->isImm()) {
6494 uint64_t Imm = MO->getImm();
6495 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
6496 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP64;
6497 bool Is64BitOp = Is64BitFPOp ||
6498 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6499 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6500 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32 ||
6501 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT64;
6502 if (Is64BitOp &&
6503 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
// A value that is not a valid 32-bit literal is accepted only when the
// subtarget has 64-bit literals and the descriptor's size is 4.
6504 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6505 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6506 return false;
6507
6508 // FIXME: We can use sign extended 64-bit literals, but only for signed
6509 // operands. At the moment we do not know if an operand is signed.
6510 // Such operand will be encoded as its low 32 bits and then either
6511 // correctly sign extended or incorrectly zero extended by HW.
6512 // If 64-bit literals are supported and the literal will be encoded
6513 // as full 64 bit we still can use it.
6514 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6515 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6516 return false;
6517 }
6518 }
6519
6520 // Handle non-register types that are treated like immediates.
6521 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6522
6523 if (!DefinedRC) {
6524 // This operand expects an immediate.
6525 return true;
6526 }
6527
6528 return isImmOperandLegal(MI, OpIdx, *MO);
6529}
6530
// Classifies MI. Returns false unless the subtarget has GFX950 or GFX940
// instructions and MI is a VALU instruction; within that set, returns true for
// TRANS, DOT and MFMA instructions and for the opcodes listed in the switch.
// Despite their names, the two flags below are plain feature queries
// (hasGFX950Insts / hasGFX940Insts).
// NOTE(review): the signature line is missing from this listing.
6532 bool IsGFX950Only = ST.hasGFX950Insts();
6533 bool IsGFX940Only = ST.hasGFX940Insts();
6534
6535 if (!IsGFX950Only && !IsGFX940Only)
6536 return false;
6537
6538 if (!isVALU(MI, /*AllowLDSDMA=*/true))
6539 return false;
6540
6541 // V_COS, V_EXP, V_RCP, etc.
6542 if (isTRANS(MI))
6543 return true;
6544
6545 // DOT2, DOT2C, DOT4, etc.
6546 if (isDOT(MI))
6547 return true;
6548
6549 // MFMA, SMFMA
6550 if (isMFMA(MI))
6551 return true;
6552
// Remaining individually listed opcodes: FP8/BF8 packed conversions, the
// QSAD/MQSAD family and the V_PK_* packed instructions.
6553 unsigned Opcode = MI.getOpcode();
6554 switch (Opcode) {
6555 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6556 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6557 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6558 case AMDGPU::V_MQSAD_U32_U8_e64:
6559 case AMDGPU::V_PK_ADD_F16:
6560 case AMDGPU::V_PK_ADD_F32:
6561 case AMDGPU::V_PK_ADD_I16:
6562 case AMDGPU::V_PK_ADD_U16:
6563 case AMDGPU::V_PK_ASHRREV_I16:
6564 case AMDGPU::V_PK_FMA_F16:
6565 case AMDGPU::V_PK_FMA_F32:
6566 case AMDGPU::V_PK_FMAC_F16_e32:
6567 case AMDGPU::V_PK_FMAC_F16_e64:
6568 case AMDGPU::V_PK_LSHLREV_B16:
6569 case AMDGPU::V_PK_LSHRREV_B16:
6570 case AMDGPU::V_PK_MAD_I16:
6571 case AMDGPU::V_PK_MAD_U16:
6572 case AMDGPU::V_PK_MAX_F16:
6573 case AMDGPU::V_PK_MAX_I16:
6574 case AMDGPU::V_PK_MAX_U16:
6575 case AMDGPU::V_PK_MIN_F16:
6576 case AMDGPU::V_PK_MIN_I16:
6577 case AMDGPU::V_PK_MIN_U16:
6578 case AMDGPU::V_PK_MOV_B32:
6579 case AMDGPU::V_PK_MUL_F16:
6580 case AMDGPU::V_PK_MUL_F32:
6581 case AMDGPU::V_PK_MUL_LO_U16:
6582 case AMDGPU::V_PK_SUB_I16:
6583 case AMDGPU::V_PK_SUB_U16:
6584 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6585 return true;
6586 default:
6587 return false;
6588 }
6589}
6590
6592 MachineInstr &MI) const {
6593 unsigned Opc = MI.getOpcode();
6594 const MCInstrDesc &InstrDesc = get(Opc);
6595
6596 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6597 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6598
6599 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6600 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6601
6602 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6603 // we need to only have one constant bus use before GFX10.
6604 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6605 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6606 RI.isSGPRReg(MRI, Src0.getReg()))
6607 legalizeOpWithMove(MI, Src0Idx);
6608
6609 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6610 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6611 // src0/src1 with V_READFIRSTLANE.
6612 if (Opc == AMDGPU::V_WRITELANE_B32) {
6613 const DebugLoc &DL = MI.getDebugLoc();
6614 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6615 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6616 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6617 .add(Src0);
6618 Src0.ChangeToRegister(Reg, false);
6619 }
6620 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6621 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6622 const DebugLoc &DL = MI.getDebugLoc();
6623 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6624 .add(Src1);
6625 Src1.ChangeToRegister(Reg, false);
6626 }
6627 return;
6628 }
6629
6630 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6631 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6632 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6633 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6634 legalizeOpWithMove(MI, Src2Idx);
6635 }
6636
6637 // VOP2 src0 instructions support all operand types, so we don't need to check
6638 // their legality. If src1 is already legal, we don't need to do anything.
6639 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6640 return;
6641
6642 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6643 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6644 // select is uniform.
6645 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6646 RI.isVGPR(MRI, Src1.getReg())) {
6647 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6648 const DebugLoc &DL = MI.getDebugLoc();
6649 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6650 .add(Src1);
6651 Src1.ChangeToRegister(Reg, false);
6652 return;
6653 }
6654
6655 // We do not use commuteInstruction here because it is too aggressive and will
6656 // commute if it is possible. We only want to commute here if it improves
6657 // legality. This can be called a fairly large number of times so don't waste
6658 // compile time pointlessly swapping and checking legality again.
6659 if (HasImplicitSGPR || !MI.isCommutable()) {
6660 legalizeOpWithMove(MI, Src1Idx);
6661 return;
6662 }
6663
6664 // If src0 can be used as src1, commuting will make the operands legal.
6665 // Otherwise we have to give up and insert a move.
6666 //
6667 // TODO: Other immediate-like operand kinds could be commuted if there was a
6668 // MachineOperand::ChangeTo* for them.
6669 if ((!Src1.isImm() && !Src1.isReg()) ||
6670 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6671 legalizeOpWithMove(MI, Src1Idx);
6672 return;
6673 }
6674
6675 int CommutedOpc = commuteOpcode(MI);
6676 if (CommutedOpc == -1) {
6677 legalizeOpWithMove(MI, Src1Idx);
6678 return;
6679 }
6680
6681 MI.setDesc(get(CommutedOpc));
6682
6683 Register Src0Reg = Src0.getReg();
6684 unsigned Src0SubReg = Src0.getSubReg();
6685 bool Src0Kill = Src0.isKill();
6686
6687 if (Src1.isImm())
6688 Src0.ChangeToImmediate(Src1.getImm());
6689 else if (Src1.isReg()) {
6690 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6691 Src0.setSubReg(Src1.getSubReg());
6692 } else
6693 llvm_unreachable("Should only have register or immediate operands");
6694
6695 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6696 Src1.setSubReg(Src0SubReg);
6698}
6699
6700// Legalize VOP3 operands. All operand types are supported for any operand
6701// but only one literal constant and only starting from GFX10.
//
// Walks src0..src2 in order and rewrites, through legalizeOpWithMove, every
// operand that does not fit in the constant bus / literal budget tracked
// below. Permlane opcodes additionally get their src1/src2 forced into SGPRs.
//
// NOTE(review): the embedded numbering skips 6702, 6796 and 6798, so the
// first signature line and (presumably) the conditions guarding the packed
// FP32 fix-up at the end are not visible in this listing; confirm against
// the upstream file before relying on that part.
6703 MachineInstr &MI) const {
6704 unsigned Opc = MI.getOpcode();
6705
     // Operand indices of src0, src1 and src2. The code below treats -1 as
     // "this opcode has no such operand".
6706 int VOP3Idx[3] = {
6707 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6708 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6709 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6710 };
6711
6712 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6713 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6714 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6715 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6716 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6717 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6718 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6719 // src1 and src2 must be scalar
     // A register operand whose class is not an SGPR class is replaced by the
     // result of a V_READFIRSTLANE_B32 inserted immediately before MI.
6720 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6721 const DebugLoc &DL = MI.getDebugLoc();
6722 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6723 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6724 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6725 .add(Src1);
6726 Src1.ChangeToRegister(Reg, false);
6727 }
     // src2 is only handled when the opcode actually has one.
6728 if (VOP3Idx[2] != -1) {
6729 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6730 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6731 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6732 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6733 .add(Src2);
6734 Src2.ChangeToRegister(Reg, false);
6735 }
6736 }
6737 }
6738
6739 // Find the one SGPR operand we are allowed to use.
6740 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6741 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6742 SmallDenseSet<unsigned> SGPRsUsed;
6743 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
     // The SGPR picked by findUsedSGPR is charged against the constant bus
     // budget up front, so it is kept as-is by the loop below.
6744 if (SGPRReg) {
6745 SGPRsUsed.insert(SGPRReg);
6746 --ConstantBusLimit;
6747 }
6748
6749 for (int Idx : VOP3Idx) {
     // Stop at the first absent source operand.
6750 if (Idx == -1)
6751 break;
6752 MachineOperand &MO = MI.getOperand(Idx);
6753
6754 if (!MO.isReg()) {
     // Inline constants are accepted without touching either budget.
6755 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6756 continue;
6757
     // Any other non-register operand needs both a literal slot and a
     // constant bus slot to stay in place.
6758 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6759 --LiteralLimit;
6760 --ConstantBusLimit;
6761 continue;
6762 }
6763
     // No slot left: rewrite the operand with a move. Both counters are
     // decremented here as well, mirroring the accepted case above.
6764 --LiteralLimit;
6765 --ConstantBusLimit;
6766 legalizeOpWithMove(MI, Idx);
6767 continue;
6768 }
6769
6770 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6771 continue; // VGPRs are legal
6772
6773 // We can use one SGPR in each VOP3 instruction prior to GFX10
6774 // and two starting from GFX10.
     // An SGPR that has already been counted may be reused for free.
6775 if (SGPRsUsed.count(MO.getReg()))
6776 continue;
6777 if (ConstantBusLimit > 0) {
6778 SGPRsUsed.insert(MO.getReg());
6779 --ConstantBusLimit;
6780 continue;
6781 }
6782
6783 // If we make it this far, then the operand is not legal and we must
6784 // legalize it.
6785 legalizeOpWithMove(MI, Idx);
6786 }
6787
6788 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6789 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6790 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6791 legalizeOpWithMove(MI, VOP3Idx[2]);
6792
6793 // Fix the register class of packed FP32 instructions on gfx12+. See
6794 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6795 // information.
     // NOTE(review): lines 6796 and 6798 are missing from this listing; the
     // loop below is presumably guarded by conditions that are not shown.
6797 for (unsigned I = 0; I < 3; ++I) {
6799 legalizeOpWithMove(MI, VOP3Idx[I]);
6800 }
6801 }
6802}
6803
// Reads the value of SrcReg into a newly created virtual register of an SGPR
// class and returns that register. All new instructions are inserted
// immediately before UseMI, using UseMI's debug location.
//
// - The result class is the SGPR class equivalent to SrcReg's class; when
//   DstRC is non-null it is narrowed to the common subclass with DstRC.
// - A 32-bit source is read with a single V_READFIRSTLANE_B32.
// - A wider source is read one 32-bit channel at a time and the pieces are
//   reassembled with REG_SEQUENCE.
// - A source class that has AGPRs is first copied to the equivalent VGPR
//   class.
//
// NOTE(review): the embedded numbering skips 6804-6805, 6831 and 6840, so the
// start of the signature and the declarations of SRegs and MIB are not
// visible in this listing.
6806 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6807 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6808 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6809 if (DstRC)
6810 SRC = RI.getCommonSubClass(SRC, DstRC);
6811
6812 Register DstReg = MRI.createVirtualRegister(SRC);
     // Number of 32-bit pieces; computed from the original source class before
     // the AGPR handling below replaces VRC.
6813 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6814
     // Route AGPR sources through a COPY into the equivalent VGPR class and
     // read from that copy instead.
6815 if (RI.hasAGPRs(VRC)) {
6816 VRC = RI.getEquivalentVGPRClass(VRC);
6817 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6818 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6819 get(TargetOpcode::COPY), NewSrcReg)
6820 .addReg(SrcReg);
6821 SrcReg = NewSrcReg;
6822 }
6823
     // Single 32-bit register: one readfirstlane straight into DstReg.
6824 if (SubRegs == 1) {
6825 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6826 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6827 .addReg(SrcReg);
6828 return DstReg;
6829 }
6830
     // Wider register: read each 32-bit channel into its own SGPR_32.
6832 for (unsigned i = 0; i < SubRegs; ++i) {
6833 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6834 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6835 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6836 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6837 SRegs.push_back(SGPR);
6838 }
6839
     // Stitch the per-channel SGPRs back together as DstReg.
6841 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6842 get(AMDGPU::REG_SEQUENCE), DstReg);
6843 for (unsigned i = 0; i < SubRegs; ++i) {
6844 MIB.addReg(SRegs[i]);
6845 MIB.addImm(RI.getSubRegFromChannel(i));
6846 }
6847 return DstReg;
6848}
6849
// Legalizes the sbase and soffset operands of MI: each one that is present
// but not held in an SGPR is replaced by the result of readlaneVGPRToSGPR.
//
// NOTE(review): the embedded numbering skips 6850, so the first line of the
// signature is not visible in this listing.
6851 MachineInstr &MI) const {
6852
6853 // If the pointer is stored in VGPRs, then we need to move them to
6854 // SGPRs using v_readfirstlane. This is safe because we only select
6855 // loads with uniform pointers to SMRD instruction so we know the
6856 // pointer value is uniform.
6857 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6858 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6859 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6860 SBase->setReg(SGPR);
6861 }
     // The soffset operand, when present, gets the same treatment.
6862 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6863 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6864 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6865 SOff->setReg(SGPR);
6866 }
6867}
6868
// Tries to rewrite Inst in place from a form that takes its address in the
// saddr operand to the equivalent form that takes it in vaddr, so that an
// address which is not in an SGPR can be used directly.
//
// Returns false and leaves Inst untouched when:
//  - the opcode has no saddr operand,
//  - no replacement opcode is found,
//  - saddr already holds an SGPR,
//  - the replacement opcode has no vaddr operand, or
//  - the existing vaddr is not uniquely defined by a move-immediate of 0.
// Returns true after the instruction has been rewritten.
//
// NOTE(review): the embedded numbering skips 6869 and 6879, so the signature
// and the statement under the first `if (NewOpc < 0)` are not visible in this
// listing; presumably a second opcode lookup is attempted there.
6870 unsigned Opc = Inst.getOpcode();
6871 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6872 if (OldSAddrIdx < 0)
6873 return false;
6874
6875 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6876
6877 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6878 if (NewOpc < 0)
6880 if (NewOpc < 0)
6881 return false;
6882
6883 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6884 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
     // Nothing to do when the address is already scalar.
6885 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6886 return false;
6887
6888 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6889 if (NewVAddrIdx < 0)
6890 return false;
6891
6892 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6893
6894 // Check vaddr, it shall be zero or absent.
6895 MachineInstr *VAddrDef = nullptr;
6896 if (OldVAddrIdx >= 0) {
6897 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6898 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6899 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6900 !VAddrDef->getOperand(1).isImm() ||
6901 VAddrDef->getOperand(1).getImm() != 0)
6902 return false;
6903 }
6904
     // From here on the rewrite is committed: switch to the new descriptor
     // first, then fix up the operand list to match it.
6905 const MCInstrDesc &NewDesc = get(NewOpc);
6906 Inst.setDesc(NewDesc);
6907
6908 // Callers expect iterator to be valid after this call, so modify the
6909 // instruction in place.
6910 if (OldVAddrIdx == NewVAddrIdx) {
6911 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6912 // Clear use list from the old vaddr holding a zero register.
6913 MRI.removeRegOperandFromUseList(&NewVAddr);
6914 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6915 Inst.removeOperand(OldSAddrIdx);
6916 // Update the use list with the pointer we have just moved from saddr to
6917 // vaddr position. Otherwise new vaddr will be missing from the use list.
6918 MRI.removeRegOperandFromUseList(&NewVAddr);
6919 MRI.addRegOperandToUseList(&NewVAddr);
6920 } else {
     // The old saddr slot becomes the new vaddr slot, so the address operand
     // stays where it is; only a stale old vaddr (if any) has to be dropped.
6921 assert(OldSAddrIdx == NewVAddrIdx);
6922
6923 if (OldVAddrIdx >= 0) {
6924 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6925 AMDGPU::OpName::vdst_in);
6926
6927 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
6928 // it asserts. Untie the operands for now and retie them afterwards.
6929 if (NewVDstIn != -1) {
6930 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6931 Inst.untieRegOperand(OldVDstIn);
6932 }
6933
6934 Inst.removeOperand(OldVAddrIdx);
6935
6936 if (NewVDstIn != -1) {
6937 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6938 Inst.tieOperands(NewVDst, NewVDstIn);
6939 }
6940 }
6941 }
6942
     // Delete the zero-materializing definition of the old vaddr once it has
     // no remaining non-debug uses.
6943 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6944 VAddrDef->eraseFromParent();
6945
6946 return true;
6947}
6948
6949// FIXME: Remove this when SelectionDAG is obsoleted.
//
// Legalizes the saddr operand of MI: when it is present but not in an SGPR
// class, it is replaced by a readlaneVGPRToSGPR result constrained to the
// register class the instruction description declares for that operand.
//
// NOTE(review): the embedded numbering skips 6950 and 6961, so the first line
// of the signature and the condition guarding the `return` at 6962 are not
// visible in this listing.
6951 MachineInstr &MI) const {
     // Only segment-specific FLAT instructions, or any instruction on a
     // subtarget with FlatGVS mode, are handled here.
6952 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6953 return;
6954
6955 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6956 // thinks they are uniform, so a readfirstlane should be valid.
6957 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6958 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6959 return;
6960
6962 return;
6963
     // Register class the instruction description requires for saddr; passed
     // on so the new SGPR is created in a compatible class.
6964 const TargetRegisterClass *DeclaredRC =
6965 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6966
6967 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6968 SAddr->setReg(ToSGPR);
6969}
6970
// Makes Op use a register of class DstRC. Unless the operand's class (taking
// its subregister index into account) already equals DstRC, a COPY into a new
// DstRC virtual register is inserted before I in InsertMBB and Op is
// redirected to that register.
//
// NOTE(review): the embedded numbering skips 6971-6972 and 6974-6975, so
// parts of the signature are not visible in this listing.
6973 const TargetRegisterClass *DstRC,
6976 const DebugLoc &DL) const {
6977 Register OpReg = Op.getReg();
6978 unsigned OpSubReg = Op.getSubReg();
6979
6980 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6981 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6982
6983 // Check if operand is already the correct register class.
6984 if (DstRC == OpRC)
6985 return;
6986
6987 Register DstReg = MRI.createVirtualRegister(DstRC);
6988 auto Copy =
6989 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6990 Op.setReg(DstReg);
6991
     // Without a defining instruction there is nothing further to refine.
6992 MachineInstr *Def = MRI.getVRegDef(OpReg);
6993 if (!Def)
6994 return;
6995
6996 // Try to eliminate the copy if it is copying an immediate value.
6997 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6998 foldImmediate(*Copy, *Def, OpReg, &MRI);
6999
     // Follow the chain of COPYs feeding OpReg (stopping at a physical source
     // register) to find out whether the value ultimately comes from an
     // IMPLICIT_DEF.
7000 bool ImpDef = Def->isImplicitDef();
7001 while (!ImpDef && Def && Def->isCopy()) {
7002 if (Def->getOperand(1).getReg().isPhysical())
7003 break;
7004 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7005 ImpDef = Def && Def->isImplicitDef();
7006 }
     // For a non-SGPR destination, add an implicit EXEC use to the copy unless
     // it already reads EXEC or the copied value is an implicit def.
7007 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7008 !ImpDef)
7009 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7010}
7011
7012// Emit the actual waterfall loop, executing the wrapped instruction for each
7013// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7014// iteration, in the worst case we execute 64 (once per lane).
//
// For every operand in \p ScalarOps the loop header (\p LoopBB) reads the
// value of the first active lane, compares it against the value in every
// lane, and ANDs the per-operand comparison masks together. EXEC is then
// restricted to the matching lanes. The terminators appended to \p BodyBB
// update EXEC and branch back to \p LoopBB. When \p PhySGPRs supplies a valid
// register for an operand, the scalar value is copied into that physical
// register right before the using instruction.
//
// NOTE(review): the embedded numbering skips 7015-7016, 7020, 7022 and 7025,
// so the start of the signature and the definitions of ST, LMC and the
// insertion iterator I are not visible in this listing.
7017 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7018 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7019 MachineFunction &MF = *LoopBB.getParent();
7021 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7023 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7024
     // Running AND of all per-operand "lane matches first lane" masks.
7026 Register CondReg;
7027 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7028 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7029 unsigned NumSubRegs = RegSize / 32;
7030 Register VScalarOp = ScalarOp->getReg();
7031
     // Register class V_READFIRSTLANE_B32 declares for its source operand.
7032 const TargetRegisterClass *RFLSrcRC =
7033 TII.getRegClass(TII.get(AMDGPU::V_READFIRSTLANE_B32), 1);
7034
7035 if (NumSubRegs == 1) {
     // 32-bit operand. If its class is not already contained in the class
     // readfirstlane accepts, copy it into the common subclass first.
7036 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(VScalarOp);
7037 if (const TargetRegisterClass *Common =
7038 TRI->getCommonSubClass(VScalarOpRC, RFLSrcRC);
7039 Common != VScalarOpRC) {
7040 Register VRReg = MRI.createVirtualRegister(Common);
7041 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::COPY), VRReg).addReg(VScalarOp);
7042 VScalarOp = VRReg;
7043 }
7044 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7045
7046 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7047 .addReg(VScalarOp);
7048
7049 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7050
     // Mask of lanes whose value equals the one just read.
7051 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7052 .addReg(CurReg)
7053 .addReg(VScalarOp);
7054
7055 // Combine the comparison results with AND.
7056 if (!CondReg) // First.
7057 CondReg = NewCondReg;
7058 else { // If not the first, we create an AND.
7059 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7060 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7061 .addReg(CondReg)
7062 .addReg(NewCondReg);
7063 CondReg = AndReg;
7064 }
7065
7066 // Update ScalarOp operand to use the SGPR ScalarOp.
7067 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7068 ScalarOp->setReg(CurReg);
7069 else {
7070 // Insert into the same block of use
7071 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7072 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7073 .addReg(CurReg);
7074 ScalarOp->setReg(PhySGPRs[Idx]);
7075 }
7076 ScalarOp->setIsKill();
7077 } else {
     // Wider operand: handled as pairs of 32-bit channels so that each pair
     // can be compared with a single 64-bit compare.
7078 SmallVector<Register, 8> ReadlanePieces;
7079 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7080 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7081 "Unhandled register size");
7082
     // Note: this inner Idx (channel index) shadows the operand index Idx of
     // the enclosing loop for the duration of this for statement.
7083 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7084 Register CurRegLo =
7085 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7086 Register CurRegHi =
7087 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7088
7089 // Read the next variant <- also loop target.
7090 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7091 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7092
7093 // Read the next variant <- also loop target.
7094 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7095 .addReg(VScalarOp, VScalarOpUndef,
7096 TRI->getSubRegFromChannel(Idx + 1));
7097
7098 ReadlanePieces.push_back(CurRegLo);
7099 ReadlanePieces.push_back(CurRegHi);
7100
7101 // Comparison is to be done as 64-bit.
7102 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7103 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7104 .addReg(CurRegLo)
7105 .addImm(AMDGPU::sub0)
7106 .addReg(CurRegHi)
7107 .addImm(AMDGPU::sub1);
7108
7109 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7110 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7111 NewCondReg)
7112 .addReg(CurReg);
     // A 64-bit operand is compared as a whole; anything wider is compared
     // through the matching two-channel subregister.
7113 if (NumSubRegs <= 2)
7114 Cmp.addReg(VScalarOp);
7115 else
7116 Cmp.addReg(VScalarOp, VScalarOpUndef,
7117 TRI->getSubRegFromChannel(Idx, 2));
7118
7119 // Combine the comparison results with AND.
7120 if (!CondReg) // First.
7121 CondReg = NewCondReg;
7122 else { // If not the first, we create an AND.
7123 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7124 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7125 .addReg(CondReg)
7126 .addReg(NewCondReg);
7127 CondReg = AndReg;
7128 }
7129 } // End for loop.
7130
7131 const auto *SScalarOpRC =
7132 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7133 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7134
7135 // Build scalar ScalarOp.
7136 auto Merge =
7137 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7138 unsigned Channel = 0;
7139 for (Register Piece : ReadlanePieces) {
7140 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7141 }
7142
7143 // Update ScalarOp operand to use the SGPR ScalarOp.
7144 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7145 ScalarOp->setReg(SScalarOp);
7146 else {
     // As in the 32-bit case: copy into the requested physical SGPR right
     // before the using instruction.
7147 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7148 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7149 .addReg(SScalarOp);
7150 ScalarOp->setReg(PhySGPRs[Idx]);
7151 }
7152 ScalarOp->setIsKill();
7153 }
7154 }
7155
7156 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7157 MRI.setSimpleHint(SaveExec, CondReg);
7158
7159 // Update EXEC to matching lanes, saving original to SaveExec.
7160 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7161 .addReg(CondReg, RegState::Kill);
7162
7163 // The original instruction is here; we insert the terminators after it.
7164 I = BodyBB.end();
7165
7166 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7167 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7168 .addReg(LMC.ExecReg)
7169 .addReg(SaveExec);
7170
7171 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7172}
7173
7174// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7175// with SGPRs by iterating over all unique values across all lanes.
7176// Returns the loop basic block that now contains \p MI.
//
// The parent block of \p MI is split into four parts: the original block
// (everything before \p Begin), a loop header block, a body block holding
// [\p Begin, \p End), and a remainder block holding everything from \p End
// on. EXEC is saved before the loop and restored at the top of the remainder
// block; SCC is saved and restored as well when it is not known to be dead at
// \p MI. When \p MDT is non-null the dominator tree is updated for the new
// blocks.
//
// NOTE(review): the embedded numbering skips 7178-7180, 7188, 7198, 7209,
// 7224, 7233-7234 and 7236, so the start of the signature and the definitions
// of ST, LMC, AfterMI, LoopBB, BodyBB and MBBI (as well as the right-hand
// side of the SCC liveness comparison) are not visible in this listing.
7177static MachineBasicBlock *
7181 MachineBasicBlock::iterator Begin = nullptr,
7182 MachineBasicBlock::iterator End = nullptr,
7183 ArrayRef<Register> PhySGPRs = {}) {
7184 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7185 "Physical SGPRs must be empty or match the number of scalar operands");
7186 MachineBasicBlock &MBB = *MI.getParent();
7187 MachineFunction &MF = *MBB.getParent();
7189 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7190 MachineRegisterInfo &MRI = MF.getRegInfo();
     // Default range is just MI itself: [MI, next(MI)).
7191 if (!Begin.isValid())
7192 Begin = &MI;
7193 if (!End.isValid()) {
7194 End = &MI;
7195 ++End;
7196 }
7197 const DebugLoc &DL = MI.getDebugLoc();
7199 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7200
7201 // Save SCC. Waterfall Loop may overwrite SCC.
7202 Register SaveSCCReg;
7203
7204 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7205 // rather than unlimited scan everywhere
7206 bool SCCNotDead =
7207 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7208 std::numeric_limits<unsigned>::max()) !=
     // S_CSELECT_B32 1, 0 captures the current SCC value as 1 or 0 in an SGPR.
7210 if (SCCNotDead) {
7211 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7212 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7213 .addImm(1)
7214 .addImm(0);
7215 }
7216
7217 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7218
7219 // Save the EXEC mask
7220 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7221
7222 // Killed uses in the instruction we are waterfalling around will be
7223 // incorrect due to the added control-flow.
7225 ++AfterMI;
7226 for (auto I = Begin; I != AfterMI; I++) {
7227 for (auto &MO : I->all_uses())
7228 MRI.clearKillFlags(MO.getReg());
7229 }
7230
7231 // To insert the loop we need to split the block. Move everything after this
7232 // point to a new block, and insert a new empty block between the two.
7235 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7237 ++MBBI;
7238
     // Layout order after MBB: LoopBB, BodyBB, RemainderBB.
7239 MF.insert(MBBI, LoopBB);
7240 MF.insert(MBBI, BodyBB);
7241 MF.insert(MBBI, RemainderBB);
7242
     // LoopBB falls into BodyBB; BodyBB either loops back or exits.
7243 LoopBB->addSuccessor(BodyBB);
7244 BodyBB->addSuccessor(LoopBB);
7245 BodyBB->addSuccessor(RemainderBB);
7246
7247 // Move Begin to MI to the BodyBB, and the remainder of the block to
7248 // RemainderBB.
     // The tail [End, end) is spliced out first, so the second splice's
     // MBB.end() only covers [Begin, End).
7249 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7250 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7251 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7252
7253 MBB.addSuccessor(LoopBB);
7254
7255 // Update dominators. We know that MBB immediately dominates LoopBB, that
7256 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7257 // RemainderBB. RemainderBB immediately dominates all of the successors
7258 // transferred to it from MBB that MBB used to properly dominate.
7259 if (MDT) {
7260 MDT->addNewBlock(LoopBB, &MBB);
7261 MDT->addNewBlock(BodyBB, LoopBB);
7262 MDT->addNewBlock(RemainderBB, BodyBB);
7263 for (auto &Succ : RemainderBB->successors()) {
7264 if (MDT->properlyDominates(&MBB, Succ)) {
7265 MDT->changeImmediateDominator(Succ, RemainderBB);
7266 }
7267 }
7268 }
7269
7270 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7271 PhySGPRs);
7272
7273 MachineBasicBlock::iterator First = RemainderBB->begin();
7274 // Restore SCC
     // Comparing the saved 0/1 value against 0 with S_CMP_LG_U32 re-creates
     // the SCC value captured before the loop.
7275 if (SCCNotDead) {
7276 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7277 .addReg(SaveSCCReg, RegState::Kill)
7278 .addImm(0);
7279 }
7280
7281 // Restore the EXEC mask
7282 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7283 .addReg(SaveExec);
7284 return BodyBB;
7285}
7286
7287// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
//
// Returns the pair (RsrcPtr, NewSRsrc):
//  - RsrcPtr is the sub0_sub1 part of \p Rsrc extracted as a VReg_64.
//  - NewSRsrc is a new SGPR_128 whose sub0_sub1 is zero and whose sub2/sub3
//    hold the low/high 32 bits of TII.getDefaultRsrcDataFormat().
// All new instructions are inserted before \p MI.
//
// NOTE(review): the embedded numbering skips 7289, so the line carrying the
// function name and parameter list is not visible in this listing.
7288static std::tuple<unsigned, unsigned>
7290 MachineBasicBlock &MBB = *MI.getParent();
7291 MachineFunction &MF = *MBB.getParent();
7292 MachineRegisterInfo &MRI = MF.getRegInfo();
7293
7294 // Extract the ptr from the resource descriptor.
7295 unsigned RsrcPtr =
7296 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7297 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7298
7299 // Create an empty resource descriptor
7300 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7301 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7302 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7303 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7304 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7305
7306 // Zero64 = 0
7307 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7308 .addImm(0);
7309
7310 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7311 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7312 .addImm(Lo_32(RsrcDataFormat));
7313
7314 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7315 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7316 .addImm(Hi_32(RsrcDataFormat));
7317
7318 // NewSRsrc = {Zero64, SRsrcFormat}
7319 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7320 .addReg(Zero64)
7321 .addImm(AMDGPU::sub0_sub1)
7322 .addReg(SRsrcFormatLo)
7323 .addImm(AMDGPU::sub2)
7324 .addReg(SRsrcFormatHi)
7325 .addImm(AMDGPU::sub3);
7326
7327 return std::tuple(RsrcPtr, NewSRsrc);
7328}
7329
7332 MachineDominatorTree *MDT) const {
7333 MachineFunction &MF = *MI.getMF();
7334 MachineRegisterInfo &MRI = MF.getRegInfo();
7335 MachineBasicBlock *CreatedBB = nullptr;
7336
7337 // Legalize VOP2
7338 if (isVOP2(MI) || isVOPC(MI)) {
7340 return CreatedBB;
7341 }
7342
7343 // Legalize VOP3
7344 if (isVOP3(MI)) {
7346 return CreatedBB;
7347 }
7348
7349 // Legalize SMRD
7350 if (isSMRD(MI)) {
7352 return CreatedBB;
7353 }
7354
7355 // Legalize FLAT
7356 if (isFLAT(MI)) {
7358 return CreatedBB;
7359 }
7360
7361 // Legalize PHI
7362 // The register class of the operands must be the same type as the register
7363 // class of the output.
7364 if (MI.getOpcode() == AMDGPU::PHI) {
7365 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7366 assert(!RI.isSGPRClass(VRC));
7367
7368 // Update all the operands so they have the same type.
7369 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7370 MachineOperand &Op = MI.getOperand(I);
7371 if (!Op.isReg() || !Op.getReg().isVirtual())
7372 continue;
7373
7374 // MI is a PHI instruction.
7375 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7377
7378 // Avoid creating no-op copies with the same src and dst reg class. These
7379 // confuse some of the machine passes.
7380 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7381 }
7382 }
7383
7384 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7385 // VGPR dest type and SGPR sources, insert copies so all operands are
7386 // VGPRs. This seems to help operand folding / the register coalescer.
7387 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7388 MachineBasicBlock *MBB = MI.getParent();
7389 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7390 if (RI.hasVGPRs(DstRC)) {
7391 // Update all the operands so they are VGPR register classes. These may
7392 // not be the same register class because REG_SEQUENCE supports mixing
7393 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7394 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7395 MachineOperand &Op = MI.getOperand(I);
7396 if (!Op.isReg() || !Op.getReg().isVirtual())
7397 continue;
7398
7399 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7400 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7401 if (VRC == OpRC)
7402 continue;
7403
7404 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7405 Op.setIsKill();
7406 }
7407 }
7408
7409 return CreatedBB;
7410 }
7411
7412 // Legalize INSERT_SUBREG
7413 // src0 must have the same register class as dst
7414 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7415 Register Dst = MI.getOperand(0).getReg();
7416 Register Src0 = MI.getOperand(1).getReg();
7417 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7418 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7419 if (DstRC != Src0RC) {
7420 MachineBasicBlock *MBB = MI.getParent();
7421 MachineOperand &Op = MI.getOperand(1);
7422 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7423 }
7424 return CreatedBB;
7425 }
7426
7427 // Legalize SI_INIT_M0
7428 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7429 MachineOperand &Src = MI.getOperand(0);
7430 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7431 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7432 return CreatedBB;
7433 }
7434
7435 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7436 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7437 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7438 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7439 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7440 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7441 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7442 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7443 MachineOperand &Src = MI.getOperand(1);
7444 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7445 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7446 return CreatedBB;
7447 }
7448
7449 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7450 //
7451 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7452 // scratch memory access. In both cases, the legalization never involves
7453 // conversion to the addr64 form.
7455 (isMUBUF(MI) || isMTBUF(MI)))) {
7456 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7457 ? AMDGPU::OpName::rsrc
7458 : AMDGPU::OpName::srsrc;
7459 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7460 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7461 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7462
7463 AMDGPU::OpName SampOpName =
7464 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7465 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7466 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7467 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7468
7469 return CreatedBB;
7470 }
7471
7472 // Legalize SI_CALL
7473 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7474 MachineOperand *Dest = &MI.getOperand(0);
7475 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7476 createWaterFallForSiCall(&MI, MDT, {Dest});
7477 }
7478 }
7479
7480 // Legalize s_sleep_var.
7481 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7482 const DebugLoc &DL = MI.getDebugLoc();
7483 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7484 int Src0Idx =
7485 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7486 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7487 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7488 .add(Src0);
7489 Src0.ChangeToRegister(Reg, false);
7490 return nullptr;
7491 }
7492
7493 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7494 // operands are scalar.
7495 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7496 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7497 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7498 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7499 for (MachineOperand &Src : MI.explicit_operands()) {
7500 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7501 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7502 }
7503 return CreatedBB;
7504 }
7505
7506 // Legalize MUBUF instructions.
7507 bool isSoffsetLegal = true;
7508 int SoffsetIdx =
7509 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7510 if (SoffsetIdx != -1) {
7511 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7512 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7513 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7514 isSoffsetLegal = false;
7515 }
7516 }
7517
7518 bool isRsrcLegal = true;
7519 int RsrcIdx =
7520 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7521 if (RsrcIdx != -1) {
7522 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7523 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7524 isRsrcLegal = false;
7525 }
7526
7527 // The operands are legal.
7528 if (isRsrcLegal && isSoffsetLegal)
7529 return CreatedBB;
7530
7531 if (!isRsrcLegal) {
7532 // Legalize a VGPR Rsrc
7533 //
7534 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7535 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7536 // a zero-value SRsrc.
7537 //
7538 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7539 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7540 // above.
7541 //
7542 // Otherwise we are on non-ADDR64 hardware, and/or we have
7543 // idxen/offen/bothen and we fall back to a waterfall loop.
7544
7545 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7546 MachineBasicBlock &MBB = *MI.getParent();
7547
7548 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7549 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7550 // This is already an ADDR64 instruction so we need to add the pointer
7551 // extracted from the resource descriptor to the current value of VAddr.
7552 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7553 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7554 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7555
7556 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7557 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7558 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7559
7560 unsigned RsrcPtr, NewSRsrc;
7561 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7562
7563 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7564 const DebugLoc &DL = MI.getDebugLoc();
7565 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7566 .addDef(CondReg0)
7567 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7568 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7569 .addImm(0);
7570
7571 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7572 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7573 .addDef(CondReg1, RegState::Dead)
7574 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7575 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7576 .addReg(CondReg0, RegState::Kill)
7577 .addImm(0);
7578
7579 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7580 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7581 .addReg(NewVAddrLo)
7582 .addImm(AMDGPU::sub0)
7583 .addReg(NewVAddrHi)
7584 .addImm(AMDGPU::sub1);
7585
7586 VAddr->setReg(NewVAddr);
7587 Rsrc->setReg(NewSRsrc);
7588 } else if (!VAddr && ST.hasAddr64()) {
7589 // This instruction is the _OFFSET variant, so we need to convert it to
7590 // ADDR64.
7591 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7592 "FIXME: Need to emit flat atomics here");
7593
7594 unsigned RsrcPtr, NewSRsrc;
7595 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7596
7597 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7598 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7599 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7600 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7601 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7602
7603 // Atomics with return have an additional tied operand and are
7604 // missing some of the special bits.
7605 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7606 MachineInstr *Addr64;
7607
7608 if (!VDataIn) {
7609 // Regular buffer load / store.
7611 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7612 .add(*VData)
7613 .addReg(NewVAddr)
7614 .addReg(NewSRsrc)
7615 .add(*SOffset)
7616 .add(*Offset);
7617
7618 if (const MachineOperand *CPol =
7619 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7620 MIB.addImm(CPol->getImm());
7621 }
7622
7623 if (const MachineOperand *TFE =
7624 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7625 MIB.addImm(TFE->getImm());
7626 }
7627
7628 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7629
7630 MIB.cloneMemRefs(MI);
7631 Addr64 = MIB;
7632 } else {
7633 // Atomics with return.
7634 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7635 .add(*VData)
7636 .add(*VDataIn)
7637 .addReg(NewVAddr)
7638 .addReg(NewSRsrc)
7639 .add(*SOffset)
7640 .add(*Offset)
7641 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7642 .cloneMemRefs(MI);
7643 }
7644
7645 MI.removeFromParent();
7646
7647 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7648 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7649 NewVAddr)
7650 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7651 .addImm(AMDGPU::sub0)
7652 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7653 .addImm(AMDGPU::sub1);
7654 } else {
7655 // Legalize a VGPR Rsrc and soffset together.
7656 if (!isSoffsetLegal) {
7657 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7658 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7659 return CreatedBB;
7660 }
7661 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7662 return CreatedBB;
7663 }
7664 }
7665
7666 // Legalize a VGPR soffset.
7667 if (!isSoffsetLegal) {
7668 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7669 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7670 return CreatedBB;
7671 }
7672 return CreatedBB;
7673}
7674
// NOTE(review): the signature line of this function is absent from this
// listing; the body below operates on a MachineInstr pointer named MI.
// Every instruction is recorded in InstrList.
7676 InstrList.insert(MI);
7677 // Additionally add instructions that have an srsrc operand to the deferred list.
7678 int RsrcIdx =
7679 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
// An index of -1 is treated as "this opcode has no srsrc operand".
7680 if (RsrcIdx != -1) {
7681 DeferredList.insert(MI);
7682 }
7683}
7684
// Reports whether MI is currently contained in DeferredList.
// NOTE(review): the signature line of this function is absent from this
// listing.
7686 return DeferredList.contains(MI);
7687}
7688
7689// Legalize size mismatches between 16-bit and 32-bit registers for operand
7690// OpIdx of MI during v2s copy lowering (changing SGPRs to VGPRs).
7691// The mismatch mainly arises because 16-bit SALU and 16-bit VALU instructions
7692// use registers of different sizes, so operand sizes must be legalized along
7693// the VGPR lowering chain. This can be removed once sgpr16 is in place.
7695 MachineRegisterInfo &MRI) const {
// Only applies when the subtarget uses real true16 instructions.
7696 if (!ST.useRealTrue16Insts())
7697 return;
7698
7699 unsigned Opcode = MI.getOpcode();
7700 MachineBasicBlock *MBB = MI.getParent();
7701 // Bail out for operand index 0, for indices beyond MI's explicit operands
// or beyond the opcode descriptor's operand list, and for operands whose
// descriptor entry has no register class (RegClass == -1).
7702 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7703 OpIdx >= get(Opcode).getNumOperands() ||
7704 get(Opcode).operands()[OpIdx].RegClass == -1)
7705 return;
7706
// Only virtual register operands are considered.
7707 MachineOperand &Op = MI.getOperand(OpIdx);
7708 if (!Op.isReg() || !Op.getReg().isVirtual())
7709 return;
7710
// Only registers whose current class is a VGPR class are considered.
7711 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7712 if (!RI.isVGPRClass(CurrRC))
7713 return;
7714
// ExpectedRC is the register class the opcode descriptor asks for at OpIdx.
7715 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7716 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7717 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
// Presumably the current register is the wider one and its lo16 half has the
// expected class (TODO confirm): read the operand through the lo16 subreg.
7718 Op.setSubReg(AMDGPU::lo16);
7719 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
// Opposite direction: build a new VGPR_32 with REG_SEQUENCE, placing the
// current register in lo16 and an IMPLICIT_DEF VGPR_16 in hi16, then use the
// new register as the operand. Both instructions are inserted before MI.
7720 const DebugLoc &DL = MI.getDebugLoc();
7721 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7722 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7723 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7724 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7725 .addReg(Op.getReg())
7726 .addImm(AMDGPU::lo16)
7727 .addReg(Undef)
7728 .addImm(AMDGPU::hi16);
7729 Op.setReg(NewDstReg);
7730 }
// If neither relationship holds, the operand is left unchanged.
7731}
7733 MachineRegisterInfo &MRI) const {
// Visits every explicit operand index of MI except index 0.
// NOTE(review): the loop body line is absent from this listing; presumably it
// invokes the per-operand legalizeOperandsVALUt16 overload for each OpIdx --
// confirm against the original file.
7734 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7736}
7737
7741 ArrayRef<Register> PhySGPRs) const {
// Determines the instruction range surrounding an SI_CALL_ISEL instruction
// and hands it to generateWaterFallLoop together with ScalarOps, MDT and
// PhySGPRs. Any other opcode trips the assertion below.
7742 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7743 "This only handle waterfall for SI_CALL_ISEL");
7744 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
7745 // the copies that follow, into the loop block. Copies from and to physical
7746 // registers also need to be moved into the loop block, which is why the
7747 // range is widened to those two markers.
7748 MachineBasicBlock &MBB = *MI->getParent();
// NOTE(review): the declarations of Start and End are absent from this
// listing; presumably both are iterators initialised at MI -- confirm.
// NOTE(review): neither scan below checks for the block boundary; this
// assumes both markers are present in MBB.
// Walk backwards until the ADJCALLSTACKUP marker is reached.
7750 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7751 --Start;
// Walk forwards until the ADJCALLSTACKDOWN marker is reached.
7753 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7754 ++End;
7755
7756 // Step past ADJCALLSTACKDOWN, then also include the following COPYs whose
// source register (operand 1) is defined by the call instruction.
7757 ++End;
7758 while (End != MBB.end() && End->isCopy() &&
7759 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7760 ++End;
7761
7762 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7763}
7764
7766 MachineDominatorTree *MDT) const {
// Processes the worklist in four phases: regular entries, deferred entries,
// waterfall-loop creation for recorded SI_CALL_ISEL users, and finally
// erasure of the copies that were flagged as erasable.
// NOTE(review): the declaration of WaterFalls is absent from this listing;
// it is iterated below as pairs of (MachineInstr *, V2PhysSCopyInfo).
// Maps a copy instruction to whether it may be erased at the end.
7768 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
// Phase 1: drain the worklist from the top.
7769 while (!Worklist.empty()) {
7770 MachineInstr &Inst = *Worklist.top();
7771 Worklist.erase_top();
7772 // Skip MachineInstr in the deferred list.
7773 if (Worklist.isDeferred(&Inst))
7774 continue;
7775 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7776 }
7777
7778 // Phase 2: the deferred list of instructions is processed once
7779 // all the MachineInstr in the worklist are done.
7780 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7781 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7782 assert(Worklist.empty() &&
7783 "Deferred MachineInstr are not supposed to re-populate worklist");
7784 }
7785
// Phase 3: create waterfall loops; only SI_CALL_ISEL entries are handled.
7786 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7787 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7788 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7789 Entry.second.SGPRs);
7790 }
7791
// Phase 4: erase the copies whose flag is still true.
7792 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7793 if (Entry.second)
7794 Entry.first->eraseFromParent();
7795}
7797 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7798 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7799 // hope for the best. All new instructions are inserted before Inst; Inst
// itself is not modified or erased here.
7800 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
// Split the destination class into parts of size 4 (presumably bytes, i.e.
// 32-bit pieces -- TODO confirm the unit of getRegSplitParts).
7801 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
7802 if (SubRegIndices.size() <= 1) {
// At most one part: read Inst's operand 1 with a single V_READFIRSTLANE_B32
// into a fresh SReg_32_XM0 register, then COPY that register to DstReg.
7803 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7804 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7805 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7806 .add(Inst.getOperand(1));
7807 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7808 DstReg)
7809 .addReg(NewDst);
7810 } else {
// Several parts: emit one V_READFIRSTLANE_B32 per sub-register index of the
// source and collect the results in DstRegs.
// NOTE(review): the declaration of DstRegs is absent from this listing.
7812 for (int16_t Indice : SubRegIndices) {
7813 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7814 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7815 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7816 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7817
7818 DstRegs.push_back(NewDst);
7819 }
// Recombine the pieces into DstReg with a REG_SEQUENCE; piece i is paired
// with the sub-register index returned by getSubRegFromChannel(i).
// NOTE(review): the declaration of MIB is absent from this listing; it
// presumably holds the builder returned by the BuildMI call below.
7821 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7822 get(AMDGPU::REG_SEQUENCE), DstReg);
7823 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7824 MIB.addReg(DstRegs[i]);
7825 MIB.addImm(RI.getSubRegFromChannel(i));
7826 }
7827 }
7828}
7829
7831 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7834 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
// Handles a copy (Inst) whose destination DstReg is a physical register by
// inspecting the instructions that follow it. V2SPhyCopiesToErase records
// whether Inst may be erased later.
// NOTE(review): part of the parameter list (MRI, WaterFalls) and the
// declarations of the iterators I and E are absent from this listing.
// M0 is always materialised through a read-first-lane sequence.
7835 if (DstReg == AMDGPU::M0) {
7836 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7837 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7838 return;
7839 }
7840 Register SrcReg = Inst.getOperand(1).getReg();
7843 // Only search current block since phyreg's def & use cannot cross
7844 // blocks when MF.NoPhi = false.
// The scan starts at the instruction after I and stops at E or right after
// the first instruction that defines DstReg again.
7845 while (++I != E) {
7846 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7847 // and record the operand for later waterfall loop generation.
7848 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7849 MachineInstr *UseMI = &*I;
7850 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7851 if (UseMI->getOperand(i).isReg() &&
7852 UseMI->getOperand(i).getReg() == DstReg) {
7853 MachineOperand *MO = &UseMI->getOperand(i);
7854 MO->setReg(SrcReg);
// Remember both the rewritten operand and the physical register it replaced.
7855 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7856 V2SCopyInfo.MOs.push_back(MO);
7857 V2SCopyInfo.SGPRs.push_back(DstReg);
// try_emplace does not overwrite an entry that already exists for Inst.
7858 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7859 }
7860 }
7861 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7862 I->getOperand(0).isReg() &&
7863 I->getOperand(0).getReg() == DstReg) {
// SI_RETURN_TO_EPILOG whose operand 0 is DstReg: use read-first-lane.
7864 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7865 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7866 } else if (I->readsRegister(DstReg, &RI)) {
7867 // COPY cannot be erased if other type of inst uses it.
7868 V2SPhyCopiesToErase[&Inst] = false;
7869 }
// A redefinition of DstReg ends the range in which Inst's value is visible.
7870 if (I->findRegisterDefOperand(DstReg, &RI))
7871 break;
7872 }
7873}
7874
7876 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7878 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7879
7881 if (!MBB)
7882 return;
7883 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7884 unsigned Opcode = Inst.getOpcode();
7885 unsigned NewOpcode = getVALUOp(Inst);
7886 const DebugLoc &DL = Inst.getDebugLoc();
7887
7888 // Handle some special cases
7889 switch (Opcode) {
7890 default:
7891 break;
7892 case AMDGPU::S_ADD_I32:
7893 case AMDGPU::S_SUB_I32: {
7894 // FIXME: The u32 versions currently selected use the carry.
7895 bool Changed;
7896 MachineBasicBlock *CreatedBBTmp = nullptr;
7897 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7898 if (Changed)
7899 return;
7900
7901 // Default handling
7902 break;
7903 }
7904
7905 case AMDGPU::S_MUL_U64:
7906 if (ST.hasVMulU64Inst()) {
7907 NewOpcode = AMDGPU::V_MUL_U64_e64;
7908 break;
7909 }
7910 // Split s_mul_u64 in 32-bit vector multiplications.
7911 splitScalarSMulU64(Worklist, Inst, MDT);
7912 Inst.eraseFromParent();
7913 return;
7914
7915 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7916 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7917 // This is a special case of s_mul_u64 where all the operands are either
7918 // zero extended or sign extended.
7919 splitScalarSMulPseudo(Worklist, Inst, MDT);
7920 Inst.eraseFromParent();
7921 return;
7922
7923 case AMDGPU::S_AND_B64:
7924 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7925 Inst.eraseFromParent();
7926 return;
7927
7928 case AMDGPU::S_OR_B64:
7929 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7930 Inst.eraseFromParent();
7931 return;
7932
7933 case AMDGPU::S_XOR_B64:
7934 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7935 Inst.eraseFromParent();
7936 return;
7937
7938 case AMDGPU::S_NAND_B64:
7939 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7940 Inst.eraseFromParent();
7941 return;
7942
7943 case AMDGPU::S_NOR_B64:
7944 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7945 Inst.eraseFromParent();
7946 return;
7947
7948 case AMDGPU::S_XNOR_B64:
7949 if (ST.hasDLInsts())
7950 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7951 else
7952 splitScalar64BitXnor(Worklist, Inst, MDT);
7953 Inst.eraseFromParent();
7954 return;
7955
7956 case AMDGPU::S_ANDN2_B64:
7957 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7958 Inst.eraseFromParent();
7959 return;
7960
7961 case AMDGPU::S_ORN2_B64:
7962 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7963 Inst.eraseFromParent();
7964 return;
7965
7966 case AMDGPU::S_BREV_B64:
7967 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7968 Inst.eraseFromParent();
7969 return;
7970
7971 case AMDGPU::S_NOT_B64:
7972 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7973 Inst.eraseFromParent();
7974 return;
7975
7976 case AMDGPU::S_BCNT1_I32_B64:
7977 splitScalar64BitBCNT(Worklist, Inst);
7978 Inst.eraseFromParent();
7979 return;
7980
7981 case AMDGPU::S_BFE_I64:
7982 splitScalar64BitBFE(Worklist, Inst);
7983 Inst.eraseFromParent();
7984 return;
7985
7986 case AMDGPU::S_FLBIT_I32_B64:
7987 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7988 Inst.eraseFromParent();
7989 return;
7990 case AMDGPU::S_FF1_I32_B64:
7991 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7992 Inst.eraseFromParent();
7993 return;
7994
7995 case AMDGPU::S_LSHL_B32:
7996 if (ST.hasOnlyRevVALUShifts()) {
7997 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7998 swapOperands(Inst);
7999 }
8000 break;
8001 case AMDGPU::S_ASHR_I32:
8002 if (ST.hasOnlyRevVALUShifts()) {
8003 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8004 swapOperands(Inst);
8005 }
8006 break;
8007 case AMDGPU::S_LSHR_B32:
8008 if (ST.hasOnlyRevVALUShifts()) {
8009 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8010 swapOperands(Inst);
8011 }
8012 break;
8013 case AMDGPU::S_LSHL_B64:
8014 if (ST.hasOnlyRevVALUShifts()) {
8015 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8016 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8017 : AMDGPU::V_LSHLREV_B64_e64;
8018 swapOperands(Inst);
8019 }
8020 break;
8021 case AMDGPU::S_ASHR_I64:
8022 if (ST.hasOnlyRevVALUShifts()) {
8023 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8024 swapOperands(Inst);
8025 }
8026 break;
8027 case AMDGPU::S_LSHR_B64:
8028 if (ST.hasOnlyRevVALUShifts()) {
8029 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8030 swapOperands(Inst);
8031 }
8032 break;
8033
8034 case AMDGPU::S_ABS_I32:
8035 lowerScalarAbs(Worklist, Inst);
8036 Inst.eraseFromParent();
8037 return;
8038
8039 case AMDGPU::S_ABSDIFF_I32:
8040 lowerScalarAbsDiff(Worklist, Inst);
8041 Inst.eraseFromParent();
8042 return;
8043
8044 case AMDGPU::S_CBRANCH_SCC0:
8045 case AMDGPU::S_CBRANCH_SCC1: {
8046 // Clear unused bits of vcc
8047 Register CondReg = Inst.getOperand(1).getReg();
8048 bool IsSCC = CondReg == AMDGPU::SCC;
8050 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8051 .addReg(LMC.ExecReg)
8052 .addReg(IsSCC ? LMC.VccReg : CondReg);
8053 Inst.removeOperand(1);
8054 } break;
8055
8056 case AMDGPU::S_BFE_U64:
8057 case AMDGPU::S_BFM_B64:
8058 llvm_unreachable("Moving this op to VALU not implemented");
8059
8060 case AMDGPU::S_PACK_LL_B32_B16:
8061 case AMDGPU::S_PACK_LH_B32_B16:
8062 case AMDGPU::S_PACK_HL_B32_B16:
8063 case AMDGPU::S_PACK_HH_B32_B16:
8064 movePackToVALU(Worklist, MRI, Inst);
8065 Inst.eraseFromParent();
8066 return;
8067
8068 case AMDGPU::S_XNOR_B32:
8069 lowerScalarXnor(Worklist, Inst);
8070 Inst.eraseFromParent();
8071 return;
8072
8073 case AMDGPU::S_NAND_B32:
8074 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8075 Inst.eraseFromParent();
8076 return;
8077
8078 case AMDGPU::S_NOR_B32:
8079 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8080 Inst.eraseFromParent();
8081 return;
8082
8083 case AMDGPU::S_ANDN2_B32:
8084 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8085 Inst.eraseFromParent();
8086 return;
8087
8088 case AMDGPU::S_ORN2_B32:
8089 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8090 Inst.eraseFromParent();
8091 return;
8092
8093 // TODO: remove as soon as everything is ready
8094 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8095 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8096 // can only be selected from the uniform SDNode.
8097 case AMDGPU::S_ADD_CO_PSEUDO:
8098 case AMDGPU::S_SUB_CO_PSEUDO: {
8099 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8100 ? AMDGPU::V_ADDC_U32_e64
8101 : AMDGPU::V_SUBB_U32_e64;
8102 const auto *CarryRC = RI.getWaveMaskRegClass();
8103
8104 Register CarryInReg = Inst.getOperand(4).getReg();
8105 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8106 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8107 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8108 .addReg(CarryInReg);
8109 }
8110
8111 Register CarryOutReg = Inst.getOperand(1).getReg();
8112
8113 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8114 MRI.getRegClass(Inst.getOperand(0).getReg())));
8115 MachineInstr *CarryOp =
8116 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8117 .addReg(CarryOutReg, RegState::Define)
8118 .add(Inst.getOperand(2))
8119 .add(Inst.getOperand(3))
8120 .addReg(CarryInReg)
8121 .addImm(0);
8122 legalizeOperands(*CarryOp);
8123 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8124 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8125 Inst.eraseFromParent();
8126 }
8127 return;
8128 case AMDGPU::S_UADDO_PSEUDO:
8129 case AMDGPU::S_USUBO_PSEUDO: {
8130 MachineOperand &Dest0 = Inst.getOperand(0);
8131 MachineOperand &Dest1 = Inst.getOperand(1);
8132 MachineOperand &Src0 = Inst.getOperand(2);
8133 MachineOperand &Src1 = Inst.getOperand(3);
8134
8135 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8136 ? AMDGPU::V_ADD_CO_U32_e64
8137 : AMDGPU::V_SUB_CO_U32_e64;
8138 const TargetRegisterClass *NewRC =
8139 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8140 Register DestReg = MRI.createVirtualRegister(NewRC);
8141 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8142 .addReg(Dest1.getReg(), RegState::Define)
8143 .add(Src0)
8144 .add(Src1)
8145 .addImm(0); // clamp bit
8146
8147 legalizeOperands(*NewInstr, MDT);
8148 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8149 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8150 Inst.eraseFromParent();
8151 }
8152 return;
8153 case AMDGPU::S_LSHL1_ADD_U32:
8154 case AMDGPU::S_LSHL2_ADD_U32:
8155 case AMDGPU::S_LSHL3_ADD_U32:
8156 case AMDGPU::S_LSHL4_ADD_U32: {
8157 MachineOperand &Dest = Inst.getOperand(0);
8158 MachineOperand &Src0 = Inst.getOperand(1);
8159 MachineOperand &Src1 = Inst.getOperand(2);
8160 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8161 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8162 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8163 : 4);
8164
8165 const TargetRegisterClass *NewRC =
8166 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8167 Register DestReg = MRI.createVirtualRegister(NewRC);
8168 MachineInstr *NewInstr =
8169 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8170 .add(Src0)
8171 .addImm(ShiftAmt)
8172 .add(Src1);
8173
8174 legalizeOperands(*NewInstr, MDT);
8175 MRI.replaceRegWith(Dest.getReg(), DestReg);
8176 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8177 Inst.eraseFromParent();
8178 }
8179 return;
8180 case AMDGPU::S_CSELECT_B32:
8181 case AMDGPU::S_CSELECT_B64:
8182 lowerSelect(Worklist, Inst, MDT);
8183 Inst.eraseFromParent();
8184 return;
8185 case AMDGPU::S_CMP_EQ_I32:
8186 case AMDGPU::S_CMP_LG_I32:
8187 case AMDGPU::S_CMP_GT_I32:
8188 case AMDGPU::S_CMP_GE_I32:
8189 case AMDGPU::S_CMP_LT_I32:
8190 case AMDGPU::S_CMP_LE_I32:
8191 case AMDGPU::S_CMP_EQ_U32:
8192 case AMDGPU::S_CMP_LG_U32:
8193 case AMDGPU::S_CMP_GT_U32:
8194 case AMDGPU::S_CMP_GE_U32:
8195 case AMDGPU::S_CMP_LT_U32:
8196 case AMDGPU::S_CMP_LE_U32:
8197 case AMDGPU::S_CMP_EQ_U64:
8198 case AMDGPU::S_CMP_LG_U64:
8199 case AMDGPU::S_CMP_LT_F32:
8200 case AMDGPU::S_CMP_EQ_F32:
8201 case AMDGPU::S_CMP_LE_F32:
8202 case AMDGPU::S_CMP_GT_F32:
8203 case AMDGPU::S_CMP_LG_F32:
8204 case AMDGPU::S_CMP_GE_F32:
8205 case AMDGPU::S_CMP_O_F32:
8206 case AMDGPU::S_CMP_U_F32:
8207 case AMDGPU::S_CMP_NGE_F32:
8208 case AMDGPU::S_CMP_NLG_F32:
8209 case AMDGPU::S_CMP_NGT_F32:
8210 case AMDGPU::S_CMP_NLE_F32:
8211 case AMDGPU::S_CMP_NEQ_F32:
8212 case AMDGPU::S_CMP_NLT_F32: {
8213 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8214 auto NewInstr =
8215 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8216 .setMIFlags(Inst.getFlags());
8217 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8218 0) {
8219 NewInstr
8220 .addImm(0) // src0_modifiers
8221 .add(Inst.getOperand(0)) // src0
8222 .addImm(0) // src1_modifiers
8223 .add(Inst.getOperand(1)) // src1
8224 .addImm(0); // clamp
8225 } else {
8226 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8227 }
8228 legalizeOperands(*NewInstr, MDT);
8229 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8230 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8231 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8232 Inst.eraseFromParent();
8233 return;
8234 }
8235 case AMDGPU::S_CMP_LT_F16:
8236 case AMDGPU::S_CMP_EQ_F16:
8237 case AMDGPU::S_CMP_LE_F16:
8238 case AMDGPU::S_CMP_GT_F16:
8239 case AMDGPU::S_CMP_LG_F16:
8240 case AMDGPU::S_CMP_GE_F16:
8241 case AMDGPU::S_CMP_O_F16:
8242 case AMDGPU::S_CMP_U_F16:
8243 case AMDGPU::S_CMP_NGE_F16:
8244 case AMDGPU::S_CMP_NLG_F16:
8245 case AMDGPU::S_CMP_NGT_F16:
8246 case AMDGPU::S_CMP_NLE_F16:
8247 case AMDGPU::S_CMP_NEQ_F16:
8248 case AMDGPU::S_CMP_NLT_F16: {
8249 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8250 auto NewInstr =
8251 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8252 .setMIFlags(Inst.getFlags());
8253 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8254 NewInstr
8255 .addImm(0) // src0_modifiers
8256 .add(Inst.getOperand(0)) // src0
8257 .addImm(0) // src1_modifiers
8258 .add(Inst.getOperand(1)) // src1
8259 .addImm(0); // clamp
8260 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8261 NewInstr.addImm(0); // op_sel0
8262 } else {
8263 NewInstr
8264 .add(Inst.getOperand(0))
8265 .add(Inst.getOperand(1));
8266 }
8267 legalizeOperandsVALUt16(*NewInstr, MRI);
8268 legalizeOperands(*NewInstr, MDT);
8269 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8270 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8271 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8272 Inst.eraseFromParent();
8273 return;
8274 }
8275 case AMDGPU::S_CVT_HI_F32_F16: {
8276 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8277 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8278 if (ST.useRealTrue16Insts()) {
8279 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8280 .add(Inst.getOperand(1));
8281 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8282 .addImm(0) // src0_modifiers
8283 .addReg(TmpReg, {}, AMDGPU::hi16)
8284 .addImm(0) // clamp
8285 .addImm(0) // omod
8286 .addImm(0); // op_sel0
8287 } else {
8288 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8289 .addImm(16)
8290 .add(Inst.getOperand(1));
8291 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8292 .addImm(0) // src0_modifiers
8293 .addReg(TmpReg)
8294 .addImm(0) // clamp
8295 .addImm(0); // omod
8296 }
8297
8298 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8299 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8300 Inst.eraseFromParent();
8301 return;
8302 }
8303 case AMDGPU::S_MINIMUM_F32:
8304 case AMDGPU::S_MAXIMUM_F32: {
8305 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8306 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8307 .addImm(0) // src0_modifiers
8308 .add(Inst.getOperand(1))
8309 .addImm(0) // src1_modifiers
8310 .add(Inst.getOperand(2))
8311 .addImm(0) // clamp
8312 .addImm(0); // omod
8313 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8314
8315 legalizeOperands(*NewInstr, MDT);
8316 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8317 Inst.eraseFromParent();
8318 return;
8319 }
8320 case AMDGPU::S_MINIMUM_F16:
8321 case AMDGPU::S_MAXIMUM_F16: {
8322 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8323 ? &AMDGPU::VGPR_16RegClass
8324 : &AMDGPU::VGPR_32RegClass);
8325 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8326 .addImm(0) // src0_modifiers
8327 .add(Inst.getOperand(1))
8328 .addImm(0) // src1_modifiers
8329 .add(Inst.getOperand(2))
8330 .addImm(0) // clamp
8331 .addImm(0) // omod
8332 .addImm(0); // opsel0
8333 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8334 legalizeOperandsVALUt16(*NewInstr, MRI);
8335 legalizeOperands(*NewInstr, MDT);
8336 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8337 Inst.eraseFromParent();
8338 return;
8339 }
8340 case AMDGPU::V_S_EXP_F16_e64:
8341 case AMDGPU::V_S_LOG_F16_e64:
8342 case AMDGPU::V_S_RCP_F16_e64:
8343 case AMDGPU::V_S_RSQ_F16_e64:
8344 case AMDGPU::V_S_SQRT_F16_e64: {
8345 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8346 ? &AMDGPU::VGPR_16RegClass
8347 : &AMDGPU::VGPR_32RegClass);
8348 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8349 .add(Inst.getOperand(1)) // src0_modifiers
8350 .add(Inst.getOperand(2))
8351 .add(Inst.getOperand(3)) // clamp
8352 .add(Inst.getOperand(4)) // omod
8353 .setMIFlags(Inst.getFlags());
8354 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8355 NewInstr.addImm(0); // opsel0
8356 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8357 legalizeOperandsVALUt16(*NewInstr, MRI);
8358 legalizeOperands(*NewInstr, MDT);
8359 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8360 Inst.eraseFromParent();
8361 return;
8362 }
8363 }
8364
8365 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8366 // We cannot move this instruction to the VALU, so we should try to
8367 // legalize its operands instead.
8368 legalizeOperands(Inst, MDT);
8369 return;
8370 }
8371 // Handle converting generic instructions like COPY-to-SGPR into
8372 // COPY-to-VGPR.
8373 if (NewOpcode == Opcode) {
8374 Register DstReg = Inst.getOperand(0).getReg();
8375 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8376
8377 if (Inst.isCopy() && DstReg.isPhysical() &&
8378 Inst.getOperand(1).getReg().isVirtual()) {
8379 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8380 V2SPhyCopiesToErase);
8381 return;
8382 }
8383
8384 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8385 Register NewDstReg = Inst.getOperand(1).getReg();
8386 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8387 if (const TargetRegisterClass *CommonRC =
8388 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8389 // Instead of creating a copy where src and dst are the same register
8390 // class, we just replace all uses of dst with src. These kinds of
8391 // copies interfere with the heuristics MachineSink uses to decide
8392 // whether or not to split a critical edge, since the pass assumes
8393 // that copies will end up as machine instructions and not be
8394 // eliminated.
8395 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8396 MRI.replaceRegWith(DstReg, NewDstReg);
8397 MRI.clearKillFlags(NewDstReg);
8398 Inst.getOperand(0).setReg(DstReg);
8399
8400 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8401 llvm_unreachable("failed to constrain register");
8402
8403 Inst.eraseFromParent();
8404
8405 for (MachineOperand &UseMO :
8406 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8407 MachineInstr &UseMI = *UseMO.getParent();
8408
8409 // Legalize t16 operands since replaceReg is called after
8410 // addUsersToVALU.
8412
8413 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8414 if (const TargetRegisterClass *OpRC =
8415 getRegClass(UseMI.getDesc(), OpIdx))
8416 MRI.constrainRegClass(NewDstReg, OpRC);
8417 }
8418
8419 return;
8420 }
8421 }
8422
8423 // If this is a v2s copy between a 16-bit and a 32-bit register,
8424 // replace the VGPR copy with a reg_sequence/extract_subreg.
8425 // This can be removed once sgpr16 is in place.
8426 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8427 Inst.getOperand(1).getReg().isVirtual() &&
8428 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8429 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8430 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8431 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8432 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8433 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8434 get(AMDGPU::IMPLICIT_DEF), Undef);
8435 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8436 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8437 .addReg(Inst.getOperand(1).getReg())
8438 .addImm(AMDGPU::lo16)
8439 .addReg(Undef)
8440 .addImm(AMDGPU::hi16);
8441 Inst.eraseFromParent();
8442 MRI.replaceRegWith(DstReg, NewDstReg);
8443 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8444 return;
8445 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8446 AMDGPU::lo16)) {
8447 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8448 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8449 MRI.replaceRegWith(DstReg, NewDstReg);
8450 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8451 return;
8452 }
8453 }
8454
8455 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8456 MRI.replaceRegWith(DstReg, NewDstReg);
8457 legalizeOperands(Inst, MDT);
8458 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8459 return;
8460 }
8461
8462 // Use the new VALU Opcode.
8463 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8464 .setMIFlags(Inst.getFlags());
8465 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8466 // Intersperse VOP3 modifiers among the SALU operands.
8467 NewInstr->addOperand(Inst.getOperand(0));
8468 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8469 AMDGPU::OpName::src0_modifiers) >= 0)
8470 NewInstr.addImm(0);
8471 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8472 const MachineOperand &Src = Inst.getOperand(1);
8473 NewInstr->addOperand(Src);
8474 }
8475
8476 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8477 // We are converting these to a BFE, so we need to add the missing
8478 // operands for the size and offset.
8479 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8480 NewInstr.addImm(0);
8481 NewInstr.addImm(Size);
8482 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8483 // The VALU version adds the second operand to the result, so insert an
8484 // extra 0 operand.
8485 NewInstr.addImm(0);
8486 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8487 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8488 // If we need to move this to VGPRs, we need to unpack the second
8489 // operand back into the 2 separate ones for bit offset and width.
8490 assert(OffsetWidthOp.isImm() &&
8491 "Scalar BFE is only implemented for constant width and offset");
8492 uint32_t Imm = OffsetWidthOp.getImm();
8493
8494 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8495 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8496 NewInstr.addImm(Offset);
8497 NewInstr.addImm(BitWidth);
8498 } else {
8499 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8500 AMDGPU::OpName::src1_modifiers) >= 0)
8501 NewInstr.addImm(0);
8502 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8503 NewInstr->addOperand(Inst.getOperand(2));
8504 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8505 AMDGPU::OpName::src2_modifiers) >= 0)
8506 NewInstr.addImm(0);
8507 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8508 NewInstr->addOperand(Inst.getOperand(3));
8509 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8510 NewInstr.addImm(0);
8511 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8512 NewInstr.addImm(0);
8513 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8514 NewInstr.addImm(0);
8515 }
8516 } else {
8517 // Just copy the SALU operands.
8518 for (const MachineOperand &Op : Inst.explicit_operands())
8519 NewInstr->addOperand(Op);
8520 }
8521
8522 // Remove any references to SCC. Vector instructions can't read from it, and
8523 // We're just about to add the implicit use / defs of VCC, and we don't want
8524 // both.
8525 for (MachineOperand &Op : Inst.implicit_operands()) {
8526 if (Op.getReg() == AMDGPU::SCC) {
8527 // Only propagate through live-def of SCC.
8528 if (Op.isDef() && !Op.isDead())
8529 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8530 if (Op.isUse())
8531 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8532 }
8533 }
8534 Inst.eraseFromParent();
8535 Register NewDstReg;
8536 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8537 Register DstReg = NewInstr->getOperand(0).getReg();
8538 assert(DstReg.isVirtual());
8539 // Update the destination register class.
8540 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8541 assert(NewDstRC);
8542 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8543 MRI.replaceRegWith(DstReg, NewDstReg);
8544 }
8545 fixImplicitOperands(*NewInstr);
8546
8547 legalizeOperandsVALUt16(*NewInstr, MRI);
8548
8549 // Legalize the operands
8550 legalizeOperands(*NewInstr, MDT);
8551 if (NewDstReg)
8552 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8553}
8554
8555// Add/sub require special handling to deal with carry outs.
8556std::pair<bool, MachineBasicBlock *>
8557SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8558 MachineDominatorTree *MDT) const {
8559 if (ST.hasAddNoCarryInsts()) {
8560 // Assume there is no user of scc since we don't select this in that case.
8561 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8562 // is used.
8563
8564 MachineBasicBlock &MBB = *Inst.getParent();
8565 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8566
8567 Register OldDstReg = Inst.getOperand(0).getReg();
8568 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8569
8570 unsigned Opc = Inst.getOpcode();
8571 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8572
8573 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8574 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8575
8576 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8577 Inst.removeOperand(3);
8578
8579 Inst.setDesc(get(NewOpc));
8580 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8581 Inst.addImplicitDefUseOperands(*MBB.getParent());
8582 MRI.replaceRegWith(OldDstReg, ResultReg);
8583 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8584
8585 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8586 return std::pair(true, NewBB);
8587 }
8588
8589 return std::pair(false, nullptr);
8590}
8591
8592void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8593 MachineDominatorTree *MDT) const {
8594
8595 MachineBasicBlock &MBB = *Inst.getParent();
8596 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8597 MachineBasicBlock::iterator MII = Inst;
8598 const DebugLoc &DL = Inst.getDebugLoc();
8599
8600 MachineOperand &Dest = Inst.getOperand(0);
8601 MachineOperand &Src0 = Inst.getOperand(1);
8602 MachineOperand &Src1 = Inst.getOperand(2);
8603 MachineOperand &Cond = Inst.getOperand(3);
8604
8605 Register CondReg = Cond.getReg();
8606 bool IsSCC = (CondReg == AMDGPU::SCC);
8607
8608 // If this is a trivial select where the condition is effectively not SCC
8609 // (CondReg is a source of copy to SCC), then the select is semantically
8610 // equivalent to copying CondReg. Hence, there is no need to create
8611 // V_CNDMASK, we can just use that and bail out.
8612 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8613 (Src1.getImm() == 0)) {
8614 MRI.replaceRegWith(Dest.getReg(), CondReg);
8615 return;
8616 }
8617
8618 Register NewCondReg = CondReg;
8619 if (IsSCC) {
8620 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8621 NewCondReg = MRI.createVirtualRegister(TC);
8622
8623 // Now look for the closest SCC def if it is a copy
8624 // replacing the CondReg with the COPY source register
8625 bool CopyFound = false;
8626 for (MachineInstr &CandI :
8628 Inst.getParent()->rend())) {
8629 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8630 -1) {
8631 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8632 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8633 .addReg(CandI.getOperand(1).getReg());
8634 CopyFound = true;
8635 }
8636 break;
8637 }
8638 }
8639 if (!CopyFound) {
8640 // SCC def is not a copy
8641 // Insert a trivial select instead of creating a copy, because a copy from
8642 // SCC would semantically mean just copying a single bit, but we may need
8643 // the result to be a vector condition mask that needs preserving.
8644 unsigned Opcode =
8645 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8646 auto NewSelect =
8647 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8648 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8649 }
8650 }
8651
8652 Register NewDestReg = MRI.createVirtualRegister(
8653 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8654 MachineInstr *NewInst;
8655 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8656 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8657 .addImm(0)
8658 .add(Src1) // False
8659 .addImm(0)
8660 .add(Src0) // True
8661 .addReg(NewCondReg);
8662 } else {
8663 NewInst =
8664 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8665 .add(Src1) // False
8666 .add(Src0) // True
8667 .addReg(NewCondReg);
8668 }
8669 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8670 legalizeOperands(*NewInst, MDT);
8671 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8672}
8673
8674void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8675 MachineInstr &Inst) const {
8676 MachineBasicBlock &MBB = *Inst.getParent();
8677 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8678 MachineBasicBlock::iterator MII = Inst;
8679 const DebugLoc &DL = Inst.getDebugLoc();
8680
8681 MachineOperand &Dest = Inst.getOperand(0);
8682 MachineOperand &Src = Inst.getOperand(1);
8683 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8684 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8685
8686 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8687 : AMDGPU::V_SUB_CO_U32_e32;
8688
8689 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8690 .addImm(0)
8691 .addReg(Src.getReg());
8692
8693 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8694 .addReg(Src.getReg())
8695 .addReg(TmpReg);
8696
8697 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8698 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8699}
8700
8701void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8702 MachineInstr &Inst) const {
8703 MachineBasicBlock &MBB = *Inst.getParent();
8704 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8705 MachineBasicBlock::iterator MII = Inst;
8706 const DebugLoc &DL = Inst.getDebugLoc();
8707
8708 MachineOperand &Dest = Inst.getOperand(0);
8709 MachineOperand &Src1 = Inst.getOperand(1);
8710 MachineOperand &Src2 = Inst.getOperand(2);
8711 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8712 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8713 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8714
8715 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8716 : AMDGPU::V_SUB_CO_U32_e32;
8717
8718 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8719 .addReg(Src1.getReg())
8720 .addReg(Src2.getReg());
8721
8722 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8723
8724 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8725 .addReg(SubResultReg)
8726 .addReg(TmpReg);
8727
8728 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8729 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8730}
8731
8732void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8733 MachineInstr &Inst) const {
8734 MachineBasicBlock &MBB = *Inst.getParent();
8735 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8736 MachineBasicBlock::iterator MII = Inst;
8737 const DebugLoc &DL = Inst.getDebugLoc();
8738
8739 MachineOperand &Dest = Inst.getOperand(0);
8740 MachineOperand &Src0 = Inst.getOperand(1);
8741 MachineOperand &Src1 = Inst.getOperand(2);
8742
8743 if (ST.hasDLInsts()) {
8744 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8745 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8746 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8747
8748 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8749 .add(Src0)
8750 .add(Src1);
8751
8752 MRI.replaceRegWith(Dest.getReg(), NewDest);
8753 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8754 } else {
8755 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8756 // invert either source and then perform the XOR. If either source is a
8757 // scalar register, then we can leave the inversion on the scalar unit to
8758 // achieve a better distribution of scalar and vector instructions.
8759 bool Src0IsSGPR = Src0.isReg() &&
8760 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8761 bool Src1IsSGPR = Src1.isReg() &&
8762 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8763 MachineInstr *Xor;
8764 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8765 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8766
8767 // Build a pair of scalar instructions and add them to the work list.
8768 // The next iteration over the work list will lower these to the vector
8769 // unit as necessary.
8770 if (Src0IsSGPR) {
8771 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8772 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8773 .addReg(Temp)
8774 .add(Src1);
8775 } else if (Src1IsSGPR) {
8776 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8777 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8778 .add(Src0)
8779 .addReg(Temp);
8780 } else {
8781 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8782 .add(Src0)
8783 .add(Src1);
8784 MachineInstr *Not =
8785 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8786 Worklist.insert(Not);
8787 }
8788
8789 MRI.replaceRegWith(Dest.getReg(), NewDest);
8790
8791 Worklist.insert(Xor);
8792
8793 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8794 }
8795}
8796
8797void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8798 MachineInstr &Inst,
8799 unsigned Opcode) const {
8800 MachineBasicBlock &MBB = *Inst.getParent();
8801 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8802 MachineBasicBlock::iterator MII = Inst;
8803 const DebugLoc &DL = Inst.getDebugLoc();
8804
8805 MachineOperand &Dest = Inst.getOperand(0);
8806 MachineOperand &Src0 = Inst.getOperand(1);
8807 MachineOperand &Src1 = Inst.getOperand(2);
8808
8809 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8810 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8811
8812 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8813 .add(Src0)
8814 .add(Src1);
8815
8816 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8817 .addReg(Interm);
8818
8819 Worklist.insert(&Op);
8820 Worklist.insert(&Not);
8821
8822 MRI.replaceRegWith(Dest.getReg(), NewDest);
8823 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8824}
8825
8826void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8827 MachineInstr &Inst,
8828 unsigned Opcode) const {
8829 MachineBasicBlock &MBB = *Inst.getParent();
8830 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8831 MachineBasicBlock::iterator MII = Inst;
8832 const DebugLoc &DL = Inst.getDebugLoc();
8833
8834 MachineOperand &Dest = Inst.getOperand(0);
8835 MachineOperand &Src0 = Inst.getOperand(1);
8836 MachineOperand &Src1 = Inst.getOperand(2);
8837
8838 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8839 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8840
8841 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8842 .add(Src1);
8843
8844 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8845 .add(Src0)
8846 .addReg(Interm);
8847
8848 Worklist.insert(&Not);
8849 Worklist.insert(&Op);
8850
8851 MRI.replaceRegWith(Dest.getReg(), NewDest);
8852 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8853}
8854
8855void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8856 MachineInstr &Inst, unsigned Opcode,
8857 bool Swap) const {
8858 MachineBasicBlock &MBB = *Inst.getParent();
8859 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8860
8861 MachineOperand &Dest = Inst.getOperand(0);
8862 MachineOperand &Src0 = Inst.getOperand(1);
8863 const DebugLoc &DL = Inst.getDebugLoc();
8864
8865 MachineBasicBlock::iterator MII = Inst;
8866
8867 const MCInstrDesc &InstDesc = get(Opcode);
8868 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8869 MRI.getRegClass(Src0.getReg()) :
8870 &AMDGPU::SGPR_32RegClass;
8871
8872 const TargetRegisterClass *Src0SubRC =
8873 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8874
8875 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8876 AMDGPU::sub0, Src0SubRC);
8877
8878 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8879 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8880 const TargetRegisterClass *NewDestSubRC =
8881 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8882
8883 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8884 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8885
8886 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8887 AMDGPU::sub1, Src0SubRC);
8888
8889 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8890 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8891
8892 if (Swap)
8893 std::swap(DestSub0, DestSub1);
8894
8895 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8896 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8897 .addReg(DestSub0)
8898 .addImm(AMDGPU::sub0)
8899 .addReg(DestSub1)
8900 .addImm(AMDGPU::sub1);
8901
8902 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8903
8904 Worklist.insert(&LoHalf);
8905 Worklist.insert(&HiHalf);
8906
8907 // We don't need to legalizeOperands here because for a single operand, src0
8908 // will support any kind of input.
8909
8910 // Move all users of this moved value.
8911 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8912}
8913
8914// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8915// split the s_mul_u64 in 32-bit vector multiplications.
8916void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8917 MachineInstr &Inst,
8918 MachineDominatorTree *MDT) const {
8919 MachineBasicBlock &MBB = *Inst.getParent();
8920 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8921
8922 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8923 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8924 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8925
8926 MachineOperand &Dest = Inst.getOperand(0);
8927 MachineOperand &Src0 = Inst.getOperand(1);
8928 MachineOperand &Src1 = Inst.getOperand(2);
8929 const DebugLoc &DL = Inst.getDebugLoc();
8930 MachineBasicBlock::iterator MII = Inst;
8931
8932 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8933 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8934 const TargetRegisterClass *Src0SubRC =
8935 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8936 if (RI.isSGPRClass(Src0SubRC))
8937 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8938 const TargetRegisterClass *Src1SubRC =
8939 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8940 if (RI.isSGPRClass(Src1SubRC))
8941 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8942
8943 // First, we extract the low 32-bit and high 32-bit values from each of the
8944 // operands.
8945 MachineOperand Op0L =
8946 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8947 MachineOperand Op1L =
8948 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8949 MachineOperand Op0H =
8950 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8951 MachineOperand Op1H =
8952 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8953
8954 // The multilication is done as follows:
8955 //
8956 // Op1H Op1L
8957 // * Op0H Op0L
8958 // --------------------
8959 // Op1H*Op0L Op1L*Op0L
8960 // + Op1H*Op0H Op1L*Op0H
8961 // -----------------------------------------
8962 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8963 //
8964 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8965 // value and that would overflow.
8966 // The low 32-bit value is Op1L*Op0L.
8967 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8968
8969 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8970 MachineInstr *Op1L_Op0H =
8971 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8972 .add(Op1L)
8973 .add(Op0H);
8974
8975 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8976 MachineInstr *Op1H_Op0L =
8977 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8978 .add(Op1H)
8979 .add(Op0L);
8980
8981 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8982 MachineInstr *Carry =
8983 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8984 .add(Op1L)
8985 .add(Op0L);
8986
8987 MachineInstr *LoHalf =
8988 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8989 .add(Op1L)
8990 .add(Op0L);
8991
8992 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8993 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8994 .addReg(Op1L_Op0H_Reg)
8995 .addReg(Op1H_Op0L_Reg);
8996
8997 MachineInstr *HiHalf =
8998 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8999 .addReg(AddReg)
9000 .addReg(CarryReg);
9001
9002 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9003 .addReg(DestSub0)
9004 .addImm(AMDGPU::sub0)
9005 .addReg(DestSub1)
9006 .addImm(AMDGPU::sub1);
9007
9008 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9009
9010 // Try to legalize the operands in case we need to swap the order to keep it
9011 // valid.
9012 legalizeOperands(*Op1L_Op0H, MDT);
9013 legalizeOperands(*Op1H_Op0L, MDT);
9014 legalizeOperands(*Carry, MDT);
9015 legalizeOperands(*LoHalf, MDT);
9016 legalizeOperands(*Add, MDT);
9017 legalizeOperands(*HiHalf, MDT);
9018
9019 // Move all users of this moved value.
9020 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9021}
9022
9023// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
9024// multiplications.
9025void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9026 MachineInstr &Inst,
9027 MachineDominatorTree *MDT) const {
9028 MachineBasicBlock &MBB = *Inst.getParent();
9029 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9030
9031 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9032 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9033 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9034
9035 MachineOperand &Dest = Inst.getOperand(0);
9036 MachineOperand &Src0 = Inst.getOperand(1);
9037 MachineOperand &Src1 = Inst.getOperand(2);
9038 const DebugLoc &DL = Inst.getDebugLoc();
9039 MachineBasicBlock::iterator MII = Inst;
9040
9041 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9042 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9043 const TargetRegisterClass *Src0SubRC =
9044 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9045 if (RI.isSGPRClass(Src0SubRC))
9046 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9047 const TargetRegisterClass *Src1SubRC =
9048 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9049 if (RI.isSGPRClass(Src1SubRC))
9050 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9051
9052 // First, we extract the low 32-bit and high 32-bit values from each of the
9053 // operands.
9054 MachineOperand Op0L =
9055 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9056 MachineOperand Op1L =
9057 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9058
9059 unsigned Opc = Inst.getOpcode();
9060 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9061 ? AMDGPU::V_MUL_HI_U32_e64
9062 : AMDGPU::V_MUL_HI_I32_e64;
9063 MachineInstr *HiHalf =
9064 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9065
9066 MachineInstr *LoHalf =
9067 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9068 .add(Op1L)
9069 .add(Op0L);
9070
9071 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9072 .addReg(DestSub0)
9073 .addImm(AMDGPU::sub0)
9074 .addReg(DestSub1)
9075 .addImm(AMDGPU::sub1);
9076
9077 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9078
9079 // Try to legalize the operands in case we need to swap the order to keep it
9080 // valid.
9081 legalizeOperands(*HiHalf, MDT);
9082 legalizeOperands(*LoHalf, MDT);
9083
9084 // Move all users of this moved value.
9085 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9086}
9087
9088void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9089 MachineInstr &Inst, unsigned Opcode,
9090 MachineDominatorTree *MDT) const {
9091 MachineBasicBlock &MBB = *Inst.getParent();
9092 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9093
9094 MachineOperand &Dest = Inst.getOperand(0);
9095 MachineOperand &Src0 = Inst.getOperand(1);
9096 MachineOperand &Src1 = Inst.getOperand(2);
9097 const DebugLoc &DL = Inst.getDebugLoc();
9098
9099 MachineBasicBlock::iterator MII = Inst;
9100
9101 const MCInstrDesc &InstDesc = get(Opcode);
9102 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9103 MRI.getRegClass(Src0.getReg()) :
9104 &AMDGPU::SGPR_32RegClass;
9105
9106 const TargetRegisterClass *Src0SubRC =
9107 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9108 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9109 MRI.getRegClass(Src1.getReg()) :
9110 &AMDGPU::SGPR_32RegClass;
9111
9112 const TargetRegisterClass *Src1SubRC =
9113 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9114
9115 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9116 AMDGPU::sub0, Src0SubRC);
9117 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9118 AMDGPU::sub0, Src1SubRC);
9119 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9120 AMDGPU::sub1, Src0SubRC);
9121 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9122 AMDGPU::sub1, Src1SubRC);
9123
9124 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9125 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9126 const TargetRegisterClass *NewDestSubRC =
9127 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9128
9129 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9130 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9131 .add(SrcReg0Sub0)
9132 .add(SrcReg1Sub0);
9133
9134 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9135 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9136 .add(SrcReg0Sub1)
9137 .add(SrcReg1Sub1);
9138
9139 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9140 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9141 .addReg(DestSub0)
9142 .addImm(AMDGPU::sub0)
9143 .addReg(DestSub1)
9144 .addImm(AMDGPU::sub1);
9145
9146 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9147
9148 Worklist.insert(&LoHalf);
9149 Worklist.insert(&HiHalf);
9150
9151 // Move all users of this moved value.
9152 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9153}
9154
9155void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9156 MachineInstr &Inst,
9157 MachineDominatorTree *MDT) const {
9158 MachineBasicBlock &MBB = *Inst.getParent();
9159 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9160
9161 MachineOperand &Dest = Inst.getOperand(0);
9162 MachineOperand &Src0 = Inst.getOperand(1);
9163 MachineOperand &Src1 = Inst.getOperand(2);
9164 const DebugLoc &DL = Inst.getDebugLoc();
9165
9166 MachineBasicBlock::iterator MII = Inst;
9167
9168 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9169
9170 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9171
9172 MachineOperand* Op0;
9173 MachineOperand* Op1;
9174
9175 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9176 Op0 = &Src0;
9177 Op1 = &Src1;
9178 } else {
9179 Op0 = &Src1;
9180 Op1 = &Src0;
9181 }
9182
9183 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9184 .add(*Op0);
9185
9186 Register NewDest = MRI.createVirtualRegister(DestRC);
9187
9188 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9189 .addReg(Interm)
9190 .add(*Op1);
9191
9192 MRI.replaceRegWith(Dest.getReg(), NewDest);
9193
9194 Worklist.insert(&Xor);
9195}
9196
9197void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9198 MachineInstr &Inst) const {
9199 MachineBasicBlock &MBB = *Inst.getParent();
9200 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9201
9202 MachineBasicBlock::iterator MII = Inst;
9203 const DebugLoc &DL = Inst.getDebugLoc();
9204
9205 MachineOperand &Dest = Inst.getOperand(0);
9206 MachineOperand &Src = Inst.getOperand(1);
9207
9208 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9209 const TargetRegisterClass *SrcRC = Src.isReg() ?
9210 MRI.getRegClass(Src.getReg()) :
9211 &AMDGPU::SGPR_32RegClass;
9212
9213 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9214 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9215
9216 const TargetRegisterClass *SrcSubRC =
9217 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9218
9219 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9220 AMDGPU::sub0, SrcSubRC);
9221 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9222 AMDGPU::sub1, SrcSubRC);
9223
9224 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9225
9226 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9227
9228 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9229
9230 // We don't need to legalize operands here. src0 for either instruction can be
9231 // an SGPR, and the second input is unused or determined here.
9232 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9233}
9234
9235void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9236 MachineInstr &Inst) const {
9237 MachineBasicBlock &MBB = *Inst.getParent();
9238 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9239 MachineBasicBlock::iterator MII = Inst;
9240 const DebugLoc &DL = Inst.getDebugLoc();
9241
9242 MachineOperand &Dest = Inst.getOperand(0);
9243 uint32_t Imm = Inst.getOperand(2).getImm();
9244 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9245 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9246
9247 (void) Offset;
9248
9249 // Only sext_inreg cases handled.
9250 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9251 Offset == 0 && "Not implemented");
9252
9253 if (BitWidth < 32) {
9254 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9255 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9256 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9257
9258 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9259 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9260 .addImm(0)
9261 .addImm(BitWidth);
9262
9263 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9264 .addImm(31)
9265 .addReg(MidRegLo);
9266
9267 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9268 .addReg(MidRegLo)
9269 .addImm(AMDGPU::sub0)
9270 .addReg(MidRegHi)
9271 .addImm(AMDGPU::sub1);
9272
9273 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9274 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9275 return;
9276 }
9277
9278 MachineOperand &Src = Inst.getOperand(1);
9279 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9280 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9281
9282 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9283 .addImm(31)
9284 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9285
9286 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9287 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9288 .addImm(AMDGPU::sub0)
9289 .addReg(TmpReg)
9290 .addImm(AMDGPU::sub1);
9291
9292 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9293 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9294}
9295
/// Expand a 64-bit scalar bit-count instruction into 32-bit VALU instructions
/// following the patterns listed at the top of the body.
///
/// \p Opcode is the 32-bit count opcode applied separately to the sub0 and
/// sub1 halves of operand 1. When it is V_FFBH_U32_e32 (IsCtlz) the low-half
/// count is the one biased by 32; otherwise the high-half count is biased.
/// All uses of the original destination register are replaced by the new
/// result register, which is then handed to addUsersToMoveToVALUWorklist.
/// \p MDT is not referenced in this function body.
9296void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9297 MachineInstr &Inst, unsigned Opcode,
9298 MachineDominatorTree *MDT) const {
9299 // (S_FLBIT_I32_B64 hi:lo) ->
9300 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9301 // (S_FF1_I32_B64 hi:lo) ->
9302 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9303
9304 MachineBasicBlock &MBB = *Inst.getParent();
9305 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9306 MachineBasicBlock::iterator MII = Inst;
9307 const DebugLoc &DL = Inst.getDebugLoc();
9308
9309 MachineOperand &Dest = Inst.getOperand(0);
9310 MachineOperand &Src = Inst.getOperand(1);
9311
9312 const MCInstrDesc &InstDesc = get(Opcode);
9313
9314 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
 // NOTE(review): both add opcodes below receive the same operand list
 // (register, 32, clamp); confirm that V_ADD_CO_U32_e32 accepts a clamp
 // operand in that position.
9315 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9316 : AMDGPU::V_ADD_CO_U32_e32;
9317
 // A non-register source falls back to SGPR_32 as its register class.
9318 const TargetRegisterClass *SrcRC =
9319 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9320 const TargetRegisterClass *SrcSubRC =
9321 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9322
 // Split the 64-bit source into its low (sub0) and high (sub1) halves.
9323 MachineOperand SrcRegSub0 =
9324 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9325 MachineOperand SrcRegSub1 =
9326 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9327
9328 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9329 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9330 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9331 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9332
 // MidReg1 = count(low half).
9333 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9334
 // MidReg2 = count(high half).
9335 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9336
 // MidReg3 = one of the two counts plus 32, with the clamp bit set.
9337 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9338 .addReg(IsCtlz ? MidReg1 : MidReg2)
9339 .addImm(32)
9340 .addImm(1); // enable clamp
9341
 // MidReg4 = unsigned min of the biased count and the other half's count.
9342 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9343 .addReg(MidReg3)
9344 .addReg(IsCtlz ? MidReg2 : MidReg1);
9345
9346 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9347
9348 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9349}
9350
/// Visit every use operand of \p DstReg and either queue the using instruction
/// on \p Worklist or legalize it in place.
///
/// For each use, the register class expected at the relevant operand is
/// looked up and \p DstReg is constrained to it. Users whose operand class has
/// no vector registers are inserted into \p Worklist; all other users are
/// passed to legalizeOperandsVALUt16.
9351void SIInstrInfo::addUsersToMoveToVALUWorklist(
9352 Register DstReg, MachineRegisterInfo &MRI,
9353 SIInstrWorklist &Worklist) const {
 // Early-increment iteration: the loop body may change the use list (see the
 // legalization call at the bottom).
9354 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9355 MachineInstr &UseMI = *MO.getParent();
9356
9357 unsigned OpNo = 0;
9358
 // For the opcodes listed here the class of operand 0 is queried; for
 // everything else the class of the operand that actually uses DstReg.
9359 switch (UseMI.getOpcode()) {
9360 case AMDGPU::COPY:
9361 case AMDGPU::WQM:
9362 case AMDGPU::SOFT_WQM:
9363 case AMDGPU::STRICT_WWM:
9364 case AMDGPU::STRICT_WQM:
9365 case AMDGPU::REG_SEQUENCE:
9366 case AMDGPU::PHI:
9367 case AMDGPU::INSERT_SUBREG:
9368 break;
9369 default:
9370 OpNo = MO.getOperandNo();
9371 break;
9372 }
9373
 // The result of constrainRegClass is not checked here.
9374 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9375 MRI.constrainRegClass(DstReg, OpRC);
9376
9377 if (!RI.hasVectorRegisters(OpRC))
9378 Worklist.insert(&UseMI);
9379 else
9380 // Legalization could change user list.
9381 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9382 }
9383}
9384
/// Replace an S_PACK_{LL,LH,HL,HH}_B32_B16 instruction with VALU code that
/// produces the packed value in a fresh VGPR_32 virtual register.
///
/// With real true16 instructions the result is assembled by a REG_SEQUENCE of
/// 16-bit subregisters; otherwise it is computed with 32-bit mask/shift/or
/// instructions. In both paths every use of the original destination is
/// replaced by the new register, which is then handed to
/// addUsersToMoveToVALUWorklist. Any other opcode hits llvm_unreachable.
///
/// NOTE(review): the body uses `MRI`, which the visible signature does not
/// declare; a parameter line appears to be missing from this listing.
9385void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9387 MachineInstr &Inst) const {
9388 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9389 MachineBasicBlock *MBB = Inst.getParent();
9390 MachineOperand &Src0 = Inst.getOperand(1);
9391 MachineOperand &Src1 = Inst.getOperand(2);
9392 const DebugLoc &DL = Inst.getDebugLoc();
9393
9394 if (ST.useRealTrue16Insts()) {
9395 Register SrcReg0, SrcReg1;
 // A source that is not already a VGPR register is first moved into a new
 // VGPR_32: V_MOV_B32_e32 when it is an immediate, COPY otherwise.
9396 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9397 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9398 BuildMI(*MBB, Inst, DL,
9399 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9400 .add(Src0);
9401 } else {
9402 SrcReg0 = Src0.getReg();
9403 }
9404
9405 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9406 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9407 BuildMI(*MBB, Inst, DL,
9408 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9409 .add(Src1);
9410 } else {
9411 SrcReg1 = Src1.getReg();
9412 }
9413
 // The constrainRegClass results are converted to bool. A true flag means
 // the source is used without a subregister index where its low half is
 // wanted; a false flag selects the lo16 subregister instead.
9414 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9415 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9416
 // The REG_SEQUENCE is created first; each case appends the lo16 input
 // followed by the hi16 input.
9417 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9418 switch (Inst.getOpcode()) {
9419 case AMDGPU::S_PACK_LL_B32_B16:
9420 NewMI
9421 .addReg(SrcReg0, {},
9422 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9423 .addImm(AMDGPU::lo16)
9424 .addReg(SrcReg1, {},
9425 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9426 .addImm(AMDGPU::hi16);
9427 break;
9428 case AMDGPU::S_PACK_LH_B32_B16:
9429 NewMI
9430 .addReg(SrcReg0, {},
9431 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9432 .addImm(AMDGPU::lo16)
9433 .addReg(SrcReg1, {}, AMDGPU::hi16)
9434 .addImm(AMDGPU::hi16);
9435 break;
9436 case AMDGPU::S_PACK_HL_B32_B16:
9437 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9438 .addImm(AMDGPU::lo16)
9439 .addReg(SrcReg1, {},
9440 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9441 .addImm(AMDGPU::hi16);
9442 break;
9443 case AMDGPU::S_PACK_HH_B32_B16:
9444 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9445 .addImm(AMDGPU::lo16)
9446 .addReg(SrcReg1, {}, AMDGPU::hi16)
9447 .addImm(AMDGPU::hi16);
9448 break;
9449 default:
9450 llvm_unreachable("unhandled s_pack_* instruction");
9451 }
9452
9453 MachineOperand &Dest = Inst.getOperand(0);
9454 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9455 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9456 return;
9457 }
9458
 // Without real true16 instructions: build the packed value with 32-bit
 // VALU instructions. The per-case notes below go by the opcode names.
9459 switch (Inst.getOpcode()) {
9460 case AMDGPU::S_PACK_LL_B32_B16: {
9461 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9462 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9463
9464 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9465 // 0.
9466 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9467 .addImm(0xffff);
9468
 // TmpReg = 0xffff & Src0.
9469 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9470 .addReg(ImmReg, RegState::Kill)
9471 .add(Src0);
9472
 // ResultReg = (Src1 << 16) | TmpReg.
9473 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9474 .add(Src1)
9475 .addImm(16)
9476 .addReg(TmpReg, RegState::Kill);
9477 break;
9478 }
9479 case AMDGPU::S_PACK_LH_B32_B16: {
9480 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9481 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9482 .addImm(0xffff);
 // Bitfield insert with mask 0xffff; presumably the masked (low) bits come
 // from Src0 and the remaining bits from Src1 - confirm against the ISA.
9483 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9484 .addReg(ImmReg, RegState::Kill)
9485 .add(Src0)
9486 .add(Src1);
9487 break;
9488 }
9489 case AMDGPU::S_PACK_HL_B32_B16: {
9490 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 // TmpReg = Src0 >> 16 (logical).
9491 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9492 .addImm(16)
9493 .add(Src0);
 // ResultReg = (Src1 << 16) | TmpReg.
9494 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9495 .add(Src1)
9496 .addImm(16)
9497 .addReg(TmpReg, RegState::Kill);
9498 break;
9499 }
9500 case AMDGPU::S_PACK_HH_B32_B16: {
9501 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9502 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 // TmpReg = Src0 >> 16 (logical).
9503 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9504 .addImm(16)
9505 .add(Src0);
9506 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9507 .addImm(0xffff0000);
 // ResultReg = (Src1 & 0xffff0000) | TmpReg.
9508 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9509 .add(Src1)
9510 .addReg(ImmReg, RegState::Kill)
9511 .addReg(TmpReg, RegState::Kill);
9512 break;
9513 }
9514 default:
9515 llvm_unreachable("unhandled s_pack_* instruction");
9516 }
9517
9518 MachineOperand &Dest = Inst.getOperand(0);
9519 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9520 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9521}
9522
/// Process the instructions that read the SCC value defined by \p SCCDefInst.
///
/// The scan starts at the instruction after \p SCCDefInst and runs to the end
/// of its block, stopping after the first instruction that defines SCC again.
/// A COPY that reads SCC has all uses of its destination replaced by
/// \p NewCond and is erased once the scan has finished. Any other reader has
/// its SCC operand rewritten to \p NewCond (only when \p NewCond is valid) and
/// is inserted into \p Worklist.
///
/// \p Op must be the live (non-dead) SCC def operand of \p SCCDefInst.
9523void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9524 MachineInstr &SCCDefInst,
9525 SIInstrWorklist &Worklist,
9526 Register NewCond) const {
9527
9528 // Ensure that def inst defines SCC, which is still live.
9529 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9530 !Op.isDead() && Op.getParent() == &SCCDefInst);
 // Copies are collected here and erased after the loop so that the range
 // being iterated is not modified during the scan.
9531 SmallVector<MachineInstr *, 4> CopyToDelete;
9532 // This assumes that all the users of SCC are in the same block
9533 // as the SCC def.
9534 for (MachineInstr &MI : // Skip the def inst itself.
9535 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9536 SCCDefInst.getParent()->end())) {
9537 // Check if SCC is used first.
9538 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9539 if (SCCIdx != -1) {
9540 if (MI.isCopy()) {
9541 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9542 Register DestReg = MI.getOperand(0).getReg();
9543
 // NOTE(review): unlike the non-copy branch, NewCond is used here
 // without an isValid() check - confirm callers guarantee validity.
9544 MRI.replaceRegWith(DestReg, NewCond);
9545 CopyToDelete.push_back(&MI);
9546 } else {
9547
9548 if (NewCond.isValid())
9549 MI.getOperand(SCCIdx).setReg(NewCond);
9550
9551 Worklist.insert(&MI);
9552 }
9553 }
9554 // Exit if we find another SCC def.
 // This check runs after the use handling above, so an instruction that
 // both reads and redefines SCC is still processed as a reader.
9555 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9556 break;
9557 }
9558 for (auto &Copy : CopyToDelete)
9559 Copy->eraseFromParent();
9560}
9561
9562// Instructions that use SCC may be converted to VALU instructions. When that
9563// happens, the SCC register is changed to VCC_LO. The instruction that defines
9564// SCC must be changed to an instruction that defines VCC. This function makes
9565// sure that the instruction that defines SCC is added to the moveToVALU
9566// worklist.
9567void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9568 SIInstrWorklist &Worklist) const {
9569 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9570 // then there is nothing to do because the defining instruction has been
9571 // converted to a VALU already. If SCC then that instruction needs to be
9572 // converted to a VALU.
9573 for (MachineInstr &MI :
9574 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9575 SCCUseInst->getParent()->rend())) {
9576 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9577 break;
9578 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9579 Worklist.insert(&MI);
9580 break;
9581 }
9582 }
9583}
9584
9585const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9586 const MachineInstr &Inst) const {
9587 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9588
9589 switch (Inst.getOpcode()) {
9590 // For target instructions, getOpRegClass just returns the virtual register
9591 // class associated with the operand, so we need to find an equivalent VGPR
9592 // register class in order to move the instruction to the VALU.
9593 case AMDGPU::COPY:
9594 case AMDGPU::PHI:
9595 case AMDGPU::REG_SEQUENCE:
9596 case AMDGPU::INSERT_SUBREG:
9597 case AMDGPU::WQM:
9598 case AMDGPU::SOFT_WQM:
9599 case AMDGPU::STRICT_WWM:
9600 case AMDGPU::STRICT_WQM: {
9601 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9602 if (RI.isAGPRClass(SrcRC)) {
9603 if (RI.isAGPRClass(NewDstRC))
9604 return nullptr;
9605
9606 switch (Inst.getOpcode()) {
9607 case AMDGPU::PHI:
9608 case AMDGPU::REG_SEQUENCE:
9609 case AMDGPU::INSERT_SUBREG:
9610 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9611 break;
9612 default:
9613 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9614 }
9615
9616 if (!NewDstRC)
9617 return nullptr;
9618 } else {
9619 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9620 return nullptr;
9621
9622 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9623 if (!NewDstRC)
9624 return nullptr;
9625 }
9626
9627 return NewDstRC;
9628 }
9629 default:
9630 return NewDstRC;
9631 }
9632}
9633
9634// Find the one SGPR operand we are allowed to use.
9635Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9636 int OpIndices[3]) const {
9637 const MCInstrDesc &Desc = MI.getDesc();
9638
9639 // Find the one SGPR operand we are allowed to use.
9640 //
9641 // First we need to consider the instruction's operand requirements before
9642 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9643 // of VCC, but we are still bound by the constant bus requirement to only use
9644 // one.
9645 //
9646 // If the operand's class is an SGPR, we can never move it.
9647
9648 Register SGPRReg = findImplicitSGPRRead(MI);
9649 if (SGPRReg)
9650 return SGPRReg;
9651
9652 Register UsedSGPRs[3] = {Register()};
9653 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9654
9655 for (unsigned i = 0; i < 3; ++i) {
9656 int Idx = OpIndices[i];
9657 if (Idx == -1)
9658 break;
9659
9660 const MachineOperand &MO = MI.getOperand(Idx);
9661 if (!MO.isReg())
9662 continue;
9663
9664 // Is this operand statically required to be an SGPR based on the operand
9665 // constraints?
9666 const TargetRegisterClass *OpRC =
9667 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9668 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9669 if (IsRequiredSGPR)
9670 return MO.getReg();
9671
9672 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9673 Register Reg = MO.getReg();
9674 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9675 if (RI.isSGPRClass(RegRC))
9676 UsedSGPRs[i] = Reg;
9677 }
9678
9679 // We don't have a required SGPR operand, so we have a bit more freedom in
9680 // selecting operands to move.
9681
9682 // Try to select the most used SGPR. If an SGPR is equal to one of the
9683 // others, we choose that.
9684 //
9685 // e.g.
9686 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9687 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9688
9689 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9690 // prefer those.
9691
9692 if (UsedSGPRs[0]) {
9693 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9694 SGPRReg = UsedSGPRs[0];
9695 }
9696
9697 if (!SGPRReg && UsedSGPRs[1]) {
9698 if (UsedSGPRs[1] == UsedSGPRs[2])
9699 SGPRReg = UsedSGPRs[1];
9700 }
9701
9702 return SGPRReg;
9703}
9704
9706 AMDGPU::OpName OperandName) const {
9707 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9708 return nullptr;
9709
9710 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9711 if (Idx == -1)
9712 return nullptr;
9713
9714 return &MI.getOperand(Idx);
9715}
9716
9718 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9719 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9722 return (Format << 44) |
9723 (1ULL << 56) | // RESOURCE_LEVEL = 1
9724 (3ULL << 60); // OOB_SELECT = 3
9725 }
9726
9727 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9728 if (ST.isAmdHsaOS()) {
9729 // Set ATC = 1. GFX9 doesn't have this bit.
9730 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9731 RsrcDataFormat |= (1ULL << 56);
9732
9733 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9734 // BTW, it disables TC L2 and therefore decreases performance.
9735 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9736 RsrcDataFormat |= (2ULL << 59);
9737 }
9738
9739 return RsrcDataFormat;
9740}
9741
9745 0xffffffff; // Size;
9746
9747 // GFX9 doesn't have ELEMENT_SIZE.
9748 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9749 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9750 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9751 }
9752
9753 // IndexStride = 64 / 32.
9754 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9755 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9756
9757 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9758 // Clear them unless we want a huge stride.
9759 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9760 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9761 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9762
9763 return Rsrc23;
9764}
9765
9767 unsigned Opc = MI.getOpcode();
9768
9769 return isSMRD(Opc);
9770}
9771
9773 return get(Opc).mayLoad() &&
9774 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9775}
9776
9778 TypeSize &MemBytes) const {
9779 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9780 if (!Addr || !Addr->isFI())
9781 return Register();
9782
9783 assert(!MI.memoperands_empty() &&
9784 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9785
9786 FrameIndex = Addr->getIndex();
9787
9788 int VDataIdx =
9789 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9790 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9791 return MI.getOperand(VDataIdx).getReg();
9792}
9793
9795 TypeSize &MemBytes) const {
9796 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9797 assert(Addr && Addr->isFI());
9798 FrameIndex = Addr->getIndex();
9799
9800 int DataIdx =
9801 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9802 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9803 return MI.getOperand(DataIdx).getReg();
9804}
9805
9807 int &FrameIndex,
9808 TypeSize &MemBytes) const {
9809 if (!MI.mayLoad())
9810 return Register();
9811
9812 if (isMUBUF(MI) || isVGPRSpill(MI))
9813 return isStackAccess(MI, FrameIndex, MemBytes);
9814
9815 if (isSGPRSpill(MI))
9816 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9817
9818 return Register();
9819}
9820
9822 int &FrameIndex,
9823 TypeSize &MemBytes) const {
9824 if (!MI.mayStore())
9825 return Register();
9826
9827 if (isMUBUF(MI) || isVGPRSpill(MI))
9828 return isStackAccess(MI, FrameIndex, MemBytes);
9829
9830 if (isSGPRSpill(MI))
9831 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9832
9833 return Register();
9834}
9835
9837 unsigned Opc = MI.getOpcode();
9839 unsigned DescSize = Desc.getSize();
9840
9841 // If we have a definitive size, we can use it. Otherwise we need to inspect
9842 // the operands to know the size.
9843 if (isFixedSize(MI)) {
9844 unsigned Size = DescSize;
9845
9846 // If we hit the buggy offset, an extra nop will be inserted in MC so
9847 // estimate the worst case.
9848 if (MI.isBranch() && ST.hasOffset3fBug())
9849 Size += 4;
9850
9851 return Size;
9852 }
9853
9854 // Instructions may have a 32-bit literal encoded after them. Check
9855 // operands that could ever be literals.
9856 if (isVALU(MI, /*AllowLDSDMA=*/true) || isSALU(MI)) {
9857 if (isDPP(MI))
9858 return DescSize;
9859 bool HasLiteral = false;
9860 unsigned LiteralSize = 4;
9861 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9862 const MachineOperand &Op = MI.getOperand(I);
9863 const MCOperandInfo &OpInfo = Desc.operands()[I];
9864 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9865 HasLiteral = true;
9866 if (ST.has64BitLiterals()) {
9867 switch (OpInfo.OperandType) {
9868 default:
9869 break;
9872 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9873 LiteralSize = 8;
9874 break;
9877 // A 32-bit literal is only valid when the value fits in BOTH signed
9878 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9879 // emitter's getLit64Encoding logic. This is because of the lack of
9880 // ability to tell the signedness of the literal; therefore we need to
9881 // be conservative and assume values outside this range require a
9882 // 64-bit literal encoding (8 bytes).
9883 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9884 !isUInt<32>(Op.getImm()))
9885 LiteralSize = 8;
9886 break;
9887 }
9888 }
9889 break;
9890 }
9891 }
9892 return HasLiteral ? DescSize + LiteralSize : DescSize;
9893 }
9894
9895 // Check whether we have extra NSA words.
9896 if (isMIMG(MI)) {
9897 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9898 if (VAddr0Idx < 0)
9899 return 8;
9900
9901 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9902 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9903 }
9904
9905 switch (Opc) {
9906 case TargetOpcode::BUNDLE:
9907 return getInstBundleSize(MI);
9908 case TargetOpcode::INLINEASM:
9909 case TargetOpcode::INLINEASM_BR: {
9910 const MachineFunction *MF = MI.getMF();
9911 const char *AsmStr = MI.getOperand(0).getSymbolName();
9912 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
9913 }
9914 default:
9915 if (MI.isMetaInstruction())
9916 return 0;
9917
9918 // If D16 Pseudo inst, get correct MC code size
9919 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9920 if (D16Info) {
9921 // Assume the d16_lo/hi instructions are always the same size
9922 unsigned LoInstOpcode = D16Info->LoOp;
9923 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9924 DescSize = Desc.getSize();
9925 }
9926
9927 // If FMA Pseudo inst, get correct MC code size
9928 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9929 // All potential lowerings are the same size; arbitrarily pick one.
9930 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9931 DescSize = Desc.getSize();
9932 }
9933
9934 return DescSize;
9935 }
9936}
9937
9940 if (MI.isBranch() && ST.hasOffset3fBug())
9941 return InstSizeVerifyMode::NoVerify;
9942 return InstSizeVerifyMode::ExactSize;
9943}
9944
9946 if (!isFLAT(MI))
9947 return false;
9948
9949 if (MI.memoperands_empty())
9950 return true;
9951
9952 for (const MachineMemOperand *MMO : MI.memoperands()) {
9954 return true;
9955 }
9956 return false;
9957}
9958
9961 static const std::pair<int, const char *> TargetIndices[] = {
9962 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9963 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9964 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9965 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9966 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9967 return ArrayRef(TargetIndices);
9968}
9969
9970/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9971/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9977
9978/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9979/// pass.
9985
9986// Called during:
9987// - pre-RA scheduling and post-RA scheduling
9990 const ScheduleDAGMI *DAG) const {
9991 // Borrowed from Arm Target
9992 // We would like to restrict this hazard recognizer to only
9993 // post-RA scheduling; we can tell that we're post-RA because we don't
9994 // track VRegLiveness.
9995 if (!DAG->hasVRegLiveness())
9996 return new GCNHazardRecognizer(DAG->MF);
9998}
9999
10000std::pair<unsigned, unsigned>
10002 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10003}
10004
10007 static const std::pair<unsigned, const char *> TargetFlags[] = {
10008 {MO_GOTPCREL, "amdgpu-gotprel"},
10009 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10010 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10011 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10012 {MO_REL32_LO, "amdgpu-rel32-lo"},
10013 {MO_REL32_HI, "amdgpu-rel32-hi"},
10014 {MO_REL64, "amdgpu-rel64"},
10015 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10016 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10017 {MO_ABS64, "amdgpu-abs64"},
10018 };
10019
10020 return ArrayRef(TargetFlags);
10021}
10022
10025 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10026 {
10027 {MONoClobber, "amdgpu-noclobber"},
10028 {MOLastUse, "amdgpu-last-use"},
10029 {MOCooperative, "amdgpu-cooperative"},
10030 {MOThreadPrivate, "amdgpu-thread-private"},
10031 };
10032
10033 return ArrayRef(TargetFlags);
10034}
10035
10037 const MachineFunction &MF) const {
10039 assert(SrcReg.isVirtual());
10040 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10041 return AMDGPU::WWM_COPY;
10042
10043 return AMDGPU::COPY;
10044}
10045
10047 uint32_t Opcode = MI.getOpcode();
10048 // Check if it is SGPR spill or wwm-register spill Opcode.
10049 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10050 return true;
10051
10052 const MachineFunction *MF = MI.getMF();
10053 const MachineRegisterInfo &MRI = MF->getRegInfo();
10055
10056 // See if this is a live-range split instruction inserted for an SGPR or
10057 // wwm-register. The implicit def inserted for wwm-registers should also be
10058 // included as they can appear at the bb begin.
10059 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10060 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10061 return false;
10062
10063 Register Reg = MI.getOperand(0).getReg();
10064 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10065 return IsLRSplitInst;
10066
10067 return MFI->isWWMReg(Reg);
10068}
10069
10071 Register Reg) const {
10072 // We need to handle instructions which may be inserted during register
10073 // allocation to handle the prolog. The initial prolog instruction may have
10074 // been separated from the start of the block by spills and copies inserted
10075 // needed by the prolog. However, the insertions for scalar registers can
10076 // always be placed at the BB top as they are independent of the exec mask
10077 // value.
10078 bool IsNullOrVectorRegister = true;
10079 if (Reg) {
10080 const MachineFunction *MF = MI.getMF();
10081 const MachineRegisterInfo &MRI = MF->getRegInfo();
10082 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10083 }
10084
10085 return IsNullOrVectorRegister &&
10086 (canAddToBBProlog(MI) ||
10087 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10088 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10089}
10090
10094 const DebugLoc &DL,
10095 Register DestReg) const {
10096 if (ST.hasAddNoCarryInsts())
10097 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10098
10099 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10100 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10101 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10102
10103 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10104 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10105}
10106
10109 const DebugLoc &DL,
10110 Register DestReg,
10111 RegScavenger &RS) const {
10112 if (ST.hasAddNoCarryInsts())
10113 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10114
10115 // If available, prefer to use vcc.
10116 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10117 ? Register(RI.getVCC())
10118 : RS.scavengeRegisterBackwards(
10119 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10120 0, /* AllowSpill */ false);
10121
10122 // TODO: Users need to deal with this.
10123 if (!UnusedCarry.isValid())
10124 return MachineInstrBuilder();
10125
10126 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10127 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10128}
10129
10130bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10131 switch (Opcode) {
10132 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10133 case AMDGPU::SI_KILL_I1_TERMINATOR:
10134 return true;
10135 default:
10136 return false;
10137 }
10138}
10139
10141 switch (Opcode) {
10142 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10143 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10144 case AMDGPU::SI_KILL_I1_PSEUDO:
10145 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10146 default:
10147 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10148 }
10149}
10150
10151bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10152 return Imm <= getMaxMUBUFImmOffset(ST);
10153}
10154
10156 // GFX12 field is non-negative 24-bit signed byte offset.
10157 const unsigned OffsetBits =
10158 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10159 return (1 << OffsetBits) - 1;
10160}
10161
10163 if (!ST.isWave32())
10164 return;
10165
10166 if (MI.isInlineAsm())
10167 return;
10168
10169 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10170 return;
10171
10172 for (auto &Op : MI.implicit_operands()) {
10173 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10174 Op.setReg(AMDGPU::VCC_LO);
10175 }
10176}
10177
10179 if (!isSMRD(MI))
10180 return false;
10181
10182 // Check that it is using a buffer resource.
10183 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10184 if (Idx == -1) // e.g. s_memtime
10185 return false;
10186
10187 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10188 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10189}
10190
10191// Given Imm, split it into the values to put into the SOffset and ImmOffset
10192// fields in an MUBUF instruction. Return false if it is not possible (due to a
10193// hardware bug needing a workaround).
10194//
10195// The required alignment ensures that individual address components remain
10196// aligned if they are aligned to begin with. It also ensures that additional
10197// offsets within the given alignment can be added to the resulting ImmOffset.
10199 uint32_t &ImmOffset, Align Alignment) const {
10200 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10201 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10202 uint32_t Overflow = 0;
10203
10204 if (Imm > MaxImm) {
10205 if (Imm <= MaxImm + 64) {
10206 // Use an SOffset inline constant for 4..64
10207 Overflow = Imm - MaxImm;
10208 Imm = MaxImm;
10209 } else {
10210 // Try to keep the same value in SOffset for adjacent loads, so that
10211 // the corresponding register contents can be re-used.
10212 //
10213 // Load values with all low-bits (except for alignment bits) set into
10214 // SOffset, so that a larger range of values can be covered using
10215 // s_movk_i32.
10216 //
10217 // Atomic operations fail to work correctly when individual address
10218 // components are unaligned, even if their sum is aligned.
10219 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10220 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10221 Imm = Low;
10222 Overflow = High - Alignment.value();
10223 }
10224 }
10225
10226 if (Overflow > 0) {
10227 // There is a hardware bug in SI and CI which prevents address clamping in
10228 // MUBUF instructions from working correctly with SOffsets. The immediate
10229 // offset is unaffected.
10230 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10231 return false;
10232
10233 // It is not possible to set immediate in SOffset field on some targets.
10234 if (ST.hasRestrictedSOffset())
10235 return false;
10236 }
10237
10238 ImmOffset = Imm;
10239 SOffset = Overflow;
10240 return true;
10241}
10242
10243// Depending on the used address space and instructions, some immediate offsets
10244// are allowed and some are not.
10245// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10246// scratch instruction offsets can also be negative. On GFX12, offsets can be
10247// negative for all variants.
10248//
10249// There are several bugs related to these offsets:
10250// On gfx10.1, flat instructions that go into the global address space cannot
10251// use an offset.
10252//
10253// For scratch instructions, the address can be either an SGPR or a VGPR.
10254// The following offsets can be used, depending on the architecture (x means
10255// cannot be used):
10256// +----------------------------+------+------+
10257// | Address-Mode | SGPR | VGPR |
10258// +----------------------------+------+------+
10259// | gfx9 | | |
10260// | negative, 4-aligned offset | x | ok |
10261// | negative, unaligned offset | x | ok |
10262// +----------------------------+------+------+
10263// | gfx10 | | |
10264// | negative, 4-aligned offset | ok | ok |
10265// | negative, unaligned offset | ok | x |
10266// +----------------------------+------+------+
10267// | gfx10.3 | | |
10268// | negative, 4-aligned offset | ok | ok |
10269// | negative, unaligned offset | ok | ok |
10270// +----------------------------+------+------+
10271//
10272// This function ignores the addressing mode, so if an offset cannot be used in
10273// one addressing mode, it is considered illegal.
10274bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10275 AMDGPU::FlatAddrSpace FlatVariant) const {
10276 // TODO: Should 0 be special cased?
 // No offset is legal on subtargets without flat instruction offsets.
10277 if (!ST.hasFlatInstOffsets())
10278 return false;
10279
 // With the flat segment offset bug, the FLAT variant may not use any offset
 // for flat or global address space accesses.
10281 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == FlatAddrSpace::FLAT &&
10282 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10283 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10284 return false;
10285
 // With the negative unaligned scratch offset bug, scratch offsets that are
 // negative and not a multiple of 4 are rejected.
10286 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10287 FlatVariant == FlatAddrSpace::FlatScratch && Offset < 0 &&
10288 (Offset % 4) != 0) {
10289 return false;
10290 }
10291
 // The offset must fit in a signed N-bit field, and must additionally be
 // non-negative unless this variant allows negative offsets.
10292 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10293 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10294 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10295}
10296
10297// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10298std::pair<int64_t, int64_t>
10299SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10300 AMDGPU::FlatAddrSpace FlatVariant) const {
10301 int64_t RemainderOffset = COffsetVal;
10302 int64_t ImmField = 0;
10303
10304 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10305 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10306
10307 if (AllowNegative) {
10308 // Use signed division by a power of two to truncate towards 0.
10309 int64_t D = 1LL << NumBits;
10310 RemainderOffset = (COffsetVal / D) * D;
10311 ImmField = COffsetVal - RemainderOffset;
10312
10313 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10314 FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch && ImmField < 0 &&
10315 (ImmField % 4) != 0) {
10316 // Make ImmField a multiple of 4
10317 RemainderOffset += ImmField % 4;
10318 ImmField -= ImmField % 4;
10319 }
10320 } else if (COffsetVal >= 0) {
10321 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10322 RemainderOffset = COffsetVal - ImmField;
10323 }
10324
10325 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10326 assert(RemainderOffset + ImmField == COffsetVal);
10327 return {ImmField, RemainderOffset};
10328}
10329
10331 AMDGPU::FlatAddrSpace FlatVariant) const {
// First rejection applies only on subtargets with the negative scratch
// offset bug.
10332 if (ST.hasNegativeScratchOffsetBug() &&
10334 return false;
10335
// Otherwise every variant other than FLAT allows negative offsets, and the
// FLAT variant allows them on GFX12 and later.
10336 return FlatVariant != AMDGPU::FlatAddrSpace::FLAT || AMDGPU::isGFX12Plus(ST);
10337}
10338
// Selects the SIEncodingFamily value that corresponds to the generation (and,
// for some generations, the instruction-set features) of the subtarget ST.
10339static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10340 switch (ST.getGeneration()) {
10341 default:
10342 break;
10345 return SIEncodingFamily::SI;
10348 return SIEncodingFamily::VI;
10352 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10355 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10359 }
// Only reached through the default case of the switch above.
10360 llvm_unreachable("Unknown subtarget generation!");
10361}
10362
10363bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10364 switch(MCOp) {
10365 // These opcodes use indirect register addressing so
10366 // they need special handling by codegen (currently missing).
10367 // Therefore it is too risky to allow these opcodes
10368 // to be selected by dpp combiner or sdwa peepholer.
10369 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10370 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10371 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10372 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10373 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10374 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10375 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10376 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10377 return true;
10378 default:
10379 return false;
10380 }
10381}
10382
10383#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10384 case OPCODE##_dpp: \
10385 case OPCODE##_e32: \
10386 case OPCODE##_e64: \
10387 case OPCODE##_e64_dpp: \
10388 case OPCODE##_sdwa:
10389
10390static bool isRenamedInGFX9(int Opcode) {
10391 switch (Opcode) {
10392 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10393 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10394 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10395 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10396 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10397 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10398 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10399 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10400 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10401 //
10402 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10403 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10404 case AMDGPU::V_FMA_F16_gfx9_e64:
10405 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10406 case AMDGPU::V_INTERP_P2_F16:
10407 case AMDGPU::V_MAD_F16_e64:
10408 case AMDGPU::V_MAD_U16_e64:
10409 case AMDGPU::V_MAD_I16_e64:
10410 return true;
10411 default:
10412 return false;
10413 }
10414}
10415
// Maps Opcode to the MC opcode to use for this subtarget.
// Returns Opcode itself when the lookup reports that it is already a native
// instruction, and -1 when there is no encoding for the subtarget or when the
// resulting MC opcode is asm-only (see isAsmOnlyOpcode).
10416int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10417 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10418 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10419
// Start from the encoding family derived from the subtarget; the checks below
// may adjust it for particular instruction classes.
10420 unsigned Gen = subtargetEncodingFamily(ST);
10421
10422 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10424
10425 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10426 // subtarget has UnpackedD16VMem feature.
10427 // TODO: remove this when we discard GFX80 encoding.
10428 if (ST.hasUnpackedD16VMem() && SIInstrFlags::isD16Buf(get(Opcode)))
10430
10431 if (SIInstrFlags::isSDWA(get(Opcode))) {
10432 switch (ST.getGeneration()) {
10433 default:
10435 break;
10438 break;
10441 break;
10442 }
10443 }
10444
// For MAI opcodes, switch to the opcode returned by getMFMAEarlyClobberOp
// whenever it is not -1.
10445 if (isMAI(Opcode)) {
10446 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10447 if (MFMAOp != -1)
10448 Opcode = MFMAOp;
10449 }
10450
10451 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10452
10453 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10455
10456 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10458
10459 // -1 means that Opcode is already a native instruction.
10460 if (MCOp == -1)
10461 return Opcode;
10462
// On GFX90A-capable subtargets a more specific opcode (NMCOp) replaces MCOp
// when one is found.
10463 if (ST.hasGFX90AInsts()) {
10464 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10465 if (ST.hasGFX940Insts())
10467 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10469 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10471 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10472 MCOp = NMCOp;
10473 }
10474
10475 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10476 // encoding in the given subtarget generation.
10477 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10478 return -1;
10479
// Asm-only opcodes are reported the same way as opcodes without an encoding.
10480 if (isAsmOnlyOpcode(MCOp))
10481 return -1;
10482
10483 return MCOp;
10484}
10485
10486static
10488 assert(RegOpnd.isReg());
// An undef operand yields a default-constructed RegSubRegPair; any other
// register operand yields its register/subregister pair.
10489 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10490 getRegSubRegPair(RegOpnd);
10491}
10492
10495 assert(MI.isRegSequence());
// The operands after operand 0 are visited in pairs: operand 1 + 2 * I is the
// register and operand 1 + 2 * I + 1 is the immediate subregister index. The
// first pair whose index equals SubReg provides the result.
10496 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10497 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10498 auto &RegOp = MI.getOperand(1 + 2 * I);
10499 return getRegOrUndef(RegOp);
10500 }
10502}
10503
10504// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10505// Following a subreg of reg:subreg isn't supported
10508 if (!RSR.SubReg)
10509 return false;
// On success RSR is updated in place to the pair that provides the requested
// subregister, and true is returned.
10510 switch (MI.getOpcode()) {
10511 default: break;
10512 case AMDGPU::REG_SEQUENCE:
10513 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10514 return true;
10515 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10516 case AMDGPU::INSERT_SUBREG:
10517 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10518 // inserted the subreg we're looking for
10519 RSR = getRegOrUndef(MI.getOperand(2));
10520 else { // the subreg in the rest of the reg
10521 auto R1 = getRegOrUndef(MI.getOperand(1));
10522 if (R1.SubReg) // subreg of subreg isn't supported
10523 return false;
// Keep RSR.SubReg and continue the lookup in the base register (operand 1).
10524 RSR.Reg = R1.Reg;
10525 }
10526 return true;
10527 }
10528 return false;
10529}
10530
10532 const MachineRegisterInfo &MRI) {
10533 assert(MRI.isSSA());
// Only virtual registers are followed; anything else has no result.
10534 if (!P.Reg.isVirtual())
10535 return nullptr;
10536
// Walk the chain of defining instructions. Each iteration either finds a
// further definition to look through (DefInst is set again) or returns the
// current instruction as the final definition.
10537 auto RSR = P;
10538 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10539 while (auto *MI = DefInst) {
10540 DefInst = nullptr;
10541 switch (MI->getOpcode()) {
10542 case AMDGPU::COPY:
10543 case AMDGPU::V_MOV_B32_e32: {
// Look through copies/moves whose source is a virtual register; an undef
// source means there is no definition to report.
10544 auto &Op1 = MI->getOperand(1);
10545 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10546 if (Op1.isUndef())
10547 return nullptr;
10548 RSR = getRegSubRegPair(Op1);
10549 DefInst = MRI.getVRegDef(RSR.Reg);
10550 }
10551 break;
10552 }
10553 default:
// followSubRegDef rewrites RSR on success; an empty register after that
// (e.g. from an undef operand) ends the search without a result.
10554 if (followSubRegDef(*MI, RSR)) {
10555 if (!RSR.Reg)
10556 return nullptr;
10557 DefInst = MRI.getVRegDef(RSR.Reg);
10558 }
10559 }
10560 if (!DefInst)
10561 return MI;
10562 }
10563 return nullptr;
10564}
10565
10567 Register VReg,
10568 const MachineInstr &DefMI,
10569 const MachineInstr &UseMI) {
10570 assert(MRI.isSSA() && "Must be run on SSA");
10571
10572 auto *TRI = MRI.getTargetRegisterInfo();
10573 auto *DefBB = DefMI.getParent();
10574
10575 // Don't bother searching between blocks, although it is possible this block
10576 // doesn't modify exec.
10577 if (UseMI.getParent() != DefBB)
10578 return true;
10579
// At most MaxInstScan non-debug instructions are inspected; exceeding the
// limit is answered conservatively with true.
10580 const int MaxInstScan = 20;
10581 int NumInst = 0;
10582
10583 // Stop scan at the use.
10584 auto E = UseMI.getIterator();
10585 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10586 if (I->isDebugInstr())
10587 continue;
10588
10589 if (++NumInst > MaxInstScan)
10590 return true;
10591
10592 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10593 return true;
10594 }
10595
// No instruction strictly between DefMI and UseMI modifies EXEC.
10596 return false;
10597}
10598
10600 Register VReg,
10601 const MachineInstr &DefMI) {
10602 assert(MRI.isSSA() && "Must be run on SSA");
10603
10604 auto *TRI = MRI.getTargetRegisterInfo();
10605 auto *DefBB = DefMI.getParent();
10606
10607 const int MaxUseScan = 10;
10608 int NumUse = 0;
10609
// First pass: count the non-debug uses of VReg. A use in another block, a
// PHI use, or more than MaxUseScan uses is answered conservatively with true.
10610 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10611 auto &UseInst = *Use.getParent();
10612 // Don't bother searching between blocks, although it is possible this block
10613 // doesn't modify exec.
10614 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10615 return true;
10616
10617 if (++NumUse > MaxUseScan)
10618 return true;
10619 }
10620
// Without any use there is nothing EXEC could be modified before.
10621 if (NumUse == 0)
10622 return false;
10623
10624 const int MaxInstScan = 20;
10625 int NumInst = 0;
10626
10627 // Stop scan when we have seen all the uses.
// The loop has no exit condition of its own: all counted uses are in DefBB,
// so the scan returns before running off the end of the block.
10628 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10629 assert(I != DefBB->end());
10630
10631 if (I->isDebugInstr())
10632 continue;
10633
10634 if (++NumInst > MaxInstScan)
10635 return true;
10636
10637 for (const MachineOperand &Op : I->operands()) {
10638 // We don't check reg masks here as they're used only on calls:
10639 // 1. EXEC is only considered const within one BB
10640 // 2. Call should be a terminator instruction if present in a BB
10641
10642 if (!Op.isReg())
10643 continue;
10644
// Use operands count down the remaining uses of VReg; any non-use register
// operand that overlaps EXEC means EXEC is modified.
10645 Register Reg = Op.getReg();
10646 if (Op.isUse()) {
10647 if (Reg == VReg && --NumUse == 0)
10648 return false;
10649 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10650 return true;
10651 }
10652 }
10653}
10654
10657 const DebugLoc &DL, Register Src, Register Dst) const {
// Scan from the start of the block up to LastPHIIt. If a non-PHI instruction
// in that range reads Dst, the COPY is inserted directly before it.
10658 auto Cur = MBB.begin();
10659 if (Cur != MBB.end())
10660 do {
10661 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10662 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10663 ++Cur;
10664 } while (Cur != MBB.end() && Cur != LastPHIIt);
10665
// Otherwise defer to the generic TargetInstrInfo implementation.
10666 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10667 Dst);
10668}
10669
10672 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
// Special case: the insertion point is an SI_IF / SI_ELSE / SI_IF_BREAK that
// defines Src. The copy is then emitted after that instruction, using the
// lane-mask MovTermOpc opcode with an implicit EXEC operand instead of a COPY.
10673 if (InsPt != MBB.end() &&
10674 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10675 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10676 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10677 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10678 InsPt++;
10679 return BuildMI(MBB, InsPt, DL,
10680 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10681 .addReg(Src, {}, SrcSubReg)
10682 .addReg(AMDGPU::EXEC, RegState::Implicit);
10683 }
// Otherwise defer to the generic TargetInstrInfo implementation.
10684 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10685 Dst);
10686}
10687
10688bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10689
10691 const MachineInstr &SecondMI) const {
// True as soon as one register used by SecondMI is modified by FirstMI.
10692 for (const auto &Use : SecondMI.all_uses()) {
10693 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10694 return true;
10695 }
10696 return false;
10697}
10698
10699/// If OpX is multicycle, anti-dependencies are not allowed.
10700/// isDPMACCInstruction was not designed for VOPD, but it fits this
10701/// purpose.
10703 const MachineInstr &OpX) const {
10705}
10706
10709 ArrayRef<unsigned> Ops, int FrameIndex,
10710 MachineInstr *&CopyMI, LiveIntervals *LIS,
10711 VirtRegMap *VRM) const {
10712 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10713 //
10714 // %0:sreg_32 = COPY $m0
10715 //
10716 // We explicitly chose SReg_32 for the virtual register so such a copy might
10717 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10718 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10719 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10720 // TargetInstrInfo::foldMemoryOperand() is going to try.
10721 // A similar issue also exists with spilling and reloading $exec registers.
10722 //
10723 // To prevent that, constrain the %0 register class here.
10724 if (isFullCopyInstr(MI)) {
10725 Register DstReg = MI.getOperand(0).getReg();
10726 Register SrcReg = MI.getOperand(1).getReg();
// Only copies between exactly one virtual and one non-virtual register are
// of interest here.
10727 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10728 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10729 MachineRegisterInfo &MRI = MF.getRegInfo();
10730 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10731 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10732 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10733 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10734 return nullptr;
10735 }
10736 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10737 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10738 return nullptr;
10739 }
10740 }
10741 }
10742
// Every path returns nullptr; the only effect of this function is the
// register-class constraint applied above.
10743 return nullptr;
10744}
10745
10747 const MachineInstr &MI,
10748 unsigned *PredCost) const {
// For a bundle, the result is the largest latency among the bundled
// instructions plus the number of bundled instructions minus one.
10749 if (MI.isBundle()) {
10751 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10752 unsigned Lat = 0, Count = 0;
10753 for (++I; I != E && I->isBundledWithPred(); ++I) {
10754 ++Count;
10755 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10756 }
// NOTE(review): Lat and Count are unsigned, so this wraps around if the loop
// above never runs (Count == 0) - confirm that a BUNDLE always has at least
// one bundled instruction here.
10757 return Lat + Count - 1;
10758 }
10759
// Non-bundle instructions use the scheduling model directly.
10760 return SchedModel.computeInstrLatency(&MI);
10761}
10762
10763const MachineOperand &
// Prefer the operand named src0 when the instruction has one.
10765 if (const MachineOperand *CallAddrOp =
10766 getNamedOperand(MI, AMDGPU::OpName::src0))
10767 return *CallAddrOp;
10769}
10770
10773 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10774 unsigned Opcode = MI.getOpcode();
10775
// Helper for address space casts. The source register is operand 2 for the
// intrinsic form and operand 1 otherwise; the condition below tests for a
// private -> flat cast on a subtarget with globally addressable scratch.
10776 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10777 Register Dst = MI.getOperand(0).getReg();
10778 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10779 : MI.getOperand(1).getReg();
10780 LLT DstTy = MRI.getType(Dst);
10781 LLT SrcTy = MRI.getType(Src);
10782 unsigned DstAS = DstTy.getAddressSpace();
10783 unsigned SrcAS = SrcTy.getAddressSpace();
10784 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10785 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10786 ST.hasGloballyAddressableScratch()
10789 };
10790
10791 // If the target supports globally addressable scratch, the mapping from
10792 // scratch memory to the flat aperture changes therefore an address space cast
10793 // is no longer uniform.
10794 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10795 return HandleAddrSpaceCast(MI);
10796
// Generic intrinsics are classified by their intrinsic ID.
10797 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10798 auto IID = GI->getIntrinsicID();
10803
10804 switch (IID) {
10805 case Intrinsic::amdgcn_addrspacecast_nonnull:
10806 return HandleAddrSpaceCast(MI);
10807 case Intrinsic::amdgcn_if:
10808 case Intrinsic::amdgcn_else:
10809 // FIXME: Uniform if second result
10810 break;
10811 }
10812
10814 }
10815
10816 // Loads from the private and flat address spaces are divergent, because
10817 // threads can execute the load instruction with the same inputs and get
10818 // different results.
10819 //
10820 // All other loads are not divergent, because if threads issue loads with the
10821 // same arguments, they will always get the same result.
10822 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10823 Opcode == AMDGPU::G_SEXTLOAD) {
10824 if (MI.memoperands_empty())
10825 return ValueUniformity::NeverUniform; // conservative assumption
10826
10827 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10828 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10829 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10830 })) {
10831 // At least one MMO in a non-global address space.
10833 }
10835 }
10836
// Generic atomic read-modify-write, compare-exchange and other generic
// atomic opcodes share one classification.
10837 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10838 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10839 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10840 AMDGPU::isGenericAtomic(Opcode)) {
10842 }
10843
10844 // Result is computed from uniform SP and uniform wave-wide max size.
10845 if (Opcode == TargetOpcode::G_DYN_STACKALLOC)
10847
10848 if (Opcode == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
10850
10852}
10853
// The AMDGPUMIRFormatter is created on first request and then reused.
10855 if (!Formatter)
10856 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10857 return Formatter.get();
10858}
10859
10861
10862 if (isNeverUniform(MI))
10864
// V_READLANE_B32, V_READFIRSTLANE_B32 and SI_RESTORE_S32_FROM_VGPR are
// classified together.
10865 unsigned opcode = MI.getOpcode();
10866 if (opcode == AMDGPU::V_READLANE_B32 ||
10867 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10868 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10870
10871 // If any of defs is divergent, report as NeverUniform. isUniformReg will
10872 // calculate in more detail for each def from its reg class, if available.
10873 if (MI.isInlineAsm()) {
10874 for (const MachineOperand &MO : MI.operands()) {
10875 if (!MO.isReg() || !MO.isDef())
10876 continue;
// A def without a register class constraint, or with a non-SGPR class,
// triggers the early result.
10877 const TargetRegisterClass *RC =
10878 MI.getRegClassConstraint(MO.getOperandNo(), this, &RI);
10879 if (!RC || !RI.isSGPRClass(RC))
10881 }
10882 }
10883
// Copies from a physical register are classified by the base register class
// of the source.
10884 if (isCopyInstr(MI)) {
10885 const MachineOperand &srcOp = MI.getOperand(1);
10886 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10887 const TargetRegisterClass *regClass =
10888 RI.getPhysRegBaseClass(srcOp.getReg());
10889 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10891 }
10893 }
10894
10895 // GMIR handling
10896 if (MI.isPreISelOpcode())
10898
10899 // Atomics are divergent because they are executed sequentially: when an
10900 // atomic operation refers to the same address in each thread, then each
10901 // thread after the first sees the value written by the previous thread as
10902 // original value.
10903
10904 if (isAtomic(MI))
10906
10907 // Loads from the private and flat address spaces are divergent, because
10908 // threads can execute the load instruction with the same inputs and get
10909 // different results.
10910 if (isFLAT(MI) && MI.mayLoad()) {
10911 if (MI.memoperands_empty())
10912 return ValueUniformity::NeverUniform; // conservative assumption
10913
10914 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10915 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10916 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10917 })) {
10918 // At least one MMO in a non-global address space.
10920 }
10921
10923 }
10924
10925 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10926 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10927
10928 // FIXME: It's conceptually broken to report this for an instruction, and not
10929 // a specific def operand. For inline asm in particular, there could be mixed
10930 // uniform and divergent results.
// Inspect every register operand that is read; operands that are not
// registers, have no register, or are not read are skipped.
10931 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10932 const MachineOperand &SrcOp = MI.getOperand(I);
10933 if (!SrcOp.isReg())
10934 continue;
10935
10936 Register Reg = SrcOp.getReg();
10937 if (!Reg || !SrcOp.readsReg())
10938 continue;
10939
10940 // If RegBank is null, this is unassigned or an unallocatable special
10941 // register, which are all scalars.
10942 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10943 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10945 }
10946
10947 // TODO: Uniformity check conditions above can be rearranged for more
10948 // readability
10949
10950 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10951 // currently turned into no-op COPYs by SelectionDAG ISel and are
10952 // therefore no longer recognizable.
10953
10955}
10956
// The returned small integer is selected by the calling convention of the
// function that MF was created from.
10958 switch (MF.getFunction().getCallingConv()) {
10960 return 1;
10962 return 2;
10964 return 3;
// This case reports an "unsupported" diagnostic on the function's context and
// then falls through to the default result of 0.
10968 const Function &F = MF.getFunction();
10969 F.getContext().diagnose(DiagnosticInfoUnsupported(
10970 F, "ds_ordered_count unsupported for this calling conv"));
10971 [[fallthrough]];
10972 }
10975 case CallingConv::C:
10976 case CallingConv::Fast:
10977 default:
10978 // Assume other calling conventions are various compute callable functions
10979 return 0;
10980 }
10981}
10982
10984 Register &SrcReg2, int64_t &CmpMask,
10985 int64_t &CmpValue) const {
// Operand 0 has to be a register without a subregister index.
10986 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10987 return false;
10988
10989 switch (MI.getOpcode()) {
10990 default:
10991 break;
// S_CMP_* forms: operand 1 is either a register (without subregister index),
// reported through SrcReg2 with CmpValue 0, or an immediate, reported through
// CmpValue with an empty SrcReg2. Any other operand kind is not analyzable.
10992 case AMDGPU::S_CMP_EQ_U32:
10993 case AMDGPU::S_CMP_EQ_I32:
10994 case AMDGPU::S_CMP_LG_U32:
10995 case AMDGPU::S_CMP_LG_I32:
10996 case AMDGPU::S_CMP_LT_U32:
10997 case AMDGPU::S_CMP_LT_I32:
10998 case AMDGPU::S_CMP_GT_U32:
10999 case AMDGPU::S_CMP_GT_I32:
11000 case AMDGPU::S_CMP_LE_U32:
11001 case AMDGPU::S_CMP_LE_I32:
11002 case AMDGPU::S_CMP_GE_U32:
11003 case AMDGPU::S_CMP_GE_I32:
11004 case AMDGPU::S_CMP_EQ_U64:
11005 case AMDGPU::S_CMP_LG_U64:
11006 SrcReg = MI.getOperand(0).getReg();
11007 if (MI.getOperand(1).isReg()) {
11008 if (MI.getOperand(1).getSubReg())
11009 return false;
11010 SrcReg2 = MI.getOperand(1).getReg();
11011 CmpValue = 0;
11012 } else if (MI.getOperand(1).isImm()) {
11013 SrcReg2 = Register();
11014 CmpValue = MI.getOperand(1).getImm();
11015 } else {
11016 return false;
11017 }
11018 CmpMask = ~0;
11019 return true;
// S_CMPK_* forms: operand 1 is read as an immediate unconditionally.
11020 case AMDGPU::S_CMPK_EQ_U32:
11021 case AMDGPU::S_CMPK_EQ_I32:
11022 case AMDGPU::S_CMPK_LG_U32:
11023 case AMDGPU::S_CMPK_LG_I32:
11024 case AMDGPU::S_CMPK_LT_U32:
11025 case AMDGPU::S_CMPK_LT_I32:
11026 case AMDGPU::S_CMPK_GT_U32:
11027 case AMDGPU::S_CMPK_GT_I32:
11028 case AMDGPU::S_CMPK_LE_U32:
11029 case AMDGPU::S_CMPK_LE_I32:
11030 case AMDGPU::S_CMPK_GE_U32:
11031 case AMDGPU::S_CMPK_GE_I32:
11032 SrcReg = MI.getOperand(0).getReg();
11033 SrcReg2 = Register();
11034 CmpValue = MI.getOperand(1).getImm();
11035 CmpMask = ~0;
11036 return true;
11037 }
11038
// Any other opcode is not a compare this function understands.
11039 return false;
11040}
11041
// SCC counts as dead on exit when no successor block has it as a live-in.
11043 for (MachineBasicBlock *S : MBB->successors()) {
11044 if (S->isLiveIn(AMDGPU::SCC))
11045 return false;
11046 }
11047 return true;
11048}
11049
11050// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11051// (incoming SCC) = !(SCC defined by SCCDef).
11052// Return true if all uses can be re-written, false otherwise.
11053bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11054 MachineBasicBlock *MBB = SCCDef->getParent();
11055 SmallVector<MachineInstr *> InvertInstr;
11056 bool SCCIsDead = false;
11057
11058 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11059 constexpr unsigned ScanLimit = 12;
11060 unsigned Count = 0;
11061 for (MachineInstr &MI :
11062 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11063 if (++Count > ScanLimit)
11064 return false;
11065 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11066 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11067 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11068 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11069 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11070 InvertInstr.push_back(&MI);
11071 else
11072 return false;
11073 }
11074 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11075 SCCIsDead = true;
11076 break;
11077 }
11078 }
11079 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11080 SCCIsDead = true;
11081
11082 // SCC may have more uses. Can't invert all of them.
11083 if (!SCCIsDead)
11084 return false;
11085
11086 // Invert uses
11087 for (MachineInstr *MI : InvertInstr) {
11088 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11089 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11090 swapOperands(*MI);
11091 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11092 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11093 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11094 ? AMDGPU::S_CBRANCH_SCC1
11095 : AMDGPU::S_CBRANCH_SCC0));
11096 } else {
11097 llvm_unreachable("SCC used but no inversion handling");
11098 }
11099 }
11100 return true;
11101}
11102
11103// SCC is already valid after SCCValid.
11104// SCCRedefine will redefine SCC to the same value already available after
11105// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11106// update kill/dead flags if necessary.
11107bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11108 bool NeedInversion) const {
11109 MachineInstr *KillsSCC = nullptr;
11110 if (SCCValid->getParent() != SCCRedefine->getParent())
11111 return false;
11112 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11113 SCCRedefine->getIterator())) {
11114 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11115 return false;
11116 if (MI.killsRegister(AMDGPU::SCC, &RI))
11117 KillsSCC = &MI;
11118 }
11119 if (NeedInversion && !invertSCCUse(SCCRedefine))
11120 return false;
11121 if (MachineOperand *SccDef =
11122 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11123 SccDef->setIsDead(false);
11124 if (KillsSCC)
11125 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11126 SCCRedefine->eraseFromParent();
11127 return true;
11128}
11129
11130static bool foldableSelect(const MachineInstr &Def) {
11131 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11132 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11133 return false;
11134 bool Op1IsNonZeroImm =
11135 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11136 bool Op2IsZeroImm =
11137 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11138 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11139 return false;
11140 return true;
11141}
11142
11143static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11144 unsigned &NewDefOpc) {
11145 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11146 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11147 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11148 Def.getOpcode() != AMDGPU::S_ADD_U32)
11149 return false;
11150 const MachineOperand &AddSrc1 = Def.getOperand(1);
11151 const MachineOperand &AddSrc2 = Def.getOperand(2);
11152 int64_t addend;
11153
11154 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11155 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11156 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11157 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11158 return false;
11159
11160 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11161 const MachineOperand *SccDef =
11162 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11163 if (!SccDef->isDead())
11164 return false;
11165 NewDefOpc = AMDGPU::S_ADD_U32;
11166 }
11167 NeedInversion = !NeedInversion;
11168 return true;
11169}
11170
11172 Register SrcReg2, int64_t CmpMask,
11173 int64_t CmpValue,
11174 const MachineRegisterInfo *MRI) const {
11175 if (!SrcReg || SrcReg.isPhysical())
11176 return false;
11177
11178 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11179 return false;
11180
11181 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11182 this](bool NeedInversion) -> bool {
11183 if (CmpValue != 0)
11184 return false;
11185
11186 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11187 if (!Def)
11188 return false;
11189
11190 // For S_OP that set SCC = DST!=0, do the transformation
11191 //
11192 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11193 //
11194 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11195 // do the transformation:
11196 //
11197 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11198 //
11199 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11200 // for S_CSELECT* already has the same value that will be calculated by
11201 // s_cmp_lg_*
11202 //
11203 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11204 // (non-zero imm), 0)
11205
11206 unsigned NewDefOpc = Def->getOpcode();
11207 if (!setsSCCIfResultIsNonZero(*Def) &&
11208 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11209 !foldableSelect(*Def))
11210 return false;
11211
11212 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11213 return false;
11214
11215 if (NewDefOpc != Def->getOpcode())
11216 Def->setDesc(get(NewDefOpc));
11217
11218 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11219 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11220 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11221 // sX = s_cselect_b64 (non-zero imm), 0
11222 // sLo = copy sX.sub0
11223 // sHi = copy sX.sub1
11224 // sY = s_or_b32 sLo, sHi
11225 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11226 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11227 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11228 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11229 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11230 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11231 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11232 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11233 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11234 Def2->getOperand(1).isReg() &&
11235 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11236 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11237 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11238 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11239 if (Select && foldableSelect(*Select))
11240 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11241 }
11242 }
11243 }
11244 return true;
11245 };
11246
11247 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11248 this](int64_t ExpectedValue, unsigned SrcSize,
11249 bool IsReversible, bool IsSigned) -> bool {
11250 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11251 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11252 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11253 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11254 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11255 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11256 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11257 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11258 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11259 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11260 //
11261 // Signed ge/gt are not used for the sign bit.
11262 //
11263 // If result of the AND is unused except in the compare:
11264 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11265 //
11266 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11267 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11268 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11269 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11270 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11271 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11272
11273 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11274 if (!Def)
11275 return false;
11276
11277 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11278 Def->getOpcode() != AMDGPU::S_AND_B64)
11279 return false;
11280
11281 int64_t Mask;
11282 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11283 if (MO->isImm())
11284 Mask = MO->getImm();
11285 else if (!getFoldableImm(MO, Mask))
11286 return false;
11287 Mask &= maxUIntN(SrcSize);
11288 return isPowerOf2_64(Mask);
11289 };
11290
11291 MachineOperand *SrcOp = &Def->getOperand(1);
11292 if (isMask(SrcOp))
11293 SrcOp = &Def->getOperand(2);
11294 else if (isMask(&Def->getOperand(2)))
11295 SrcOp = &Def->getOperand(1);
11296 else
11297 return false;
11298
11299 // A valid Mask is required to have a single bit set, hence a non-zero and
11300 // power-of-two value. This verifies that we will not do 64-bit shift below.
11301 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11302 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11303 if (IsSigned && BitNo == SrcSize - 1)
11304 return false;
11305
11306 ExpectedValue <<= BitNo;
11307
11308 bool IsReversedCC = false;
11309 if (CmpValue != ExpectedValue) {
11310 if (!IsReversible)
11311 return false;
11312 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11313 if (!IsReversedCC)
11314 return false;
11315 }
11316
11317 Register DefReg = Def->getOperand(0).getReg();
11318 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11319 return false;
11320
11321 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11322 return false;
11323
11324 if (!MRI->use_nodbg_empty(DefReg)) {
11325 assert(!IsReversedCC);
11326 return true;
11327 }
11328
11329 // Replace AND with unused result with a S_BITCMP.
11330 MachineBasicBlock *MBB = Def->getParent();
11331
11332 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11333 : AMDGPU::S_BITCMP1_B32
11334 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11335 : AMDGPU::S_BITCMP1_B64;
11336
11337 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11338 .add(*SrcOp)
11339 .addImm(BitNo);
11340 Def->eraseFromParent();
11341
11342 return true;
11343 };
11344
11345 switch (CmpInstr.getOpcode()) {
11346 default:
11347 break;
11348 case AMDGPU::S_CMP_EQ_U32:
11349 case AMDGPU::S_CMP_EQ_I32:
11350 case AMDGPU::S_CMPK_EQ_U32:
11351 case AMDGPU::S_CMPK_EQ_I32:
11352 return optimizeCmpAnd(1, 32, true, false) ||
11353 optimizeCmpSelect(/*NeedInversion=*/true);
11354 case AMDGPU::S_CMP_GE_U32:
11355 case AMDGPU::S_CMPK_GE_U32:
11356 return optimizeCmpAnd(1, 32, false, false);
11357 case AMDGPU::S_CMP_GE_I32:
11358 case AMDGPU::S_CMPK_GE_I32:
11359 return optimizeCmpAnd(1, 32, false, true);
11360 case AMDGPU::S_CMP_EQ_U64:
11361 return optimizeCmpAnd(1, 64, true, false);
11362 case AMDGPU::S_CMP_LG_U32:
11363 case AMDGPU::S_CMP_LG_I32:
11364 case AMDGPU::S_CMPK_LG_U32:
11365 case AMDGPU::S_CMPK_LG_I32:
11366 return optimizeCmpAnd(0, 32, true, false) ||
11367 optimizeCmpSelect(/*NeedInversion=*/false);
11368 case AMDGPU::S_CMP_GT_U32:
11369 case AMDGPU::S_CMPK_GT_U32:
11370 return optimizeCmpAnd(0, 32, false, false);
11371 case AMDGPU::S_CMP_GT_I32:
11372 case AMDGPU::S_CMPK_GT_I32:
11373 return optimizeCmpAnd(0, 32, false, true);
11374 case AMDGPU::S_CMP_LG_U64:
11375 return optimizeCmpAnd(0, 64, true, false) ||
11376 optimizeCmpSelect(/*NeedInversion=*/false);
11377 }
11378
11379 return false;
11380}
11381
11383 AMDGPU::OpName OpName) const {
// Rewrites the register operand named OpName so that it reads sub0 of a
// freshly created 64-bit Align2 register, and adds that register as an
// implicit use of MI.
// NOTE(review): the opening line of this signature and the declarations of
// MRI and Undef used below do not appear in this listing; confirm against
// the upstream file.

// Subtargets that do not need aligned VGPRs are left untouched.
11384 if (!ST.needsAlignedVGPRs())
11385 return;
11386
// Nothing to do when this opcode has no operand called OpName.
11387 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11388 if (OpNo < 0)
11389 return;
11390 MachineOperand &Op = MI.getOperand(OpNo);
// Operands whose getOpSize() exceeds 4 are skipped (presumably a size in
// bytes, i.e. anything wider than 32 bits; confirm).
11391 if (getOpSize(MI, OpNo) > 4)
11392 return;
11393
11394 // Add implicit aligned super-reg to force alignment on the data operand.
11395 const DebugLoc &DL = MI.getDebugLoc();
11396 MachineBasicBlock *BB = MI.getParent();
11398 Register DataReg = Op.getReg();
// Stay within the register file of the original data register: AGPR classes
// for an AGPR, VGPR classes otherwise.
11399 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11401 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
// Undef is defined by IMPLICIT_DEF; it only fills the sub1 slot below.
11402 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11403 Register NewVR =
11404 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11405 : &AMDGPU::VReg_64_Align2RegClass);
// NewVR = REG_SEQUENCE(original data register as sub0, Undef as sub1).
11406 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11407 .addReg(DataReg, {}, Op.getSubReg())
11408 .addImm(AMDGPU::sub0)
11409 .addReg(Undef)
11410 .addImm(AMDGPU::sub1);
// Redirect the operand to NewVR:sub0, then append the whole of NewVR as an
// implicit (non-def) register operand.
11411 Op.setReg(NewVR);
11412 Op.setSubReg(AMDGPU::sub0);
11413 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11414}
11415
// Instructions accepted by isIGLP() always yield false here.
// NOTE(review): the signature and the fall-through return statement do not
// appear in this listing; confirm the default result against the upstream
// file.
11417 if (isIGLP(*MI))
11418 return false;
11419
11421}
11422
// NOTE(review): the signature line does not appear in this listing.
// Only WMMA and SWMMAC instructions can qualify; anything else is rejected.
11424 if (!isWMMA(MI) && !isSWMMAC(MI))
11425 return false;
11426
// On subtargets with GFX1250 instructions the answer is decided per opcode
// by AMDGPU::getWMMAIsXDL().
11427 if (ST.hasGFX1250Insts())
11428 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11429
// On every other subtarget all WMMA/SWMMAC instructions qualify.
11430 return true;
11431}
11432
// NOTE(review): the signature line does not appear in this listing.
11434 unsigned Opcode = MI.getOpcode();
11435
// GFX12 and later: the result is true for DOT instructions and for
// instructions accepted by isXDLWMMA().
11436 if (AMDGPU::isGFX12Plus(ST))
11437 return isDOT(MI) || isXDLWMMA(MI);
11438
// Before GFX12: reject anything that is not MAI, and also reject DGEMM
// opcodes and the V_ACCVGPR_WRITE/READ_B32_e64 opcodes.
11439 if (!isMAI(MI) || isDGEMM(Opcode) ||
11440 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11441 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11442 return false;
11443
// Without GFX940 instructions every remaining MAI opcode qualifies.
11444 if (!ST.hasGFX940Insts())
11445 return true;
11446
// With GFX940 instructions the answer is decided per opcode by
// AMDGPU::getMAIIsGFX940XDL().
11447 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11448}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static constexpr AMDGPU::OpName ModifierOpNames[]
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:160
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:347
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:417
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:427
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:213
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, AMDGPU::FlatAddrSpace FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
void storeRegToStackSlotCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
static bool isVALU(const MachineInstr &MI, bool AllowLDSDMA)
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating a waterfall for instruction MI. This function takes into consideration of...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
bool isLegalGFX12PlusPackedMathFP32or64BitOperand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 or 64 instructions.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI, bool NeedsCFI) const
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, AMDGPU::FlatAddrSpace FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool allowNegativeFlatOffset(AMDGPU::FlatAddrSpace FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:301
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
bool isPackedFP32or64BitInst(unsigned Opc)
@ OPERAND_REG_IMM_V2FP64
Definition SIDefines.h:430
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:448
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:416
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:423
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:439
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:436
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:441
@ OPERAND_REG_IMM_V2INT64
Definition SIDefines.h:426
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:425
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:420
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:415
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:422
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:421
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:424
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:435
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:433
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:427
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:419
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:442
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:453
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:454
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:428
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:465
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:418
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:438
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:434
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:440
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:459
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:429
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:455
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:437
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:417
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:445
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:616
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:613
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:615
@ TI_CONSTDATA_START
Definition AMDGPU.h:612
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
constexpr bool isD16Buf(const T &...O)
Definition SIDefines.h:330
constexpr bool isSDWA(const T &...O)
Definition SIDefines.h:243
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:558
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.